## 전자상거래 배송 데이터
### 제품이 배송 시간에 맞춰 배송되었는지 예측하는 모델 만들기
학습용 데이터(X_train, y_train)를 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측 확률값을 다음과 같은 형식의 CSV 파일로 생성하시오. (제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)
```
ID, Reached.on.Time_Y.N
4733,0.6
2040,0.8
5114,0.45
2361,0.23
5996,0.43
```

[시험용 데이터셋 만들기] 코드는 예시문제와 동일한 형태의 X_train, y_train, X_test 데이터를 만들기 위함임

(유의사항)
- 성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링, 분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.
- 수험번호.csv파일이 만들어지도록 코드를 제출한다.
- 제출한 모델의 성능은 ROC-AUC 형태로 읽어들인다.

## [참고]작업형2 문구
- 출력을 원하실 경우 print() 함수 활용
- 예시) print(df.head())
- getcwd(), chdir() 등 작업 폴더 설정 불필요
- 파일 경로 상 내부 드라이브 경로(C: 등) 접근 불가

### 데이터 파일 읽기 예제
- import pandas as pd
- X_test = pd.read_csv("data/X_test.csv")
- X_train = pd.read_csv("data/X_train.csv")
- y_train = pd.read_csv("data/y_train.csv")

### 사용자 코딩

### 답안 제출 참고
- 아래 코드 예측변수와 수험번호를 개인별로 변경하여 활용
- pd.DataFrame({'cust_id': X_test.cust_id, 'gender': pred}).to_csv('003000000.csv', index=False)

In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test feature and label frames.

    Args:
        df: Source DataFrame holding the feature columns and ``target``.
        target: Name of the label column.
        id_name: Name of the identifier column. When empty, the index is
            moved into a column via ``reset_index`` and the resulting
            ``"index"`` column is renamed to ``"id"``.
        null_name: Placeholder value that marks missing data. When
            non-empty, every cell equal to it is replaced with NaN.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames
        contain every column except ``target``; the ``y_*`` frames contain
        only the identifier column and ``target``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # Build a new frame instead of assigning in place, so the DataFrame
        # passed in by the caller is never modified.
        df = df.mask(df == null_name, np.nan)

    # Fixed seed and ratio: the 80/20 split is the same on every run.
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = train_part[[id_name, target]]
    X_train = train_part.drop(columns=[target])

    y_test = test_part[[id_name, target]]
    X_test = test_part.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the raw dataset and split it into the exam-style train/test frames,
# using 'ID' as the identifier and 'Reached.on.Time_Y.N' as the label.
df = pd.read_csv("../input/customer-analytics/Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

# Display the shape of each of the four resulting frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((8799, 11), (2200, 11), (8799, 2), (2200, 2))

In [2]:
# Display the shapes of the three frames that are available in the exam setting.
X_train.shape, X_test.shape, y_train.shape

((8799, 11), (2200, 11), (8799, 2))

In [3]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
3999,4000,B,Ship,3,4,134,3,high,F,6,5680
9559,9560,F,Ship,4,3,173,3,medium,M,5,5331
2649,2650,B,Ship,2,1,192,3,high,M,46,3206
4843,4844,F,Ship,6,5,284,4,medium,M,8,5346
9601,9602,F,Flight,3,1,246,3,low,F,10,4707


In [4]:
import numpy as np
import pandas as pd

In [5]:
# Count missing values per column in the training features.
X_train.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [6]:
# Count missing values per column in the training features.
# NOTE(review): this inspects X_train again; X_test is never checked — confirm whether X_test was intended here.
X_train.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [7]:
# Preview the label frame (identifier plus target column).
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
3999,4000,0
9559,9560,1
2649,2650,1
4843,4844,1
9601,9602,1


In [8]:
# Count how many training rows fall into each target class.
y_train['Reached.on.Time_Y.N'].value_counts()

1    5251
0    3548
Name: Reached.on.Time_Y.N, dtype: int64

In [9]:
# Count the number of distinct values in each of the four non-numeric columns.
X_train[['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']].nunique()

Warehouse_block       5
Mode_of_Shipment      3
Product_importance    3
Gender                2
dtype: int64

In [10]:
# Drop the object (categorical) columns; label encoding or one-hot encoding
# would be the alternative that keeps this information.
# The list is defined once so train and test always drop the same columns.
object_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
X_train = X_train.drop(columns=object_cols)
X_test = X_test.drop(columns=object_cols)
X_train

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms
3999,4000,3,4,134,3,6,5680
9559,9560,4,3,173,3,5,5331
2649,2650,2,1,192,3,46,3206
4843,4844,6,5,284,4,8,5346
9601,9602,3,1,246,3,10,4707
...,...,...,...,...,...,...,...
3934,3935,6,5,178,3,5,4096
2669,2670,3,1,206,2,56,2069
1152,1153,4,1,135,10,35,1989
6201,6202,4,4,251,2,6,4376


In [11]:
import sklearn

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 15% of the training rows as a validation set; the seed is fixed so
# the split is the same on every run.
# NOTE(review): X_train still contains the 'ID' column here, so the identifier
# is used as a model feature — confirm this is intended.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train['Reached.on.Time_Y.N'],
    test_size=0.15,
    random_state=42,
)

In [15]:
from sklearn.metrics import roc_auc_score

In [16]:
# Baseline: a single decision tree, scored by ROC-AUC on the validation split.
# random_state is fixed so the score is reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_tr, y_tr)
pred = model.predict_proba(X_val)
# Column 1 of predict_proba is the probability of class 1 (labels are 0 and 1).
roc_auc_score(y_val, pred[:, 1])

0.6374012648626143

In [17]:
# Random forest, scored by ROC-AUC on the same validation split.
# random_state is fixed so the score is reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_tr, y_tr)
pred = model.predict_proba(X_val)
# Column 1 of predict_proba is the probability of class 1 (labels are 0 and 1).
roc_auc_score(y_val, pred[:, 1])

0.7438174635269231

In [18]:
# Preview the label frame again before fitting on the full training set.
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
3999,4000,0
9559,9560,1
2649,2650,1
4843,4844,1
9601,9602,1


In [19]:
# Final model: refit the random forest on the full training set and predict
# class probabilities for the test set.
# random_state is fixed so the submitted probabilities are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train['Reached.on.Time_Y.N'])
pred = model.predict_proba(X_test)

In [20]:
# Display the predicted class probabilities for the test set (one row per sample, one column per class).
pred

array([[0.63, 0.37],
       [0.  , 1.  ],
       [0.51, 0.49],
       ...,
       [0.67, 0.33],
       [0.47, 0.53],
       [0.61, 0.39]])

In [21]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms
4732,4733,5,5,250,2,10,5475
2039,2040,5,2,202,3,34,3885
5113,5114,5,4,154,2,5,5528
2360,2361,6,4,209,5,53,3169
5995,5996,6,1,250,4,4,1018


In [22]:
# Assemble the submission: one row per test ID with the predicted
# probability of class 1 (column 1 of predict_proba).
submission = pd.DataFrame({
    'ID': X_test['ID'],
    'Reached.on.Time_Y.N': pred[:, 1],
})
# The task requires the predictions to be saved as <exam number>.csv.
# '003000000' is the placeholder exam number from the submission template;
# replace it with the real one.
submission.to_csv('003000000.csv', index=False)

In [23]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
4732,4733,0.37
2039,2040,1.0
5113,5114,0.49
2360,2361,1.0
5995,5996,0.63
