## 전자상거래 배송 데이터
- 제품이 배송 시간에 맞춰 배송되었는지 예측하는 모델 만들기
- 학습용 데이터 (X_train, y_train)을 이용하여 배송 예측 모형을 만든 후,
- 이를 평가용 데이터(X_test)에 적용하여 얻은 예측 확률값을 다음과 같은 형식의 CSV 파일로 생성하시오. (제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test feature and label frames.

    Args:
        df: Source DataFrame holding the feature columns and ``target``.
        target: Name of the label column.
        id_name: Name of the identifier column. When empty, ``reset_index``
            is applied and the resulting ``"index"`` column is renamed to
            ``"id"`` and used as the identifier.
        null_name: Placeholder value marking missing data. When non-empty,
            every cell equal to it is replaced with ``np.nan``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames
        contain every column except ``target``; the ``y_*`` frames contain
        only the identifier column and ``target``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # Work on a copy so the caller's DataFrame is never modified in place.
        df = df.copy()
        df[df == null_name] = np.nan

    # 80/20 split; the fixed random_state pins the partition.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

# Read the raw shipping data and carve out the exam-style train/test frames.
df = pd.read_csv("/content/drive/MyDrive/빅데이터 분석기사/data/Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(
    df,
    target='Reached.on.Time_Y.N',
    id_name='ID',
)

# Show (rows, columns) for each of the four frames.
tuple(frame.shape for frame in (X_train, X_test, y_train, y_test))

((8799, 11), (2200, 11), (8799, 2), (2200, 2))

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
3999,4000,B,Ship,3,4,134,3,high,F,6,5680
9559,9560,F,Ship,4,3,173,3,medium,M,5,5331
2649,2650,B,Ship,2,1,192,3,high,M,46,3206
4843,4844,F,Ship,6,5,284,4,medium,M,8,5346
9601,9602,F,Flight,3,1,246,3,low,F,10,4707


In [None]:
# Preview the first rows of the training labels (identifier + target columns).
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
3999,4000,0
9559,9560,1
2649,2650,1
4843,4844,1
9601,9602,1


In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB


In [None]:
# Per-column count of missing values in the training features.
X_train.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [None]:
# Per-column count of missing values in the evaluation features.
X_test.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical (object-dtype) columns that must become integers before modelling.
cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']

for col in cols:
    # Learn the category -> integer mapping from the training data only,
    # then apply that same mapping to both frames.
    le = LabelEncoder().fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [None]:
# Re-inspect the training dtypes after label encoding.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   8799 non-null   int64
 1   Warehouse_block      8799 non-null   int64
 2   Mode_of_Shipment     8799 non-null   int64
 3   Customer_care_calls  8799 non-null   int64
 4   Customer_rating      8799 non-null   int64
 5   Cost_of_the_Product  8799 non-null   int64
 6   Prior_purchases      8799 non-null   int64
 7   Product_importance   8799 non-null   int64
 8   Gender               8799 non-null   int64
 9   Discount_offered     8799 non-null   int64
 10  Weight_in_gms        8799 non-null   int64
dtypes: int64(11)
memory usage: 824.9 KB


In [None]:
# Re-inspect the evaluation dtypes after label encoding.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 4732 to 4413
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   2200 non-null   int64
 1   Warehouse_block      2200 non-null   int64
 2   Mode_of_Shipment     2200 non-null   int64
 3   Customer_care_calls  2200 non-null   int64
 4   Customer_rating      2200 non-null   int64
 5   Cost_of_the_Product  2200 non-null   int64
 6   Prior_purchases      2200 non-null   int64
 7   Product_importance   2200 non-null   int64
 8   Gender               2200 non-null   int64
 9   Discount_offered     2200 non-null   int64
 10  Weight_in_gms        2200 non-null   int64
dtypes: int64(11)
memory usage: 206.2 KB


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Detach the identifier column so it is not used as a model feature; the IDs
# are kept for building the submission file. The membership guard makes this
# cell safe to re-run: without it a second run raises KeyError because 'ID'
# has already been removed from the frame.
if 'ID' in X_train.columns:
    X_train_id = X_train.pop('ID')
if 'ID' in X_test.columns:
    X_test_id = X_test.pop('ID')

KeyError: ignored

In [None]:
# Hold out 20% of the training rows as a validation set for model comparison.
train_x, val_x, train_y, val_y = train_test_split(
    X_train,
    y_train['Reached.on.Time_Y.N'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: logistic regression with default settings, scored by ROC-AUC on
# the validation split.
model = LogisticRegression().fit(train_x, train_y)

# Column 1 of predict_proba is used as the score for the positive class.
y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7065678418803418

In [None]:
# k-nearest neighbours with k=3, scored by ROC-AUC on the validation split.
model = KNeighborsClassifier(n_neighbors=3).fit(train_x, train_y)

y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7186698717948717

In [None]:
# Random forest (100 trees, depth capped at 5), scored by ROC-AUC on the
# validation split.
model = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=42,
).fit(train_x, train_y)

y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7473170405982906

In [None]:
# Single decision tree (depth capped at 6, at least 10 samples per leaf),
# scored by ROC-AUC on the validation split.
model = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=10,
    random_state=42,
).fit(train_x, train_y)

y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7638047542735041

In [None]:
# Gradient-boosted trees via XGBoost (100 rounds, depth capped at 5), scored
# by ROC-AUC on the validation split.
model = XGBClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=42,
)
model.fit(train_x, train_y)

y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7520272435897436

In [None]:
# Re-fit the random forest so that `model` refers to it for the final
# predictions that follow.
# NOTE(review): it is fitted on train_x only, not on the full X_train, and the
# decision tree scored higher on validation — confirm this choice is intended.
model = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=42,
).fit(train_x, train_y)

y_pred = model.predict_proba(val_x)
roc_auc_score(y_true=val_y, y_score=y_pred[:, 1])

0.7473170405982906

In [None]:
# Class-probability predictions for the evaluation features, using whichever
# estimator `model` currently refers to.
y_pred = model.predict_proba(X_test)

In [None]:
# Submission frame: the evaluation IDs paired with the score taken from
# column 1 of predict_proba. Note that this rebinds `df`.
df = X_test_id.to_frame(name='ID').assign(
    **{'Reached.on.Time_Y.N': y_pred[:, 1]}
)

In [None]:
# Sanity check: (rows, columns) of the frame currently bound to `df`.
df.shape

(2200, 2)

In [None]:
# Write the frame to disk without the index column.
df.to_csv('submission.csv' , index = False)

In [None]:
# Score the model on the held-out evaluation labels. y_test is available here
# only because exam_data_load returns it in this practice setup.
pred = model.predict_proba(X_test)
roc_auc_score(y_true=y_test['Reached.on.Time_Y.N'], y_score=pred[:, 1])

0.7323017091023951