# e-commerce 배송의 정시 도착 여부 (1: 정시 배송, 0: 정시 미배송)

x_train 데이터로 학습한 모델을 x_test에 적용하여 예측한 결과를 제출하라. 평가 지표는 f1_score이다.

In [107]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score

In [None]:
# Load the data: training features, training labels and test features,
# read in that order from the three CSV files below.
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv",
    )
)

# Preview the first rows of the training features and of the labels.
display(x_train.head())
display(y_train.head())

## 데이터 구조 확인

In [34]:
# Print the column names, non-null counts and dtypes of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   6598 non-null   int64 
 1   Warehouse_block      6598 non-null   object
 2   Mode_of_Shipment     6598 non-null   object
 3   Customer_care_calls  6598 non-null   object
 4   Customer_rating      6598 non-null   int64 
 5   Cost_of_the_Product  6598 non-null   int64 
 6   Prior_purchases      6598 non-null   int64 
 7   Product_importance   6598 non-null   object
 8   Gender               6598 non-null   object
 9   Discount_offered     6598 non-null   int64 
 10  Weight_in_gms        6598 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 567.1+ KB


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
x_train.describe()

Unnamed: 0,ID,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms
count,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0
mean,5476.977266,2.991361,210.393149,3.577751,13.353592,3604.191119
std,3172.946154,1.409624,48.258089,1.511394,16.187267,1635.697627
min,1.0,1.0,96.0,2.0,1.0,1001.0
25%,2731.25,2.0,170.0,3.0,4.0,1834.25
50%,5476.0,3.0,214.0,3.0,7.0,4119.5
75%,8187.75,4.0,251.0,4.0,10.0,5027.5
max,10998.0,5.0,310.0,10.0,65.0,7684.0


In [36]:
# Check for missing values: count the nulls in each training column.

x_train.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [73]:
# Check the class balance of the target column (rows per label).

y_train['Reached.on.Time_Y.N'].value_counts()

1    3937
0    2661
Name: Reached.on.Time_Y.N, dtype: int64

## 카테고리 변수 처리

트레인셋과 테스트셋의 값 종류는 동일하다.

Customer_care_calls 컬럼에서 $7을 7로 변경해주고

나머지 컬럼은 레이블 인코딩을 진행시킨다.

In [63]:
# String-typed feature columns that get label-encoded further down.
# Customer_care_calls is not listed here; it is cleaned and cast to int separately.
categorical = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']

In [38]:
# Inspect the distinct values of Customer_care_calls in the training set.
x_train['Customer_care_calls'].value_counts()

4     2115
3     1919
5     1403
6      604
2      404
$7     153
Name: Customer_care_calls, dtype: int64

In [39]:
# Inspect the distinct values of Customer_care_calls in the test set.
x_test['Customer_care_calls'].value_counts()

4     1442
3     1298
5      925
6      409
2      234
$7      93
Name: Customer_care_calls, dtype: int64

In [55]:
# Normalise the stray '$7' value to '7' so the column can later be cast to int.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: the in-place form mutates
# an intermediate object, raises a chained-assignment warning on recent pandas
# and does not update the DataFrame at all when Copy-on-Write is enabled.
x_train['Customer_care_calls'] = x_train['Customer_care_calls'].replace('$7', '7')
x_test['Customer_care_calls'] = x_test['Customer_care_calls'].replace('$7', '7')

In [60]:
# Cast the cleaned call counts from strings to integers, training frame first
# and test frame second; each frame is updated in place through its column.
for frame in (x_train, x_test):
    frame['Customer_care_calls'] = frame['Customer_care_calls'].astype(int)

In [56]:
# Re-check the test-set values after the '$7' -> '7' replacement.
x_test['Customer_care_calls'].value_counts()

4    1442
3    1298
5     925
6     409
2     234
7      93
Name: Customer_care_calls, dtype: int64

In [64]:
# Label-encode every categorical column.
# The encoder is fitted on the training data only and the learned mapping is
# then applied to the test data, so a given category always gets the same
# integer code in both frames. Re-fitting on the test set would only produce a
# matching mapping if both sets happened to contain exactly the same values.
# transform() raises ValueError if the test set holds a category that was not
# present during fitting, which surfaces such a mismatch instead of hiding it.
for category in categorical:
    le = LabelEncoder()
    x_train[category] = le.fit_transform(x_train[category])
    x_test[category] = le.transform(x_test[category])

# Confirm that all feature columns are now numeric.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   6598 non-null   int64
 1   Warehouse_block      6598 non-null   int32
 2   Mode_of_Shipment     6598 non-null   int32
 3   Customer_care_calls  6598 non-null   int32
 4   Customer_rating      6598 non-null   int64
 5   Cost_of_the_Product  6598 non-null   int64
 6   Prior_purchases      6598 non-null   int64
 7   Product_importance   6598 non-null   int32
 8   Gender               6598 non-null   int32
 9   Discount_offered     6598 non-null   int64
 10  Weight_in_gms        6598 non-null   int64
dtypes: int32(5), int64(6)
memory usage: 438.3 KB


## 모델링

In [70]:
# Model inputs: every training column except the ID column.
x = x_train.drop(columns='ID')
# Target as a 1-D Series rather than a one-column DataFrame: scikit-learn
# estimators expect y with shape (n_samples,) and emit a DataConversionWarning
# when they are handed a column vector.
y = y_train['Reached.on.Time_Y.N']

In [75]:
# Hold out 25% of the labelled rows for validation. The split is stratified on
# the target and uses a fixed seed so that it is repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

### 랜덤포레스트

In [103]:
# Random forest with library-default hyperparameters; only the seed is fixed.
rfc = RandomForestClassifier(random_state=42)

In [104]:
# Train the random forest on the training part of the split.
rfc.fit(xtrain, ytrain)

  rfc.fit(xtrain, ytrain)


In [105]:
# Predict labels for the held-out validation rows.
predict = rfc.predict(xval)

In [106]:
# Validation F1 of the random forest. Arguments follow the documented order
# f1_score(y_true, y_pred); for the binary F1 used here the value is the same
# either way, but the documented order keeps the call correct if it is ever
# switched to an asymmetric metric such as precision or recall.
f1_score(yval, predict)

0.7005524861878454

### xgboost

In [129]:
# XGBoost classifier with tree depth capped at 6 and a fixed seed.
xgb = XGBClassifier(random_state=42, max_depth=6)

In [130]:
# Train XGBoost on the training part of the split.
xgb.fit(xtrain, ytrain)

In [131]:
# Predict labels for the held-out validation rows (overwrites the random-forest predictions).
predict = xgb.predict(xval)

In [132]:
# Validation F1 of the XGBoost model. Arguments follow the documented order
# f1_score(y_true, y_pred); for the binary F1 used here the value is the same
# either way, but the documented order keeps the call correct if it is ever
# switched to an asymmetric metric such as precision or recall.
f1_score(yval, predict)

0.7107350608143839

## 예측 - xgboost

In [134]:
# Fresh XGBoost model with the same settings as the one scored on the validation split.
xgb_final = XGBClassifier(random_state=42, max_depth=6)

In [135]:
# Fit on all labelled rows (x and y before the train/validation split).
xgb_final.fit(x, y)

In [136]:
# Predict on the test features; ID is dropped so the columns match those used for training.
predict_final = xgb_final.predict(x_test.drop(columns='ID'))

In [140]:
# Assemble the submission table: each test ID next to its predicted label.
submission = pd.DataFrame({'ID': x_test['ID'], 'predict': predict_final})
submission

Unnamed: 0,ID,predict
0,6811,0
1,4320,0
2,5732,0
3,7429,1
4,2191,1
...,...,...
4396,2610,1
4397,3406,0
4398,10395,0
4399,3646,0


In [141]:
# Disabled: uncomment the next line to write the submission to "000000.csv" without the index column.
#submission.to_csv("000000.csv" ,index=False)