# 빅데이터 분석기사 실기

## 퇴근후딴짓 참고

In [1]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
# 1. 전자상거래 배송 데이터

In [3]:
# Exam environment setup
def exam_data_load(df, target, id_name='', null_name=''):
    """Split ``df`` 80/20 and separate features from labels, exam-style.

    Args:
        df: Full dataset, including the ``target`` column. It is never
            modified; all changes are made on derived frames.
        target: Name of the label column.
        id_name: Name of an existing identifier column. When left as ``''``
            the row index is turned into a new ``'id'`` column and that
            column is used as the identifier.
        null_name: Placeholder value marking missing data. When it is not
            ``''``, every cell equal to it is replaced with ``NaN``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames contain
        every column except ``target``; the ``y_*`` frames contain only
        ``[id_name, target]``.
    """
    if id_name == '':
        # Name the index *before* resetting it so the new column is always
        # called 'id', even when the index already carries another name
        # (a plain rename of the 'index' column would silently miss it).
        id_name = 'id'
        df = df.rename_axis(id_name).reset_index()

    if null_name != '':
        # Work on a copy: the masked assignment below writes in place and
        # would otherwise alter the DataFrame the caller passed in.
        df = df.copy()
        df[df == null_name] = np.nan

    # Fixed seed so the split is identical on every run.
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

    label_cols = [id_name, target]
    y_train = train_part[label_cols]
    y_test = test_part[label_cols]
    X_train = train_part.drop(columns=[target])
    X_test = test_part.drop(columns=[target])
    return X_train, X_test, y_train, y_test

# Read the raw e-commerce shipping data and produce the exam-style splits,
# using the dataset's own 'ID' column as the identifier.
df = pd.read_csv('C:/Users/chunc/Desktop/빅분기 연습/Ecommerce/Train.csv')
X_train, X_test, y_train, y_test = exam_data_load(
    df,
    target='Reached.on.Time_Y.N',
    id_name='ID',
)

# Show the dimensions of all four frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8799, 11), (2200, 11), (8799, 2), (2200, 2))

In [4]:
# The three provided datasets (y_test is not used)
tuple(frame.shape for frame in (X_train, X_test, y_train))

((8799, 11), (2200, 11), (8799, 2))

In [5]:
# EDA
# Inspect the training features: dimensions first, then the first five rows.
print(X_train.shape)
X_train.head(5)

(8799, 11)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
8965,8966,F,Ship,3,5,178,3,low,F,7,4135
5936,5937,A,Ship,6,3,202,4,medium,F,3,4496
10656,10657,D,Ship,2,5,277,3,medium,M,6,1032
3328,3329,C,Ship,5,5,219,2,medium,M,4,4289
9880,9881,C,Flight,6,3,296,4,low,F,4,1650


In [6]:
# Check the label (target) distribution in the training labels.
y_train.loc[:, 'Reached.on.Time_Y.N'].value_counts()

Reached.on.Time_Y.N
1    5258
0    3541
Name: count, dtype: int64

In [7]:
# Count missing values per column in X_train.
X_train.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [8]:
# Count missing values per column in X_test.
X_test.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [9]:
# Check the data type and non-null count of every X_train column.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 8965 to 7270
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB


In [10]:
# Count the distinct values in each object-typed column.
X_train[[
    'Warehouse_block',
    'Mode_of_Shipment',
    'Product_importance',
    'Gender',
]].nunique()

Warehouse_block       5
Mode_of_Shipment      3
Product_importance    3
Gender                2
dtype: int64

In [11]:
# Data preprocessing
# Drop the object-typed columns outright (the original note lists label
# encoding and one-hot encoding as the other options).
object_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
X_train = X_train.drop(columns=object_cols)
X_test = X_test.drop(columns=object_cols)
X_train

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms
8965,8966,3,5,178,3,7,4135
5936,5937,6,3,202,4,3,4496
10656,10657,2,5,277,3,6,1032
3328,3329,5,5,219,2,4,4289
9880,9881,6,3,296,4,4,1650
...,...,...,...,...,...,...,...
5734,5735,3,5,275,2,7,4619
5191,5192,5,5,187,4,9,5326
5390,5391,4,2,137,2,2,5804
860,861,5,3,144,6,35,1009


In [12]:
# Modelling and evaluation
# Pull the ID column out of both feature frames (pop removes it in place)
# and keep it aside for the submission file.
X_train_id, X_test_id = X_train.pop('ID'), X_test.pop('ID')

In [13]:
# Hold out 20% of the training data for validation (fixed seed).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train['Reached.on.Time_Y.N'],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit the model on the training split and score the validation split.
# A fixed seed (42, as used for the data splits) makes the reported AUC
# reproducible; an unseeded forest gives a different score on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_tr, y_tr)
pred = model.predict_proba(X_val)
# Column 1 is the predicted probability of label 1; ROC-AUC needs scores,
# not hard class predictions.
roc_auc_score(y_val, pred[:, 1])

0.7347122736418512

In [15]:
# Refit on the full training set and predict class probabilities for X_test.
# The seed is fixed so the submitted predictions can be regenerated exactly.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train['Reached.on.Time_Y.N'])
pred = model.predict_proba(X_test)
pred

array([[0.  , 1.  ],
       [0.59, 0.41],
       [0.54, 0.46],
       ...,
       [0.52, 0.48],
       [0.36, 0.64],
       [0.75, 0.25]])

In [16]:
# Build the submission: the test IDs alongside the predicted probability
# of label 1 (second column of predict_proba).
submission = X_test_id.to_frame(name='ID').assign(
    **{'Reached.on.Time_Y.N': pred[:, 1]}
)
submission.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
107,108,1.0
5594,5595,0.41
6997,6998,0.46
3984,3985,0.43
3111,3112,1.0


In [17]:
# Write the submission to disk; index = False keeps the row index out of the file.
submission.to_csv('jcm01.csv', index = False)