In [385]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 머신러닝 관련 도구
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_data = pd.read_csv('./data/bigdatasc/Train.csv', index_col='ID')
train_data

Unnamed: 0_level_0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,C,Ship,2.0,2,135,3.0,low,M,,1339,1
2,F,Flight,5.0,3,225,5.0,low,F,6.0,1082,0
3,F,Ship,3.0,1,229,3.0,low,M,,4971,1
4,F,Ship,3.0,2,228,,medium,M,2.0,5640,0
5,D,Flight,,1,195,6.0,high,F,,4944,1
...,...,...,...,...,...,...,...,...,...,...,...
6995,D,Ship,4.0,2,232,5.0,medium,F,4.0,1783,0
6996,F,Ship,3.0,5,228,2.0,medium,F,,5208,0
6997,D,Ship,,1,300,5.0,low,F,10.0,1787,0
6998,A,Road,3.0,1,232,2.0,medium,M,,4719,0


In [267]:
test_data = pd.read_csv('./data/bigdatasc/test.csv', index_col='ID')
test_data

Unnamed: 0_level_0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7000,F,Ship,3.0,1,274,,high,M,,4352
7001,F,Ship,3.0,4,136,2.0,medium,F,,1056
7002,A,Flight,,5,140,3.0,low,F,7.0,5383
7003,C,Ship,,1,291,4.0,low,F,,1880
7004,F,?,4.0,2,147,3.0,low,F,5.0,5174
...,...,...,...,...,...,...,...,...,...,...
10995,A,?,4.0,1,204,4.0,?,F,,1667
10996,C,Ship,,3,195,2.0,medium,M,,3869
10997,B,Flight,,3,206,2.0,medium,M,7.0,4531
10998,C,Ship,6.0,4,255,4.0,low,M,7.0,1869


In [4]:
train_data.columns
# Warehouse_block : 물류창고
# Mode_of_Shipment : 배송방법
# Customer_care_calls : 고객 관리 전화
# Customer_rating : 고객 평가
# Cost_of_the_Product : 제품가격
# Prior_purchases : 이전 구매
# Product_importance : 제품 중요도
# Gender : 성별
# Discount_offered : 할인
# Weight_in_gms : 운송무게
# Reached.on.Time_Y.N : 재시간 도착 여부

Index(['Warehouse_block ', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')

In [5]:
print(train_data.shape)
print(test_data.shape)

(6999, 11)
(4000, 10)


In [232]:
train_data.info()    # Customer_care_calls, Prior_purchases, Discount_offered
test_data.info()    # Customer_care_calls, Prior_purchase

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6999 entries, 1 to 6999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6999 non-null   int64  
 1   Mode_of_Shipment     6999 non-null   int64  
 2   Customer_care_calls  6999 non-null   float64
 3   Customer_rating      6999 non-null   int64  
 4   Cost_of_the_Product  6999 non-null   int64  
 5   Prior_purchases      6999 non-null   float64
 6   Product_importance   6999 non-null   int64  
 7   Gender               6999 non-null   int64  
 8   Discount_offered     6999 non-null   float64
 9   Weight_in_gms        6999 non-null   int64  
 10  Reached.on.Time_Y.N  6999 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 914.2 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               ----

In [7]:
print('train_data :\n',train_data.isnull().sum())
test_data.isnull().sum()

train_data :
 Warehouse_block           0
Mode_of_Shipment          0
Customer_care_calls    1576
Customer_rating           0
Cost_of_the_Product       0
Prior_purchases         950
Product_importance        0
Gender                    0
Discount_offered       3531
Weight_in_gms             0
Reached.on.Time_Y.N       0
dtype: int64


Warehouse_block           0
Mode_of_Shipment          0
Customer_care_calls     856
Customer_rating           0
Cost_of_the_Product       0
Prior_purchases         367
Product_importance        0
Gender                    0
Discount_offered       1915
Weight_in_gms             0
dtype: int64

In [27]:
train_data['Customer_care_calls'].fillna(4, inplace=True)

train_data['Customer_care_calls'].isnull().sum()    # 결측치 확인 완료
train_data['Customer_care_calls'].describe()   # 결측치 4.0
train_data['Customer_care_calls'].value_counts()    # 7을 이상치로 제거...? 우선 keep

4.0    3332
3.0    1620
5.0    1147
6.0     496
2.0     287
7.0     117
Name: Customer_care_calls, dtype: int64

In [197]:
print(train_data['Discount_offered'].describe())    # 결측치 15
print(test_data['Discount_offered'].isnull().sum())
print(test_data['Discount_offered'].value_counts()) # 20 위로 다 잘라볼 생각
test_data['Discount_offered'] = test_data['Discount_offered'].fillna(15, inplace=True)

count    6999.000000
mean       14.142306
std        11.344323
min         1.000000
25%         7.000000
50%        15.000000
75%        15.000000
max        65.000000
Name: Discount_offered, dtype: float64
1915
3.0     171
7.0     170
5.0     169
6.0     166
8.0     161
       ... 
55.0      4
30.0      4
58.0      4
34.0      3
65.0      2
Name: Discount_offered, Length: 65, dtype: int64


In [32]:
print(train_data['Prior_purchases'].describe())    # 결측치 3
print(train_data['Prior_purchases'].value_counts())  # 이상치 8, 10 제거 생각중
train_data['Prior_purchases'].fillna(3, inplace=True)
print(train_data['Prior_purchases'].isnull().sum())

count    6999.000000
mean        3.498071
std         1.436854
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max        10.000000
Name: Prior_purchases, dtype: float64
3.0     3126
2.0     1423
4.0     1162
5.0      735
6.0      310
10.0     101
8.0       72
7.0       70
Name: Prior_purchases, dtype: int64
0


In [11]:
# 상관관계 분석
train_data.corr()['Prior_purchases'].abs().sort_values(ascending=False)

Prior_purchases        1.000000
Customer_care_calls    0.173709
Discount_offered       0.081868
Reached.on.Time_Y.N    0.065160
Cost_of_the_Product    0.028947
Customer_rating        0.008217
Name: Prior_purchases, dtype: float64

In [12]:
train_data.corr()['Discount_offered'].abs().sort_values(ascending=False)

Discount_offered       1.000000
Reached.on.Time_Y.N    0.397393
Customer_care_calls    0.158465
Cost_of_the_Product    0.139349
Prior_purchases        0.081868
Customer_rating        0.016456
Name: Discount_offered, dtype: float64

In [13]:
train_data.corr()['Customer_care_calls'].abs().sort_values(ascending=False)

Customer_care_calls    1.000000
Prior_purchases        0.173709
Discount_offered       0.158465
Reached.on.Time_Y.N    0.070266
Cost_of_the_Product    0.047041
Customer_rating        0.013420
Name: Customer_care_calls, dtype: float64

In [14]:
train_data['Mode_of_Shipment'].replace(to_replace=('Ship', 0))

ID
1          Ship
2        Flight
3          Ship
4          Ship
5        Flight
         ...   
6995       Ship
6996       Ship
6997       Ship
6998       Road
6999       Ship
Name: Mode_of_Shipment, Length: 6999, dtype: object

In [15]:
train_data['Cost_of_the_Product'].describe()

count    6999.000000
mean      214.249178
std       208.294273
min        96.000000
25%       169.000000
50%       214.000000
75%       251.000000
max      9999.000000
Name: Cost_of_the_Product, dtype: float64

In [16]:
  train_data['Warehouse_block '].unique()

array(['C', 'F', 'D', 'B', 'A'], dtype=object)

In [17]:
train_data['Weight_in_gms'].value_counts()

?       446
1817      8
1367      8
5709      7
1861      7
       ... 
4964      1
5612      1
2443      1
2497      1
1480      1
Name: Weight_in_gms, Length: 3332, dtype: int64

In [263]:
# test_data 결측치 채우기
test_data.isnull().sum()

Warehouse_block           0
Mode_of_Shipment          0
Customer_care_calls     856
Customer_rating           0
Cost_of_the_Product       0
Prior_purchases         367
Product_importance        0
Gender                    0
Discount_offered       1915
Weight_in_gms             0
dtype: int64

In [249]:
print(test_data['Prior_purchases'].describe())    # 결측치 3
print(test_data['Prior_purchases'].value_counts())  # 이상치 8, 10 제거 생각중
test_data['Prior_purchases'] = test_data['Prior_purchases'].fillna(3, inplace=True)
print(test_data['Prior_purchases'].isnull().sum())

count    4000.000000
mean        3.515250
std         1.460582
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max        10.000000
Name: Prior_purchases, dtype: float64
3.0     1660
2.0      866
4.0      723
5.0      414
6.0      184
10.0      56
7.0       53
8.0       44
Name: Prior_purchases, dtype: int64
4000


In [265]:
test_data['Customer_care_calls'] = test_data['Customer_care_calls'].fillna(4, inplace=True)
print(test_data['Customer_care_calls'].isnull().sum())    # 결측치 확인 완료
print(test_data['Customer_care_calls'].describe())   # 결측치 4.0
print(test_data['Customer_care_calls'].value_counts())    # 7을 이상치로 제거...? 우선 keep

4000
count       0
unique      0
top       NaN
freq      NaN
Name: Customer_care_calls, dtype: object
Series([], Name: Customer_care_calls, dtype: int64)


In [59]:
train_data.columns = train_data.columns.str.strip(' ')
train_data.columns

Index(['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')

In [201]:
test_data.columns = test_data.columns.str.strip(' ')
test_data.columns

Index(['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms'],
      dtype='object')

In [67]:
train_data['Warehouse_block'].unique()

array(['C', 'F', 'D', 'B', 'A'], dtype=object)

In [78]:
# 데이터 타입 변경 및 값 변경
train_data['Warehouse_block'] = train_data['Warehouse_block'].replace('A', 0)
train_data['Warehouse_block'] = train_data['Warehouse_block'].replace('B', 1)
train_data['Warehouse_block'] = train_data['Warehouse_block'].replace('C', 2)
train_data['Warehouse_block'] = train_data['Warehouse_block'].replace('D', 3)
train_data['Warehouse_block'] = train_data['Warehouse_block'].replace('F', 4)
train_data['Warehouse_block']

ID
1       2
2       4
3       4
4       4
5       3
       ..
6995    3
6996    4
6997    3
6998    0
6999    0
Name: Warehouse_block, Length: 6999, dtype: int64

In [79]:
train_data['Warehouse_block']

ID
1       2
2       4
3       4
4       4
5       3
       ..
6995    3
6996    4
6997    3
6998    0
6999    0
Name: Warehouse_block, Length: 6999, dtype: int64

In [99]:
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('Shipzk', 'Ship')
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('Flightzk', 'Flight')
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('Roadzk', 'Road')
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('?', 'Ship')

In [90]:
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].str.strip(' ')

In [104]:
train_data['Mode_of_Shipment'].unique()

array([0, 1, 2], dtype=int64)

In [98]:
train_data['Mode_of_Shipment'].value_counts()   # 배가 많으니까 배로 바꾸자 물음표는!!

Ship      4510
Road      1076
Flight    1040
?          373
Name: Mode_of_Shipment, dtype: int64

In [103]:
# 배 0, 길 1, 비행기 2
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('0', 0)
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('1', 1)
train_data['Mode_of_Shipment'] = train_data['Mode_of_Shipment'].replace('2', 2)

In [118]:
train_data['Product_importance'].unique()

# low 0 medium 1 high 2 변경

array([0, 1, 2], dtype=int64)

In [115]:
train_data['Product_importance'] = train_data['Product_importance'].replace('mediumm', 'medium')
train_data['Product_importance'] = train_data['Product_importance'].replace('loww', 'low')
train_data['Product_importance'] = train_data['Product_importance'].replace('highh', 'high')
train_data['Product_importance'] = train_data['Product_importance'].replace('?', 'low')

In [114]:
train_data['Product_importance'].value_counts()  # ?는 가장 많은 값인 low 값으로..

low       3345
medium    2983
high       574
?           97
Name: Product_importance, dtype: int64

In [117]:
train_data['Product_importance'] = train_data['Product_importance'].replace('medium', 1)
train_data['Product_importance'] = train_data['Product_importance'].replace('low', 0)
train_data['Product_importance'] = train_data['Product_importance'].replace('high', 2)

In [None]:
# Gender data - M 0 F 1

In [121]:
train_data['Gender'] = train_data['Gender'].replace('M', 0)
train_data['Gender'] = train_data['Gender'].replace('F', 1)

In [123]:
train_data['Gender'].unique()

array([0, 1], dtype=int64)

In [134]:
# Weight_in_gms 변환 ------- > ?는 3000으로
print(train_data['Weight_in_gms'].unique())
print(train_data['Weight_in_gms'].describe())
print(train_data['Weight_in_gms'].value_counts())

['1339' '1082' '4971' ... '4075' '2486' '1480']
count     6999
unique    3332
top       3000
freq       446
Name: Weight_in_gms, dtype: int64
3000    446
1817      8
1367      8
5709      7
1861      7
       ... 
4964      1
5612      1
2443      1
2497      1
1480      1
Name: Weight_in_gms, Length: 3332, dtype: int64


In [137]:
train_data['Weight_in_gms'] = train_data['Weight_in_gms'].replace('?', 3000)

In [143]:
# type 변경
train_data['Weight_in_gms'] = train_data['Weight_in_gms'].astype('int64')

In [184]:
# Nan 값 및 공백 제거를 위한 타입 변경, 처리 후 다시 형 변환
train_data['Discount_offered'] = train_data['Discount_offered'].astype('float64')
train_data['Discount_offered'].info()

<class 'pandas.core.series.Series'>
Int64Index: 6999 entries, 1 to 6999
Series name: Discount_offered
Non-Null Count  Dtype  
--------------  -----  
6999 non-null   float64
dtypes: float64(1)
memory usage: 367.4 KB


In [177]:
train_data['Discount_offered'].str.strip(' ')

ID
1        nan
2        6.0
3        nan
4        2.0
5        nan
        ... 
6995     4.0
6996     nan
6997    10.0
6998     nan
6999     nan
Name: Discount_offered, Length: 6999, dtype: object

In [179]:
train_data['Discount_offered'] = train_data['Discount_offered'].replace('nan',15)

In [187]:
train_data.info()    # 모든 데이터 타입 숫자로 바꿈. 결측치 다 채움. 이상값 다 고침, 오타 수정 완료

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6999 entries, 1 to 6999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6999 non-null   int64  
 1   Mode_of_Shipment     6999 non-null   int64  
 2   Customer_care_calls  6999 non-null   float64
 3   Customer_rating      6999 non-null   int64  
 4   Cost_of_the_Product  6999 non-null   int64  
 5   Prior_purchases      6999 non-null   float64
 6   Product_importance   6999 non-null   int64  
 7   Gender               6999 non-null   int64  
 8   Discount_offered     6999 non-null   float64
 9   Weight_in_gms        6999 non-null   int64  
 10  Reached.on.Time_Y.N  6999 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 914.2 KB


In [270]:
# test_data -----> train_data 와 같이 변경하기
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  3144 non-null   float64
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      3633 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Gender               4000 non-null   object 
 8   Discount_offered     2085 non-null   float64
 9   Weight_in_gms        4000 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 343.8+ KB


In [271]:
test_data['Warehouse_block']

ID
7000     F
7001     F
7002     A
7003     C
7004     F
        ..
10995    A
10996    C
10997    B
10998    C
10999    D
Name: Warehouse_block, Length: 4000, dtype: object

In [272]:
test_data['Warehouse_block'].unique()

array(['F', 'A', 'C', 'D', 'B'], dtype=object)

In [273]:
# 데이터 타입 변경 및 값 변경
test_data['Warehouse_block'] = test_data['Warehouse_block'].replace('A', 0)
test_data['Warehouse_block'] = test_data['Warehouse_block'].replace('B', 1)
test_data['Warehouse_block'] = test_data['Warehouse_block'].replace('C', 2)
test_data['Warehouse_block'] = test_data['Warehouse_block'].replace('D', 3)
test_data['Warehouse_block'] = test_data['Warehouse_block'].replace('F', 4)
test_data['Warehouse_block'].value_counts()

4    1355
1     671
3     666
0     659
2     649
Name: Warehouse_block, dtype: int64

In [274]:
# print(test_data['Mode_of_Shipment'].unique())
print(test_data['Mode_of_Shipment'].value_counts())
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].str.strip(' ')

 Ship      2551
 Flight     631
 Road       595
?           221
 Shipzk       1
 Roadzk       1
Name: Mode_of_Shipment, dtype: int64


In [275]:
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('Shipzk', 'Ship')
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('Roadzk', 'Road')
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('?', 'Ship')

In [276]:
test_data['Mode_of_Shipment'].unique()

array(['Ship', 'Flight', 'Road'], dtype=object)

In [277]:
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('Ship', 0)
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('Flight', 1)
test_data['Mode_of_Shipment'] = test_data['Mode_of_Shipment'].replace('Road', 2)

In [278]:
test_data['Mode_of_Shipment'].unique()

array([0, 1, 2], dtype=int64)

In [279]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   int64  
 1   Mode_of_Shipment     4000 non-null   int64  
 2   Customer_care_calls  3144 non-null   float64
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      3633 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Gender               4000 non-null   object 
 8   Discount_offered     2085 non-null   float64
 9   Weight_in_gms        4000 non-null   object 
dtypes: float64(3), int64(4), object(3)
memory usage: 343.8+ KB


In [268]:
test_data.columns = test_data.columns.str.strip(' ')

In [247]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6999 entries, 1 to 6999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6999 non-null   int64  
 1   Mode_of_Shipment     6999 non-null   int64  
 2   Customer_care_calls  6999 non-null   float64
 3   Customer_rating      6999 non-null   int64  
 4   Cost_of_the_Product  6999 non-null   int64  
 5   Prior_purchases      6999 non-null   float64
 6   Product_importance   6999 non-null   int64  
 7   Gender               6999 non-null   int64  
 8   Discount_offered     6999 non-null   float64
 9   Weight_in_gms        6999 non-null   int64  
 10  Reached.on.Time_Y.N  6999 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 914.2 KB


In [281]:
test_data['Customer_care_calls'].unique()

array([ 3., nan,  4.,  6.,  7.,  2.,  5.])

In [282]:
test_data['Customer_care_calls'].value_counts()

4.0    990
3.0    901
5.0    667
6.0    311
2.0    208
7.0     67
Name: Customer_care_calls, dtype: int64

In [283]:
test_data['Customer_care_calls'].describe()

count    3144.000000
mean        4.055025
std         1.161132
min         2.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         7.000000
Name: Customer_care_calls, dtype: float64

In [300]:
test_data['Customer_care_calls'] = test_data['Customer_care_calls'].astype('str')

In [302]:
test_data['Customer_care_calls'] = test_data['Customer_care_calls'].str.strip(' ')

In [303]:
test_data['Customer_care_calls'].value_counts()

4.0    990
3.0    901
nan    856
5.0    667
6.0    311
2.0    208
7.0     67
Name: Customer_care_calls, dtype: int64

In [304]:
test_data['Customer_care_calls'] = test_data['Customer_care_calls'].replace('nan', '4.0')

In [305]:
test_data['Customer_care_calls'] = test_data['Customer_care_calls'].astype('float64')

In [306]:
test_data['Customer_care_calls']

ID
7000     3.0
7001     3.0
7002     4.0
7003     4.0
7004     4.0
        ... 
10995    4.0
10996    4.0
10997    4.0
10998    6.0
10999    5.0
Name: Customer_care_calls, Length: 4000, dtype: float64

In [308]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   int64  
 1   Mode_of_Shipment     4000 non-null   int64  
 2   Customer_care_calls  4000 non-null   float64
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      3633 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Gender               4000 non-null   object 
 8   Discount_offered     2085 non-null   float64
 9   Weight_in_gms        4000 non-null   object 
dtypes: float64(3), int64(4), object(3)
memory usage: 343.8+ KB


In [309]:
test_data['Prior_purchases'].info()

<class 'pandas.core.series.Series'>
Int64Index: 4000 entries, 7000 to 10999
Series name: Prior_purchases
Non-Null Count  Dtype  
--------------  -----  
3633 non-null   float64
dtypes: float64(1)
memory usage: 62.5 KB


In [315]:
test_data['Prior_purchases'].value_counts()

3.0     1293
2.0      866
4.0      723
5.0      414
6.0      184
10.0      56
7.0       53
8.0       44
Name: Prior_purchases, dtype: int64

In [314]:
test_data['Prior_purchases'].unique()

array([nan, 2.0, 3.0, 4.0, 6.0, 8.0, 5.0, 7.0, 10.0], dtype=object)

In [317]:
test_data['Prior_purchases'] = test_data['Prior_purchases'].astype('str')

In [318]:
test_data['Prior_purchases'] = test_data['Prior_purchases'].str.strip(' ')

In [322]:
test_data['Prior_purchases'].unique()

array(['nan', '2.0', '3.0', '4.0', '6.0', '8.0', '5.0', '7.0', '10.0'],
      dtype=object)

In [323]:
test_data['Prior_purchases'] = test_data['Prior_purchases'].replace('nan', '4.0')

In [326]:
test_data['Prior_purchases'] = test_data['Prior_purchases'].astype('float64')

In [354]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   int64  
 1   Mode_of_Shipment     4000 non-null   int64  
 2   Customer_care_calls  4000 non-null   float64
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   int64  
 7   Gender               4000 non-null   int64  
 8   Discount_offered     4000 non-null   float64
 9   Weight_in_gms        4000 non-null   object 
dtypes: float64(3), int64(6), object(1)
memory usage: 343.8+ KB


In [335]:
test_data['Product_importance'].unique()

array([2, 1, 0], dtype=int64)

In [332]:
test_data['Product_importance'] = test_data['Product_importance'].replace('mediumm', 'medium')
test_data['Product_importance'] = test_data['Product_importance'].replace('?', 'low')

In [334]:
test_data['Product_importance'] = test_data['Product_importance'].replace('medium', 1)
test_data['Product_importance'] = test_data['Product_importance'].replace('low', 0)
test_data['Product_importance'] = test_data['Product_importance'].replace('high', 2)

In [339]:
test_data['Gender'].unique()

array([0, 1], dtype=int64)

In [338]:
test_data['Gender'] = test_data['Gender'].replace('M', 0)
test_data['Gender'] = test_data['Gender'].replace('F', 1)

In [345]:
test_data['Discount_offered'].unique()

array(['nan', '7.0', '5.0', '64.0', '4.0', '29.0', '10.0', '8.0', '31.0',
       '53.0', '1.0', '22.0', '49.0', '3.0', '52.0', '6.0', '9.0', '20.0',
       '61.0', '2.0', '39.0', '43.0', '48.0', '54.0', '26.0', '42.0',
       '46.0', '41.0', '16.0', '13.0', '15.0', '35.0', '28.0', '34.0',
       '33.0', '37.0', '27.0', '36.0', '63.0', '40.0', '62.0', '38.0',
       '25.0', '51.0', '45.0', '32.0', '50.0', '21.0', '58.0', '65.0',
       '57.0', '17.0', '24.0', '12.0', '47.0', '56.0', '60.0', '59.0',
       '30.0', '18.0', '55.0', '19.0', '11.0', '23.0', '44.0', '14.0'],
      dtype=object)

In [343]:
test_data['Discount_offered'] = test_data['Discount_offered'].astype('str')

In [344]:
test_data['Discount_offered'] = test_data['Discount_offered'].str.strip(' ')

In [353]:
test_data['Discount_offered'].describe()

count    4000.000000
mean       13.147000
std        11.432085
min         1.000000
25%         7.000000
50%        13.000000
75%        13.000000
max        65.000000
Name: Discount_offered, dtype: float64

In [350]:
test_data['Discount_offered'] = test_data['Discount_offered'].replace('nan', '13')

In [352]:
test_data['Discount_offered'] = test_data['Discount_offered'].astype('float64')

In [360]:
test_data['Weight_in_gms'] = test_data['Weight_in_gms'].str.strip(' ')

In [364]:
test_data['Weight_in_gms'].value_counts()

?       252
4883      9
5724      7
5783      6
4410      6
       ... 
5953      1
2774      1
4084      1
5864      1
3869      1
Name: Weight_in_gms, Length: 2452, dtype: int64

In [366]:
test_data['Weight_in_gms'] = test_data['Weight_in_gms'].replace('?', 3600)

In [367]:
test_data['Weight_in_gms'].describe()

count     4000
unique    2452
top       3600
freq       252
Name: Weight_in_gms, dtype: int64

In [369]:
test_data['Weight_in_gms'] = test_data['Weight_in_gms'].astype('int64')

In [370]:
test_data.info()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 7000 to 10999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   int64  
 1   Mode_of_Shipment     4000 non-null   int64  
 2   Customer_care_calls  4000 non-null   float64
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   int64  
 7   Gender               4000 non-null   int64  
 8   Discount_offered     4000 non-null   float64
 9   Weight_in_gms        4000 non-null   int64  
dtypes: float64(3), int64(7)
memory usage: 343.8 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6999 entries, 1 to 6999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6999

In [382]:
X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
X_test = test_data

In [383]:
print('훈련용문제 : ', X_train.shape)
print('훈련용답 : ', y_train.shape)
print('테스트용문제 : ', X_test.shape)

훈련용문제 :  (6999, 10)
훈련용답 :  (6999,)
테스트용문제 :  (4000, 10)


In [851]:
# 모델생성
tree_model = DecisionTreeClassifier()
# 모델 학습
tree_model.fit(X_train2, y_train2)
# 예측
pre = tree_model.predict(X_test2)
pre

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

In [392]:
result = pd.read_csv('./data/bigdatasc/sampleSubmission.csv')
result

Unnamed: 0,ID,Reached.on.Time_Y.N
0,7000,
1,7001,
2,7002,
3,7003,
4,7004,
...,...,...
3995,10995,
3996,10996,
3997,10997,
3998,10998,


In [852]:
result['Reached.on.Time_Y.N'] = pre

In [853]:
result.to_csv('top5_sampleSubmission.csv', index=False)

In [391]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6999 entries, 1 to 6999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6999 non-null   int64  
 1   Mode_of_Shipment     6999 non-null   int64  
 2   Customer_care_calls  6999 non-null   float64
 3   Customer_rating      6999 non-null   int64  
 4   Cost_of_the_Product  6999 non-null   int64  
 5   Prior_purchases      6999 non-null   float64
 6   Product_importance   6999 non-null   int64  
 7   Gender               6999 non-null   int64  
 8   Discount_offered     6999 non-null   float64
 9   Weight_in_gms        6999 non-null   int64  
 10  Reached.on.Time_Y.N  6999 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 914.2 KB


In [839]:
forest_model = RandomForestClassifier()

In [840]:
forest_model.fit(X_train2, y_train2)

RandomForestClassifier()

In [841]:
rf_result = cross_val_score(forest_model, X_train2, y_train2, cv = 5)

In [842]:
rf_result.mean()

0.6502394567548249

In [843]:
model = RandomForestClassifier(random_state = 33)

In [844]:
params = {'n_estimators' : [50, 70, 100, 150], 
          'max_depth' : [3, 5, 7], 
          'max_leaf_nodes' : [2, 4, 6],
         'min_samples_split' : [3, 5, 7]}

In [845]:
grid = GridSearchCV(model, params, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [846]:
grid.fit(X_train2, y_train2)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=33), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7], 'max_leaf_nodes': [2, 4, 6],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [50, 70, 100, 150]},
             scoring='accuracy')

In [847]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.6666685387521699
{'max_depth': 3, 'max_leaf_nodes': 4, 'min_samples_split': 3, 'n_estimators': 150}
RandomForestClassifier(max_depth=3, max_leaf_nodes=4, min_samples_split=3,
                       n_estimators=150, random_state=33)


In [835]:
grid.best_index_

0

In [440]:
from sklearn.ensemble import AdaBoostClassifier

In [508]:
ada_clf = AdaBoostClassifier(n_estimators = 100, random_state = 42)

In [509]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100, random_state=42)

In [510]:
pre_ada = ada_clf.predict(X_test)

In [511]:
pre_ada

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [512]:
result['Reached.on.Time_Y.N'] = pre_ada

In [513]:
result.to_csv('top3_sampleSubmission.csv', index=False)

In [527]:
train_data.corr()['Reached.on.Time_Y.N'].abs().sort_values(ascending=False)

Reached.on.Time_Y.N    1.000000
Discount_offered       0.278709
Weight_in_gms          0.259856
Customer_care_calls    0.062842
Prior_purchases        0.057114
Product_importance     0.020316
Cost_of_the_Product    0.012811
Customer_rating        0.011275
Mode_of_Shipment       0.005801
Warehouse_block        0.003872
Gender                 0.000378
Name: Reached.on.Time_Y.N, dtype: float64

In [848]:
X_train2 = train_data[['Warehouse_block','Customer_care_calls','Cost_of_the_Product','Discount_offered','Weight_in_gms','Product_importance']]
y_train2 = train_data['Reached.on.Time_Y.N']

In [849]:
X_test2 = test_data[['Warehouse_block','Customer_care_calls','Cost_of_the_Product','Discount_offered','Weight_in_gms','Product_importance']]

In [850]:
X_test2

Unnamed: 0_level_0,Warehouse_block,Customer_care_calls,Cost_of_the_Product,Discount_offered,Weight_in_gms,Product_importance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7000,4,3.0,274,13.0,4352,2
7001,4,3.0,136,13.0,1056,1
7002,0,4.0,140,7.0,5383,0
7003,2,4.0,291,13.0,1880,0
7004,4,4.0,147,5.0,5174,0
...,...,...,...,...,...,...
10995,0,4.0,204,13.0,1667,0
10996,2,4.0,195,13.0,3869,1
10997,1,4.0,206,7.0,4531,1
10998,2,6.0,255,7.0,1869,0


In [820]:
pre_fr = forest_model.predict(X_test2)
pre_fr

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)