In [174]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

from matplotlib import rcParams

In [175]:
df_train = pd.read_csv('./train.csv')
df_train_copy=df_train.copy()

df_test=pd.read_csv('./test.csv')
df_test_copy=df_test.copy()
f_test_ID=df_test.iloc[:,0]

df_test_y=pd.read_csv('./sampleSubmission.csv')
df_test_y_copy=df_test_y.copy()

print (df_train_copy.shape)
print (df_test_copy.shape)
print (df_test_y_copy.shape)
print(df_train_copy.columns)
print(df_test_copy.columns)

(108800, 11)
(27200, 10)
(27200, 2)
Index(['user_id', 'device_id', 'age', 'sex', 'browser', 'source', 'country',
       'purchase_value', 'signup_time', 'purchase_time', 'class'],
      dtype='object')
Index(['user_id', 'device_id', 'age', 'sex', 'browser', 'source', 'country',
       'purchase_value', 'signup_time', 'purchase_time'],
      dtype='object')


In [176]:
#train data & test data合併再做資料整理，避免不一玫
combine_data=pd.concat((df_train_copy.loc[:,'user_id':'purchase_time'],
                     df_test_copy.loc[:,'user_id':'purchase_time']))
print(f'combine_data.shape={combine_data.shape}')
combine_data.info()

combine_data.shape=(136000, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 136000 entries, 0 to 27199
Data columns (total 10 columns):
user_id           136000 non-null object
device_id         136000 non-null object
age               136000 non-null int64
sex               136000 non-null int64
browser           136000 non-null object
source            136000 non-null object
country           116234 non-null object
purchase_value    136000 non-null int64
signup_time       136000 non-null object
purchase_time     136000 non-null object
dtypes: int64(3), object(7)
memory usage: 11.4+ MB


## 開始整理資料

In [177]:
#1.將'country'之NA值other取代
combine_data['country'] = combine_data['country'].fillna(value='Other')
combine_data

Unnamed: 0,user_id,device_id,age,sex,browser,source,country,purchase_value,signup_time,purchase_time
0,6b5aecb444b26,7a543b4bf3647,33,0,Safari,Ads,United States,48,2018-05-27 12:02:29,2018-08-19 14:09:55
1,f00edc3db68f0,c35ab03e7ff0f,26,1,Chrome,Direct,European Union,50,2018-06-06 08:09:12,2018-09-09 01:26:43
2,6f144a20b2e41,31a274c4b1d58,28,0,Chrome,Direct,United States,31,2018-03-29 18:24:17,2018-06-13 04:56:21
3,21db12c470157,74e7d74dbd61d,50,0,FireFox,Ads,United States,31,2018-06-18 02:49:33,2018-06-29 23:31:45
4,af1753dfd703c,b5f31ea91ae62,27,1,IE,SEO,United States,16,2018-01-31 18:26:38,2018-02-13 16:36:51
...,...,...,...,...,...,...,...,...,...,...
27195,cd16362b71572,2515db9e4437e,29,1,IE,SEO,United States,48,2018-03-08 05:02:33,2018-05-05 23:04:43
27196,132fb9f609701,58511ac9e2ebb,42,0,Safari,Ads,France,12,2018-05-15 23:23:29,2018-05-19 06:35:14
27197,4c6d7c325963f,8964657427d3d,41,1,Chrome,Ads,Other,26,2018-06-08 15:39:11,2018-06-13 17:24:44
27198,61daa6f70df29,df795a2402c3f,36,1,IE,Ads,Romania,17,2018-06-25 12:47:27,2018-08-27 19:17:54


In [178]:
#2.將'user_id' & 'device_id' dropout
combine_data.drop(['user_id','device_id'],inplace = True,axis=1)
print(combine_data.columns)
print(combine_data.shape)

Index(['age', 'sex', 'browser', 'source', 'country', 'purchase_value',
       'signup_time', 'purchase_time'],
      dtype='object')
(136000, 8)


In [179]:
#3.將'browser','source','country'做one-hot-encoding

pd.get_dummies(combine_data['browser'])
onehot_encoding=pd.get_dummies(combine_data['browser'],prefix='browser')
combine_data=combine_data.drop('browser',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)


pd.get_dummies(combine_data['source'])
onehot_encoding=pd.get_dummies(combine_data['source'],prefix='source')
combine_data=combine_data.drop('source',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['country'])
onehot_encoding=pd.get_dummies(combine_data['country'],prefix='country')
combine_data=combine_data.drop('country',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

In [180]:
print(combine_data.shape)
#combine_data.columns.values

(136000, 194)


In [181]:
#4 分割'signup_time' & 'purchase_time'
combine_signup_time = []
combine_signup_day=[]
for each_sig in combine_data['signup_time']:
        if each_sig.find('checkpoint') == -1:
            combine_signup_time.append(each_sig.split(" ")[-1])
            combine_signup_day.append(each_sig.split(" ")[0])

            
combine_purchase_time = []
combine_purchase_day=[]
for each_pur in combine_data['purchase_time']:
        if each_pur.find('checkpoint') == -1:
            combine_purchase_time.append(each_pur.split(" ")[-1])
            combine_purchase_day.append(each_pur.split(" ")[0])
            

In [182]:
combine_data['S_Time']=combine_signup_time
combine_data['S_Date']=combine_signup_day
combine_data['P_Time']=combine_purchase_time
combine_data['P_Date']=combine_purchase_day
print(combine_data.shape)

(136000, 198)


In [183]:
#4.1 將S_Time & P_Time 取'時'出來
x_S_Time_hour=[]
for each_S_Time in combine_data['S_Time']:
        if each_S_Time.find('checkpoint') == -1:
            x_S_Time_hour.append(each_S_Time.split(":")[0])
            
#4.2 將P_Time 取'時'出來
x_P_Time_hour=[]
for each_P_Time in combine_data['P_Time']:
        if each_P_Time.find('checkpoint') == -1:
            x_P_Time_hour.append(each_P_Time.split(":")[0])
            



In [184]:
combine_data['S_Time']=x_S_Time_hour 
combine_data['P_Time']=x_P_Time_hour 

In [185]:
#4.1.1 將S_Time & P_Time 取'分'出來

x_S_Time_min=[]
for each_S_Time in combine_data['S_Time']:
        if each_S_Time.find('checkpoint') == -1:
            x_S_Time_min.append(each_S_Time.split(":")[0])
            
x_P_Time_min=[]
for each_P_Time in combine_data['P_Time']:
        if each_P_Time.find('checkpoint') == -1:
            x_P_Time_min.append(each_P_Time.split(":")[0])

In [186]:
combine_data['S_Time_min']=x_S_Time_min
combine_data['P_Time_min']=x_P_Time_min

In [187]:
#4.3 將S_Date & P_Date 取'月' & '日'出來
x_S_Date_month=[]
x_S_Date_day=[]
for each_S_Date in combine_data['S_Date']:
        if each_S_Date.find('checkpoint') == -1:
            x_S_Date_month.append(each_S_Date.split("-")[1])
            x_S_Date_day.append(each_S_Date.split("-")[2])

x_P_Date_month=[]
x_P_Date_day=[]
for each_P_Date in combine_data['P_Date']:
        if each_P_Date.find('checkpoint') == -1:
            x_P_Date_month.append(each_P_Date.split("-")[1])
            x_P_Date_day.append(each_P_Date.split("-")[2])

In [188]:
combine_data['S_Date_month']= x_S_Date_month
combine_data['S_Date_day']= x_S_Date_day
combine_data['P_Date_month']= x_P_Date_month
combine_data['P_Date_day']= x_P_Date_day

In [189]:
#4.4.將'S_Date' & 'P_Date' dropout
combine_data.drop(['S_Date','P_Date'],inplace = True,axis=1)

In [190]:
combine_data

Unnamed: 0,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,...,signup_time,purchase_time,S_Time,P_Time,S_Time_min,P_Time_min,S_Date_month,S_Date_day,P_Date_month,P_Date_day
0,0,0,0,0,0,0,0,0,0,0,...,2018-05-27 12:02:29,2018-08-19 14:09:55,12,14,12,14,05,27,08,19
1,0,0,0,0,0,0,0,0,0,0,...,2018-06-06 08:09:12,2018-09-09 01:26:43,08,01,08,01,06,06,09,09
2,0,0,0,0,0,0,0,0,0,0,...,2018-03-29 18:24:17,2018-06-13 04:56:21,18,04,18,04,03,29,06,13
3,0,0,0,0,0,0,0,0,0,0,...,2018-06-18 02:49:33,2018-06-29 23:31:45,02,23,02,23,06,18,06,29
4,0,0,0,0,0,0,0,0,0,0,...,2018-01-31 18:26:38,2018-02-13 16:36:51,18,16,18,16,01,31,02,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27195,0,0,0,0,0,0,0,0,0,0,...,2018-03-08 05:02:33,2018-05-05 23:04:43,05,23,05,23,03,08,05,05
27196,0,0,0,0,0,0,0,0,0,0,...,2018-05-15 23:23:29,2018-05-19 06:35:14,23,06,23,06,05,15,05,19
27197,0,0,0,0,0,0,0,0,0,0,...,2018-06-08 15:39:11,2018-06-13 17:24:44,15,17,15,17,06,08,06,13
27198,0,0,0,0,0,0,0,0,0,0,...,2018-06-25 12:47:27,2018-08-27 19:17:54,12,19,12,19,06,25,08,27


In [191]:
#5.將'signup_time' & 'purchase_time' dropout
combine_data.drop(['signup_time','purchase_time'],inplace = True,axis=1)
print(combine_data.shape)

(136000, 200)


In [192]:
combine_data.columns.values

array(['country_Afghanistan', 'country_Albania', 'country_Algeria',
       'country_Angola', 'country_Antigua and Barbuda',
       'country_Argentina', 'country_Armenia', 'country_Australia',
       'country_Austria', 'country_Azerbaijan', 'country_Bahamas',
       'country_Bahrain', 'country_Bangladesh', 'country_Barbados',
       'country_Belarus', 'country_Belgium', 'country_Belize',
       'country_Benin', 'country_Bermuda', 'country_Bhutan',
       'country_Bolivia', 'country_Bonaire; Sint Eustatius; Saba',
       'country_Bosnia and Herzegowina', 'country_Botswana',
       'country_Brazil', 'country_British Indian Ocean Territory',
       'country_Brunei Darussalam', 'country_Bulgaria',
       'country_Burkina Faso', 'country_Cambodia', 'country_Cameroon',
       'country_Canada', 'country_Cape Verde', 'country_Cayman Islands',
       'country_Chile', 'country_China', 'country_Colombia',
       'country_Congo', 'country_Congo The Democratic Republic of The',
       'country_Costa

In [195]:
combine_data

Unnamed: 0,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,...,sex,purchase_value,S_Time,P_Time,S_Time_min,P_Time_min,S_Date_month,S_Date_day,P_Date_month,P_Date_day
0,0,0,0,0,0,0,0,0,0,0,...,0,48,12,14,12,14,05,27,08,19
1,0,0,0,0,0,0,0,0,0,0,...,1,50,08,01,08,01,06,06,09,09
2,0,0,0,0,0,0,0,0,0,0,...,0,31,18,04,18,04,03,29,06,13
3,0,0,0,0,0,0,0,0,0,0,...,0,31,02,23,02,23,06,18,06,29
4,0,0,0,0,0,0,0,0,0,0,...,1,16,18,16,18,16,01,31,02,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27195,0,0,0,0,0,0,0,0,0,0,...,1,48,05,23,05,23,03,08,05,05
27196,0,0,0,0,0,0,0,0,0,0,...,0,12,23,06,23,06,05,15,05,19
27197,0,0,0,0,0,0,0,0,0,0,...,1,26,15,17,15,17,06,08,06,13
27198,0,0,0,0,0,0,0,0,0,0,...,1,17,12,19,12,19,06,25,08,27


In [197]:
print(combine_data.shape)

(136000, 200)


In [198]:
#對'S_Time','P_Time','S_Date_month','S_Date_day','P_Date_month','P_Date_day'做one hot encoding
pd.get_dummies(combine_data['S_Time'])
onehot_encoding=pd.get_dummies(combine_data['S_Time'],prefix='S_Time')
combine_data=combine_data.drop('S_Time',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['P_Time'])
onehot_encoding=pd.get_dummies(combine_data['P_Time'],prefix='P_Time')
combine_data=combine_data.drop('P_Time',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['S_Date_month'])
onehot_encoding=pd.get_dummies(combine_data['S_Date_month'],prefix='S_Date_month')
combine_data=combine_data.drop('S_Date_month',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['S_Date_day'])
onehot_encoding=pd.get_dummies(combine_data['S_Date_day'],prefix='S_Date_day')
combine_data=combine_data.drop('S_Date_day',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['P_Date_month'])
onehot_encoding=pd.get_dummies(combine_data['P_Date_month'],prefix='P_Date_month')
combine_data=combine_data.drop('P_Date_month',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

pd.get_dummies(combine_data['P_Date_day'])
onehot_encoding=pd.get_dummies(combine_data['P_Date_day'],prefix='P_Date_day')
combine_data=combine_data.drop('P_Date_day',axis=1)
combine_data=pd.concat([onehot_encoding,combine_data],axis=1)

In [199]:
print(combine_data.shape)

(136000, 324)


In [200]:
combine_data

Unnamed: 0,P_Date_day_01,P_Date_day_02,P_Date_day_03,P_Date_day_04,P_Date_day_05,P_Date_day_06,P_Date_day_07,P_Date_day_08,P_Date_day_09,P_Date_day_10,...,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,age,sex,purchase_value,S_Time_min,P_Time_min
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,33,0,48,12,14
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,26,1,50,08,01
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,28,0,31,18,04
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,50,0,31,02,23
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,27,1,16,18,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27195,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,29,1,48,05,23
27196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,42,0,12,23,06
27197,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,41,1,26,15,17
27198,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,36,1,17,12,19


In [201]:
X_train = combine_data[:108800]
X_test = combine_data[108800:]
y_train = df_train_copy['class'].astype(str).map({'False': 0 ,'True':1})
print('train_y',y_train.shape)
print('test_data',X_test.shape)
print('train_data',X_train.shape)
X_train.head(5)

train_y (108800,)
test_data (27200, 324)
train_data (108800, 324)


Unnamed: 0,P_Date_day_01,P_Date_day_02,P_Date_day_03,P_Date_day_04,P_Date_day_05,P_Date_day_06,P_Date_day_07,P_Date_day_08,P_Date_day_09,P_Date_day_10,...,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,age,sex,purchase_value,S_Time_min,P_Time_min
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,33,0,48,12,14
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,26,1,50,8,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,28,0,31,18,4
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,50,0,31,2,23
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,27,1,16,18,16


## 建模/預測

In [202]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [203]:
random_seed = 91015  # set seed for same train test data split
x_train, x_test, y_train, y_test = train_test_split(X_train,
                                                    y_train, test_size=0.2,
                                                    random_state=random_seed)

In [204]:
print("shape of X_train: ", x_train.shape)

shape of X_train:  (87040, 324)


In [205]:
print("shape of X_test: ", x_test.shape)

shape of X_test:  (21760, 324)


In [206]:
#clf = RandomForestClassifier(n_estimators=20)
clf = RandomForestClassifier(n_estimators=60, criterion="gini", max_depth=None, bootstrap=True, random_state=None)

In [207]:
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=True, random_state=10, verbose=0,
                       warm_start=False)

In [208]:
y_pred = clf.predict(x_test)

In [209]:
accuracy_score(y_test, y_pred)

0.9540441176470589

In [210]:
clf.feature_importances_

array([7.41674325e-03, 1.20340896e-02, 6.40708436e-03, 6.38822785e-03,
       6.30805718e-03, 9.66272772e-03, 8.86288243e-03, 7.02517399e-03,
       4.25285131e-03, 5.36390658e-03, 6.84730778e-03, 1.20288348e-02,
       6.52201013e-04, 1.85154005e-03, 1.21839556e-03, 7.21590942e-04,
       1.12916603e-03, 2.99440342e-03, 3.17360927e-03, 2.46052257e-03,
       3.85441072e-03, 2.90646414e-03, 4.87023561e-03, 5.58261567e-03,
       4.69567226e-03, 5.83063402e-03, 4.49351511e-03, 5.29710133e-03,
       5.73224117e-03, 4.30103683e-03, 3.82304792e-03, 4.55156087e-01,
       1.28535396e-02, 1.52173622e-02, 2.10397833e-02, 1.26006277e-02,
       4.42511332e-03, 4.47669822e-03, 3.66035101e-03, 1.84867649e-03,
       1.45776300e-03, 5.30824040e-04, 7.19160578e-05, 5.32475650e-03,
       9.42634144e-03, 3.79171084e-03, 6.54148331e-03, 7.52339524e-03,
       7.72832050e-03, 1.05280249e-02, 5.72583402e-03, 5.62904003e-03,
       8.45476339e-03, 9.27479099e-03, 7.63511307e-03, 4.50681947e-04,
      

In [211]:
y_pred=clf.predict(X_test)

In [212]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [213]:
submit_file = pd.DataFrame({'user_id': f_test_ID, 'class': y_pred})

In [215]:
submit_file['user_id']=submit_file['user_id'].astype('str')

In [216]:
submit_file['class'][1:220]

1      0
2      0
3      0
4      0
5      0
      ..
215    0
216    0
217    0
218    0
219    0
Name: class, Length: 219, dtype: int64

In [218]:
submit_file.to_csv('cw_credit_2336.csv',index=False)