In [27]:
import pandas as pd
import numpy as np
import copy
import warnings
from sklearn import linear_model, metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including library deprecation and data-shape
# warnings raised by later cells. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the competition CSV files, relative to this notebook.
data_path = '../data/data-science-london-scikit-learn/'

# header=None: the files are read without a header row, so the columns get
# integer labels (0, 1, 2, ...).
train, train_label, test = (
    pd.read_csv(data_path + file_name, header=None)
    for file_name in ('train.csv', 'trainLabels.csv', 'test.csv')
)

train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.850465,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332


In [3]:
# Dimensions of the unlabeled test set as a (rows, columns) tuple.
test.shape

(9000, 40)

In [4]:
# Inspect the target values to decide whether this is a regression or a
# classification problem (count of rows per distinct label).
train_label[0].value_counts()

1    510
0    490
Name: 0, dtype: int64

In [5]:
# Report every training column that has at least one missing value
# (no output at all means nothing is missing).
for column in train.columns:
    if train[column].isnull().any():
        print('Column: "%s" contains null' % column)

In [6]:
# Correlation of every feature with the target.
# The label is appended to a copy of the features as one extra column. Its name
# is the number of feature columns, i.e. the next free integer label (40 for
# the 40-feature training set), so the original `train` frame is left untouched.
target_col = train.shape[1]
train_and_target = train.copy()
# Assign the single label column as a Series rather than a whole DataFrame.
train_and_target[target_col] = train_label[0]
corr_target = train_and_target.corr()[target_col]
corr_target

0    -0.028058
1    -0.052153
2    -0.005618
3    -0.007439
4     0.150652
5     0.015151
6    -0.252976
7     0.040523
8     0.037534
9     0.026841
10   -0.010233
11   -0.075276
12    0.460738
13    0.018214
14    0.476171
15   -0.060556
16    0.012234
17   -0.018021
18    0.289508
19    0.046036
20   -0.075135
21   -0.018645
22    0.046953
23    0.172938
24   -0.017121
25   -0.036663
26    0.064633
27    0.019778
28    0.207949
29   -0.012022
30   -0.015797
31    0.012458
32   -0.227027
33   -0.053928
34    0.236116
35    0.012533
36   -0.231733
37    0.039376
38   -0.019690
39    0.264656
40    1.000000
Name: 40, dtype: float64

In [15]:
# Hold out 25% of the labelled rows for validation (fixed seed for a repeatable split).
# train_label is a single-column DataFrame; pass column 0 as a 1-D Series so the
# estimators receive y with shape (n_samples,) instead of a column vector
# (n_samples, 1), which scikit-learn flags with a DataConversionWarning.
x_train, x_test, y_train, y_test = train_test_split(
    train, train_label[0], test_size=0.25, random_state=111
)

In [21]:
# Baseline: accuracy of a logistic regression on the hold-out split.
# fit() returns the estimator itself, so construction and training are chained.
reg = linear_model.LogisticRegression().fit(x_train, y_train)

y_pred = reg.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.808


In [22]:
# Evaluate accuracy with a random forest on the hold-out split.
# random_state pins the bootstrap sampling and feature selection so the reported
# accuracy is reproducible after a kernel restart.
clf = RandomForestClassifier(n_estimators=20, max_depth=4, random_state=111)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.796


In [25]:
# Evaluate accuracy with gradient boosting on the hold-out split (Day 46).
# The loss is left at the classifier's default: the explicit loss="deviance"
# spelling is deprecated and later removed in scikit-learn, while the default
# selects the same loss on every version.
# random_state makes the reported accuracy reproducible.
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=111)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Acuuracy:  0.828


在未對資料進行任何前處理的情況下，三個模型中以 `GradientBoostingClassifier` 的表現最佳（準確率 0.828，高於邏輯斯迴歸的 0.808 與隨機森林的 0.796），因此後續以它為基礎調整超參數。

In [29]:
# Tune the gradient-boosting hyperparameters (Day 47).

# Hyperparameter values to combine: 5 learning rates x 4 ensemble sizes = 20 candidates.
learning_rate = [0.05, 0.1, 0.25, 0.5, 0.75]
n_estimators = [25, 50, 100, 125]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

# Build the search object from an explicitly constructed estimator instead of
# whatever `clf` happens to be bound to in the kernel (an earlier cell binds the
# same name to a random forest, which has no `learning_rate` parameter).
# cv=3 is stated explicitly because the default fold count differs between
# scikit-learn versions; n_jobs=-1 uses every CPU core in parallel.
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=111),
    param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search for the best parameter combination.
grid_result = grid_search.fit(x_train, y_train)

# 3-fold cross-validation over 20 parameter combinations -> 60 model fits in total.

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.6s finished


In [30]:
# Print the best cross-validated score together with the parameters that produced it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best Accuracy: %f using %s" % (best_score, best_params))

Best Accuracy: 0.873333 using {'learning_rate': 0.25, 'n_estimators': 100}


In [31]:
# Parameter combination selected by the grid search, as a dict.
grid_result.best_params_

{'learning_rate': 0.25, 'n_estimators': 100}

In [36]:
# Rebuild the model with the best parameters found by the grid search.
# Unpacking best_params_ keeps this cell in sync with the grid: any parameter
# added to the search later is forwarded automatically.
# random_state makes the hold-out accuracy reproducible.
clf_bestparam = GradientBoostingClassifier(random_state=111, **grid_result.best_params_)

clf_bestparam.fit(x_train, y_train)
y_pred = clf_bestparam.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Acuuracy:  0.88


In [37]:
# Predict labels for the unlabeled test set with the tuned model.
submit_pred = clf_bestparam.predict(test)

# Submission table: a 1-based Id per test row next to its predicted label.
n_rows = len(test)
df_pred = pd.DataFrame({'Id': np.arange(1, n_rows + 1), 'Solution': submit_pred})

# Quick look at the first predictions and the overall table size.
print(df_pred.head(20))
print(df_pred.shape)

# Write the submission file without the DataFrame index column.
df_pred.to_csv("data-science-london-scikit-learn_pred.csv", index=False)

    Id  Solution
0    1         1
1    2         0
2    3         0
3    4         0
4    5         0
5    6         0
6    7         0
7    8         1
8    9         0
9   10         0
10  11         1
11  12         1
12  13         1
13  14         0
14  15         0
15  16         1
16  17         1
17  18         1
18  19         1
19  20         1
(9000, 2)
