# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

# 使用 K-fold Cross-validation 來切分資料

In [2]:
import numpy as np
X = np.arange(50).reshape(10, 5) # 生成從 0 到 50 的 array，並 reshape 成 (10, 5) 的 matrix
y = np.zeros(10) # 生成一個全零 arrary
y[:5] = 1 # 將一半的值改為 1
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (10, 5)
Shape of y:  (10,)


In [5]:
from sklearn.model_selection import train_test_split, KFold
kf = KFold(n_splits=5)
kf.split(X)
for train_index, test_index in kf.split(X):
    print()

<generator object _BaseKFold.split at 0x0000000008B789A8>

In [6]:
kf = KFold(n_splits=5)
i = 0
for train_index, test_index in kf.split(X):
    i +=1 
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("index: ", train_index)
    print("index2: ", test_index)
    print("FOLD {}: ".format(i))
    print("X_test: ", X_test)
    print("Y_test: ", y_test)
    print("-"*30)

index:  [2 3 4 5 6 7 8 9]
index2:  [0 1]
FOLD 1: 
X_test:  [[0 1 2 3 4]
 [5 6 7 8 9]]
Y_test:  [1. 1.]
------------------------------
index:  [0 1 4 5 6 7 8 9]
index2:  [2 3]
FOLD 2: 
X_test:  [[10 11 12 13 14]
 [15 16 17 18 19]]
Y_test:  [1. 1.]
------------------------------
index:  [0 1 2 3 6 7 8 9]
index2:  [4 5]
FOLD 3: 
X_test:  [[20 21 22 23 24]
 [25 26 27 28 29]]
Y_test:  [1. 0.]
------------------------------
index:  [0 1 2 3 4 5 8 9]
index2:  [6 7]
FOLD 4: 
X_test:  [[30 31 32 33 34]
 [35 36 37 38 39]]
Y_test:  [0. 0.]
------------------------------
index:  [0 1 2 3 4 5 6 7]
index2:  [8 9]
FOLD 5: 
X_test:  [[40 41 42 43 44]
 [45 46 47 48 49]]
Y_test:  [0. 0.]
------------------------------


In [90]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
iris = datasets.load_iris()

In [91]:
iris.target.shape

(150,)

# 流程:
- 先把資料切成 train data(3/4)、 test data(1/4)
- 把train data 用Cross validation(避免使model依賴同一組train data，泛化) 分別套用在Grid search找最佳參數
- 統計每一輪cross validation的參數，選擇出現最多次數的那組參數
- 套用model在test data

In [117]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state = 30)
clf = GradientBoostingClassifier()

print(x_train.shape)
print(x_test.shape)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

(112, 4)
(38, 4)
0.9473684210526315


In [118]:
kf = KFold(n_splits=5)
n_estimators = [50, 75, 100, 125, 150]
max_depth = [1,2,3,4,5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

i = 0
for train_index, test_index in kf.split(x_train):
    i +=1 
    X_train_tmp, X_test_tmp = x_train[train_index], x_train[test_index]
    y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]
#     print("index: ", train_index)
#     print("index2: ", test_index)
    
    clf = GradientBoostingClassifier()
#     clf.fit(X_train_tmp, y_train_tmp)
#     y_pred_tmp = clf.predict(X_test_tmp)
#     accuracy_tmp = metrics.accuracy_score(y_test_tmp, y_pred_tmp)
    
    ## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
    grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
    # 開始搜尋最佳參數
    grid_result = grid_search.fit(X_train_tmp, y_train_tmp)
    
    # 使用最佳參數重新建立模型
    clf_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                               n_estimators=grid_result.best_params_['n_estimators'])

    # 訓練模型
    clf_bestparam.fit(X_train_tmp, y_train_tmp)

    # 預測測試集
    y_pred_tmp = clf_bestparam.predict(X_test_tmp)
    
    print("FOLD {}: ".format(i))
    print("X_test: ", X_train_tmp.shape)
    print("Y_test: ", X_test_tmp.shape)
    print("accuracy_tmp: ", metrics.accuracy_score(y_test_tmp, y_pred_tmp))
    print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    print("-"*30)



Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   29.2s finished


FOLD 1: 
X_test:  (89, 4)
Y_test:  (23, 4)
accuracy_tmp:  0.9130434782608695
Best Accuracy: -0.033708 using {'max_depth': 1, 'n_estimators': 75}
------------------------------
Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 out of  75 | elapsed:    7.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    7.7s finished


FOLD 2: 
X_test:  (89, 4)
Y_test:  (23, 4)
accuracy_tmp:  0.9565217391304348
Best Accuracy: -0.033708 using {'max_depth': 1, 'n_estimators': 75}
------------------------------
Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    5.6s finished


FOLD 3: 
X_test:  (90, 4)
Y_test:  (22, 4)
accuracy_tmp:  1.0
Best Accuracy: -0.044444 using {'max_depth': 2, 'n_estimators': 50}
------------------------------
Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 out of  75 | elapsed:    5.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    6.4s finished


FOLD 4: 
X_test:  (90, 4)
Y_test:  (22, 4)
accuracy_tmp:  1.0
Best Accuracy: -0.044444 using {'max_depth': 2, 'n_estimators': 50}
------------------------------
Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 out of  75 | elapsed:    4.8s remaining:    0.4s


FOLD 5: 
X_test:  (90, 4)
Y_test:  (22, 4)
accuracy_tmp:  0.9090909090909091
Best Accuracy: -0.022222 using {'max_depth': 1, 'n_estimators': 50}
------------------------------


[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    5.2s finished


# 由每一輪的Cross Validation可以得知
- 第2次 grid_search 參數:{max_depth:2 , n_estimators: 50}  的結果最準，Best Accuracy= 1.0

In [124]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state = 30)
clf = GradientBoostingClassifier(max_depth=2, n_estimators=50)

print(x_train.shape)
print(x_test.shape)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

(112, 4)
(38, 4)
0.9473684210526315


# grid search使用參數: {max_depth:2 , n_estimators: 50} 得到最高的accuracy: 0.947

# 隨機搜索參數調整
- 時隨機從給定區間中選擇參數是很有效的方法，然後根據這些參數來評估算法的效果進而選擇最佳的那個
- https://www.jishuwen.com/d/2vQJ/zh-tw

In [125]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand

x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state = 30)
clf = GradientBoostingClassifier()

print(x_train.shape)
print(x_test.shape)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

(112, 4)
(38, 4)
0.9473684210526315


In [112]:
from random import uniform

sp_rand()
np.floor(1.5)
randint(1, 100)
# uniform(0,3)
np.random.randint(10, 15, (1, 5))
np.random.randint(10, 15, (1, 1))

array([[12]])

In [127]:
from random import randint
from scipy.stats import uniform as sp_rand
from scipy.stats import randint as sp_randint

kf = KFold(n_splits=5)
# n_estimators = [50, 75, 100, 125, 150]
# max_depth = [1,2,3,4,5]
param_grid = dict(n_estimators= sp_randint(1,100), max_depth= sp_randint(1,10))

i = 0
for train_index, test_index in kf.split(x_train):
    i +=1 
    X_train_tmp, X_test_tmp = x_train[train_index], x_train[test_index]
    y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]
#     print("index: ", train_index)
#     print("index2: ", test_index)
    
    clf = GradientBoostingClassifier()
#     clf.fit(X_train_tmp, y_train_tmp)
#     y_pred_tmp = clf.predict(X_test_tmp)
#     accuracy_tmp = metrics.accuracy_score(y_test_tmp, y_pred_tmp)
    
    ## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
#     grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

    rand_search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=100)


    # 開始搜尋最佳參數
    rand_result = rand_search.fit(X_train_tmp, y_train_tmp)
    print(rand_search)
    print("param_grid {}: ", param_grid)

    
    # 使用最佳參數重新建立模型
    clf_bestparam = GradientBoostingClassifier(max_depth=rand_result.best_params_['max_depth'],
                                               n_estimators=rand_result.best_params_['n_estimators'])

    # 訓練模型
    clf_bestparam.fit(X_train_tmp, y_train_tmp)

    # 預測測試集
    y_pred_tmp = clf_bestparam.predict(X_test_tmp)
    
    print("FOLD {}: ".format(i))
    print("X_test: ", X_train_tmp.shape)
    print("Y_test: ", X_test_tmp.shape)
    print("accuracy_tmp: ", metrics.accuracy_score(y_test_tmp, y_pred_tmp))
    print("Best Accuracy: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
    print("-"*30)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3E0550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
param_grid {}:  {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_dep



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3E0550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
param_grid {}:  {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_dep



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3E0550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
param_grid {}:  {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_dep



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3E0550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
param_grid {}:  {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_dep



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3E0550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
param_grid {}:  {'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000A3D4DD8>, 'max_dep

# 由每一輪的Cross Validation可以得知
- 第5次 random_search 得到參數:{max_depth:2 , n_estimators: 38}  最準，Best Accuracy = 0.9777

In [129]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state = 30)
clf = GradientBoostingClassifier(max_depth=2, n_estimators=38)

print(x_train.shape)
print(x_test.shape)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

(112, 4)
(38, 4)
0.9473684210526315


# random search使用參數: {max_depth:2 , n_estimators: 38} 得到最高的accuracy: 0.947