### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
# Load the breast-cancer dataset and hold out 20% of the samples for testing.
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size = 0.2, random_state = 5)

# Baseline: gradient boosting with default hyper-parameters.
# random_state is fixed because the trees permute the features at every split,
# so an unseeded model can report a different accuracy on each run.
clf = GradientBoostingClassifier(random_state = 5)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy : ", accuracy_score(y_test, y_pred))

Accuracy :  0.9736842105263158


In [5]:
# Exhaustive grid search over 5 x 5 x 5 = 125 hyper-parameter combinations.
n_estimators = [50, 100, 150, 200, 250]
max_depth = [1, 3, 5, 7, 9]
learning_rate = [0.06, 0.08, 0.1, 0.2, 0.25]
params_grid = dict(n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate)

# Search over a fresh, seeded estimator instead of reusing the already-fitted
# baseline `clf`: the cell no longer depends on that variable's state and the
# cross-validation scores are reproducible. cv is spelled out so the number of
# folds does not depend on the installed scikit-learn version's default.
grid_search = GridSearchCV(GradientBoostingClassifier(random_state = 5), params_grid,
                           scoring = "accuracy", cv = 5, n_jobs = -1, verbose = 1)
grid_result = grid_search.fit(x_train, y_train)
print("Best score = %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Best score = 0.973626 using {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 250}


In [7]:
# GridSearchCV refits the winning configuration on the full training set
# (refit=True is the default), so reuse that model rather than rebuilding one
# by hand. This evaluates exactly the estimator that was selected and keeps
# every estimator setting, not only the three searched parameters.
clf_bestparam = grid_result.best_estimator_
y_pred = clf_bestparam.predict(x_test)
print("Accuracy with best params : ", accuracy_score(y_test, y_pred))

Accuracy with best params :  0.9736842105263158


In [9]:
# Second experiment: the same workflow on the wine dataset.
from sklearn.datasets import load_wine
wine = load_wine()

# NOTE: x_train / x_test / y_train / y_test and clf are reassigned here, so
# from this cell on they refer to the wine data, not the breast-cancer data.
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size = 0.2, random_state = 5)

# Baseline with default hyper-parameters; random_state is fixed because the
# trees permute the features at every split, which otherwise makes the
# reported accuracy vary between runs.
clf = GradientBoostingClassifier(random_state = 5)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy : ", accuracy_score(y_test, y_pred))

Accuracy :  0.9166666666666666


In [11]:
# Exhaustive grid search over 5 x 5 x 5 = 125 hyper-parameter combinations.
n_estimators = [150, 200, 250, 300, 350]
learning_rate = [0.05, 0.1, 0.15, 0.2, 0.25]
max_depth = [1, 3, 5, 7, 9]
params_grid = dict(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth)

# Search over a fresh, seeded estimator instead of reusing the already-fitted
# baseline `clf`: the cell no longer depends on that variable's state and the
# cross-validation scores are reproducible. cv is spelled out so the number of
# folds does not depend on the installed scikit-learn version's default.
grid_search = GridSearchCV(GradientBoostingClassifier(random_state = 5), params_grid,
                           scoring = "accuracy", cv = 5, n_jobs = -1, verbose = 1)
grid_result = grid_search.fit(x_train, y_train)
print("Best score = %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Best score = 0.958128 using {'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200}


In [12]:
# GridSearchCV refits the winning configuration on the full training set
# (refit=True is the default), so reuse that model rather than rebuilding one
# by hand. This evaluates exactly the estimator that was selected and keeps
# every estimator setting, not only the three searched parameters.
clf_bestparams = grid_result.best_estimator_
y_pred = clf_bestparams.predict(x_test)
print("Accuracy with best params : ", accuracy_score(y_test, y_pred))

Accuracy with best params :  0.9722222222222222
