## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import pandas as pd
import numpy as np

In [2]:
iris = datasets.load_iris()

In [3]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [4]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.3, random_state = 900)
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_pred)))

Accuracy: 0.9777777777777777


In [5]:
clf.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [6]:
# Grid search on max_depth, n_estimator, criterion
# Check accuracy and time for each combination
max_depth = [2, 3, 5, 10]
n_estimators = [50, 100, 200, 300]
criterion = ['friedman_mse', 'mse']
params = dict(max_depth = max_depth, n_estimators = n_estimators, criterion = criterion)

grid_search = GridSearchCV(clf, param_grid = params, cv = 3, verbose = 1)

grid_result = grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   16.1s finished


In [7]:
import warnings
warnings.simplefilter('ignore')

In [8]:
grid_cv_results = pd.DataFrame(grid_result.cv_results_)
grid_cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.056578,0.003789,0.000428,6.4e-05,friedman_mse,2,50,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
1,0.108364,0.005926,0.000504,0.000178,friedman_mse,2,100,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
2,0.182395,0.013797,0.000807,0.000209,friedman_mse,2,200,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
3,0.273815,0.024427,0.000549,2.6e-05,friedman_mse,2,300,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
4,0.076972,0.031332,0.000637,0.000276,friedman_mse,3,50,"{'criterion': 'friedman_mse', 'max_depth': 3, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0


In [9]:
print(f'Best parameters are: {grid_result.best_params_} with score of {grid_result.best_score_}')

Best parameters are: {'criterion': 'friedman_mse', 'max_depth': 2, 'n_estimators': 50} with score of 0.9523809523809523


In [10]:
grid_cv_results[grid_cv_results['mean_test_score'] == grid_result.best_score_].sort_values(by = 'mean_fit_time').head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.056578,0.003789,0.000428,6.4e-05,friedman_mse,2,50,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
4,0.076972,0.031332,0.000637,0.000276,friedman_mse,3,50,"{'criterion': 'friedman_mse', 'max_depth': 3, ...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0
25,0.087164,0.001914,0.000404,9e-06,mse,5,100,"{'criterion': 'mse', 'max_depth': 5, 'n_estima...",0.916667,0.972222,0.969697,0.952381,0.025817,1,1.0,1.0,1.0,1.0,0.0


In [11]:
grid_cv_results.loc[grid_cv_results['params'] == {'criterion': 'friedman_mse', 'max_depth': 3, 'n_estimators': 100}, 'mean_test_score'] == grid_result.best_score_

5    True
Name: mean_test_score, dtype: bool

In [12]:
clf_best = GradientBoostingClassifier(criterion = 'friedman_mse', max_depth = 2, n_estimators = 50)
clf_best.fit(x_train, y_train)
y_pred_new = clf_best.predict(x_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_pred_new))

Accuracy: 0.9777777777777777


In [13]:
clf_best2 = GradientBoostingClassifier(criterion = 'mse', max_depth = 2, n_estimators = 50)
clf_best2.fit(x_train, y_train)
y_pred_new2 = clf_best2.predict(x_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_pred_new2))

Accuracy: 0.9777777777777777
