## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV

diabetes = load_diabetes()
print(dir(diabetes))

['DESCR', 'data', 'data_filename', 'feature_names', 'target', 'target_filename']


In [4]:
import numpy as np
print(diabetes.data.shape)
print(diabetes.feature_names)
print(np.unique(diabetes.target))

(442, 10)
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
[ 25.  31.  37.  39.  40.  42.  43.  44.  45.  47.  48.  49.  50.  51.
  52.  53.  54.  55.  57.  58.  59.  60.  61.  63.  64.  65.  66.  67.
  68.  69.  70.  71.  72.  73.  74.  75.  77.  78.  79.  80.  81.  83.
  84.  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.
  98.  99. 100. 101. 102. 103. 104. 107. 108. 109. 110. 111. 113. 114.
 115. 116. 118. 120. 121. 122. 123. 124. 125. 126. 127. 128. 129. 131.
 132. 134. 135. 136. 137. 138. 139. 140. 141. 142. 143. 144. 145. 146.
 147. 148. 150. 151. 152. 153. 154. 155. 156. 158. 160. 161. 162. 163.
 164. 166. 167. 168. 170. 171. 172. 173. 174. 175. 177. 178. 179. 180.
 181. 182. 183. 184. 185. 186. 187. 189. 190. 191. 192. 195. 196. 197.
 198. 199. 200. 201. 202. 206. 208. 209. 210. 212. 214. 215. 216. 217.
 219. 220. 221. 222. 225. 229. 230. 232. 233. 235. 236. 237. 241. 242.
 243. 244. 245. 246. 248. 249. 252. 253. 257. 258. 259. 261. 262. 263.
 26

In [5]:
import pandas as pd

pd.DataFrame(diabetes.data).head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.04118,-0.096346
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062913,-0.038357
7,0.063504,0.05068,-0.001895,0.06663,0.09062,0.108914,0.022869,0.017703,-0.035817,0.003064
8,0.041708,0.05068,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014956,0.011349
9,-0.0709,-0.044642,0.039062,-0.033214,-0.012577,-0.034508,-0.024993,-0.002592,0.067736,-0.013504


In [29]:
from sklearn import metrics

train_x, test_x, train_y, test_y = train_test_split(diabetes.data, diabetes.target, random_state=221, test_size = 0.2)
base_m = GradientBoostingRegressor(random_state=221)

base_m.fit(train_x, train_y)
pred_y = base_m.predict(test_x)
print(base_m.get_params)
print(metrics.r2_score(test_y, pred_y))

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=221, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)>
0.4570882213723534


In [31]:
params_test1 = {'n_estimators':range(90,121),
                'learning_rate':np.linspace(0.01,0.1,11),
                'max_features':['auto', 'sqrt', 'log2'],
                'max_depth':range(2,10),
                'min_samples_leaf':range(1,30),
                'min_samples_split':range(2,60),
                'subsample':np.linspace(0.7,1,11)}

rsearch = RandomizedSearchCV(base_m, params_test1, n_iter = 300, n_jobs = -1, random_state = 221)
rsearch_result1 = rsearch.fit(train_x, train_y)

print('Best Accuracy: %f using %s' % (rsearch_result1.best_score_, rsearch_result1.best_params_))

Best Accuracy: 0.435222 using {'subsample': 1.0, 'n_estimators': 112, 'min_samples_split': 57, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'max_depth': 2, 'learning_rate': 0.082}


In [37]:
bestparams_m = rsearch_result1.best_estimator_
bestparams_m.fit(train_x, train_y)
pred_y = bestparams_m.predict(test_x)
print(bestparams_m.get_params)
print(metrics.r2_score(test_y, pred_y))

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.082, loss='ls',
                          max_depth=2, max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=16, min_samples_split=57,
                          min_weight_fraction_leaf=0.0, n_estimators=112,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=221, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)>
0.5375838603767542
