# パラメータ・チューニング（グリッドサーチ）

ここではグリッドサーチの実装方法について学びます。<br>
まずは分類用のサンプルデータを読み込みます。

In [1]:
# Breast cancer dataset for binary classification
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset and split it into features (X) and target (y)
dataset = load_breast_cancer()
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.DataFrame({'y': dataset.target})
# Preview the first rows with the features and the target side by side
pd.concat([X, y], axis=1).head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


分類器としてロジスティック回帰と勾配ブースティングを設定しておきます。<br><b>推定器の略称として"est"、次元圧縮に"pca"を使っていますが、本記号がグリッドサーチ時の設定にも使われます</b>。

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Pipeline setup: standardize -> PCA -> logistic regression.
# The step names ('scl', 'pca', 'est') are the prefixes used when naming
# grid-search parameters (e.g. 'pca__n_components', 'est__C').
# solver='liblinear' is set explicitly because it supports both the 'l1' and
# 'l2' penalties searched later; the default solver in scikit-learn >= 0.22
# ('lbfgs') does not accept 'l1'.
pipe_logistic = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(random_state=1)),
    ('est', LogisticRegression(solver='liblinear', random_state=1)),
])

グリッドサーチによる探索条件の設定は、<br><b>上記で指定した"pca"や"est"といったステップ名と、各アルゴリズムの持つパラメータ名を、アンダースコア2つ（"__"）で結合させて指定します。</b><br>下記例の設定内容は以下の通り計24通りです。
- 主成分圧縮
 - 抽出主成分数を5,7,9の3通り
- ロジスティック回帰
 - 正則化パラメータは0.1から100の4通り、正則化はL1とL2の2通り

In [4]:
# Parameter grid: each key is '<pipeline step name>__<parameter name>'.
# 3 PCA settings x 4 values of C x 2 penalties = 24 combinations.
param_grid_logistic = dict(
    pca__n_components=[5, 7, 9],
    est__C=[0.1, 1.0, 10.0, 100.0],
    est__penalty=['l1', 'l2'],
)

設定は以上です。ハイパーパラメータの異なるモデルの構築と評価はGridSearchCVで行うことができます。<br>gsをfitした時点で、各パラメータのモデルの構築と評価を終えています。

In [6]:
# Run the grid search: every combination in param_grid_logistic is fitted on
# the pipeline and scored with 3-fold cross-validation using the F1 score.
print('探索空間:%s' %param_grid_logistic)
gs = GridSearchCV(estimator=pipe_logistic,
                  param_grid=param_grid_logistic,
                  scoring='f1',
                  cv=3,
                  return_train_score=False)
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray, and ravel() flattens the single-column target to a 1-D array.
gs = gs.fit(X, y.values.ravel())

# Report the best cross-validated score and the parameters that produced it
print('Best Score:', gs.best_score_)
print('Best Params', gs.best_params_)

探索空間:{'pca__n_components': [5, 7, 9], 'est__C': [0.1, 1.0, 10.0, 100.0], 'est__penalty': ['l1', 'l2']}


  import sys


Best Score: 0.9833314910965807
Best Params {'est__C': 0.1, 'est__penalty': 'l2', 'pca__n_components': 9}


ベストモデルで予測をしたい場合は以下です。

In [5]:
# Predict class probabilities for X with the best model found by the grid search
gs.predict_proba(X)

array([[9.99990949e-01, 9.05116719e-06],
       [9.95490682e-01, 4.50931764e-03],
       [9.99899156e-01, 1.00844229e-04],
       ...,
       [9.51518892e-01, 4.84811076e-02],
       [9.99999833e-01, 1.66748227e-07],
       [4.85054239e-04, 9.99514946e-01]])

グリッドサーチの探索結果を閲覧したい場合は以下です。

In [7]:
# Show the grid search results (one row per parameter combination) as a DataFrame
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est__C,param_est__penalty,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0,0.0,0.0,0.0,0.1,l1,5,"{'est__C': 0.1, 'est__penalty': 'l1', 'pca__n_...",0.966387,0.979424,0.983051,0.976275,0.007156,5
1,0.0,0.0,0.005209,0.007366,0.1,l1,7,"{'est__C': 0.1, 'est__penalty': 'l1', 'pca__n_...",0.966387,0.979424,0.983051,0.976275,0.007156,5
2,0.005207,0.007363,0.0,0.0,0.1,l1,9,"{'est__C': 0.1, 'est__penalty': 'l1', 'pca__n_...",0.966387,0.979424,0.983051,0.976275,0.007156,5
3,0.0,0.0,0.0,0.0,0.1,l2,5,"{'est__C': 0.1, 'est__penalty': 'l2', 'pca__n_...",0.979253,0.979424,0.978723,0.979134,0.000298,4
4,0.0,0.0,0.005207,0.007363,0.1,l2,7,"{'est__C': 0.1, 'est__penalty': 'l2', 'pca__n_...",0.975,0.979424,0.974359,0.976264,0.002252,8
5,0.0,0.0,0.005208,0.007365,0.1,l2,9,"{'est__C': 0.1, 'est__penalty': 'l2', 'pca__n_...",0.983471,0.983471,0.983051,0.983331,0.000198,1
6,0.005207,0.007364,0.0,0.0,1.0,l1,5,"{'est__C': 1.0, 'est__penalty': 'l1', 'pca__n_...",0.97479,0.979253,0.974359,0.976137,0.002213,13
7,0.005216,0.007377,0.0,0.0,1.0,l1,7,"{'est__C': 1.0, 'est__penalty': 'l1', 'pca__n_...",0.97479,0.971193,0.974359,0.973446,0.001604,19
8,0.0,0.0,0.0,0.0,1.0,l1,9,"{'est__C': 1.0, 'est__penalty': 'l1', 'pca__n_...",0.970464,0.97541,0.978723,0.974859,0.003393,14
9,0.005207,0.007363,0.0,0.0,1.0,l2,5,"{'est__C': 1.0, 'est__penalty': 'l2', 'pca__n_...",0.983333,0.983471,0.974359,0.980398,0.00426,3


以上でグリッドサーチの実装方法は終了です。