# ある特定の範囲のなかで最適なパラメータを発見したい

In [26]:
import numpy as np
from sklearn import  linear_model,datasets
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score




iris=datasets.load_iris()
features=iris.data
target=iris.target
f_train,f_test,t_train,t_test=train_test_split(features,target,random_state=1)

logit=linear_model.LogisticRegression()

penalty=['l1','l2']
C=np.logspace(0,4,10)

hypers=dict(C=C,penalty=penalty)

# verboseを1~3に設定すると、実行経過のログを出力する
gridsearch=GridSearchCV(logit,hypers,cv=5,verbose=0)

best_model=gridsearch.fit(f_train,t_train)
t_pred=best_model.predict(f_test)
accuracy_score(t_pred,t_test)

0.9210526315789473

In [31]:
print(f'best penalty is {best_model.best_estimator_.get_params()["penalty"]}')
print(f'best C is {best_model.best_estimator_.get_params()["C"]}')

best penalty is l2
best C is 2.7825594022071245


# 比較的安価な方法でパラメータサーチを行いたい

In [43]:
from scipy.stats import  uniform
from sklearn import  linear_model,datasets
from sklearn.model_selection import RandomizedSearchCV

iris=datasets.load_iris()
features=iris.data
target=iris.target

f_train,f_test,t_train,t_test=train_test_split(features,target,random_state=1)

logit=linear_model.LogisticRegression()

penalty=['l1','l2']

# パラメータ候補をランダムに生成するジェネレータ
C=uniform(loc=0,scale=4)

hypers=dict(C=C,penalty=penalty)

random_search=RandomizedSearchCV(
logit,
hypers,
random_state=1,
n_iter=100,
cv=5,
verbose=0)

best_model=random_search.fit(f_train,t_train)

In [44]:
t_pred=best_model.predict(f_test)
accuracy_score(t_pred,t_test)

0.9210526315789473

精度はGridSearchとほぼ変わらない

In [46]:
print(f'best penalty is {best_model.best_estimator_.get_params()["penalty"]}')

print(f'best C is {best_model.best_estimator_.get_params()["C"]}')

best penalty is l2
best C is 3.730229437354635


# 最適なアルゴリズムとパラメータを見つけたい

In [65]:
import numpy as np
from sklearn import  datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline

np.random.seed(0)

iris=datasets.load_iris()
features=iris.data
target=iris.target

f_train,f_test,t_train,t_test=train_test_split(features,target,random_state=1)
# GridSearchに導入する際の初期値
pipe=Pipeline([('classifier',RandomForestClassifier())])

# 最近のsklearnの場合、アルゴリズムもパラメータとして指定可能
# パラメータはclassifier__[hyperparameter name]の形で指定
search_space=[
    dict(
    classifier=[LogisticRegression()],
    classifier__penalty=['l1','l2'],
    classifier__C=np.logspace(0,4,10)
    ),
    dict(
    classifier=[RandomForestClassifier()],
    classifier__n_estimators=[10,100,1000],
    classifier__max_features=[1,2,3]
    )
]

gridsearch=GridSearchCV(pipe,search_space,cv=5,verbose=0)
best_model=gridsearch.fit(f_train,t_train)

In [67]:
from sklearn.metrics import  classification_report

print(classification_report(best_model.predict(f_test),t_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.81      1.00      0.90        13
          2       1.00      0.75      0.86        12

avg / total       0.94      0.92      0.92        38



精度が上２つの例よりも上昇

In [69]:
print(f'best algorisim is {best_model.best_estimator_.get_params()["classifier"]}')

best algorisim is LogisticRegression(C=2.7825594022071245, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


# 最適な前処理方法を見つけたい

In [87]:
from sklearn.pipeline import  Pipeline,FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

# 前処理方法をpipelineに乗せるために結合
preprocessing=FeatureUnion([('std',StandardScaler()),('pca',PCA())])

pipe=Pipeline([('preprocess',preprocessing),('classifier',LogisticRegression())])

# 探索するパラメータの範囲
search_space=[
    dict(
        preprocess__pca__n_components=[1,2,3],
        classifier__penalty=['l1','l2'],
        classifier__C=np.logspace(0,4,10)
    )
]

clif=GridSearchCV(pipe,search_space,cv=5,verbose=1)
best_model=clif.fit(f_train,t_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.7s finished


In [85]:
print(
classification_report(best_model.predict(f_test),t_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.81      1.00      0.90        13
          2       1.00      0.75      0.86        12

avg / total       0.94      0.92      0.92        38



In [82]:
best_model.best_estimator_.get_params()['preprocess__pca__n_components']

1

# モデル選定後のMetrixを出力したい

- パラメータサーチのため全データを使ってしまい、Metrixが出力できない問題
- そのため
    -  パラメータ選択のためのcv(inner cv)をGridSearchでおこない
    - Metrix出力のためのcv(outer cv)をcross_val_scoreで行う

In [88]:
# 4のモデルを使う
cross_val_score(clif,features,target).mean()

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.7s finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.6s finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.5s finished


0.9673202614379085

データ量が多いだけ、精度が更に向上した