Hyperparameter tuning

https://www.codexa.net/hyperparameter-tuning-python/

In [1]:
# 基本ライブラリ
import pandas as pd
 
# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
 
# XGBoost
import xgboost as xgb
 
# Matplotlibのインライン表示
%matplotlib inline

In [6]:
# Load the preprocessed dataset from a pickle file.
import pickle

# Shared random seed. It makes the splits below reproducible and is the
# `seed` that the model-parameter cells further down refer to.
seed = 42

# Fetch the preprocessed data.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open('data/dataset/pre/pre_data.pickle','rb') as f:
    pre_data = pickle.load(f)

# Missing-value handling: fill NaNs with the per-column mean.
# NOTE(review): the means are computed on the full dataset before the split,
# so held-out rows influence the imputed training values. Consider imputing
# after the split using training statistics only.
pre_data=pre_data.fillna(pre_data.mean())

# Target variable and explanatory variables.
y=pre_data['LoanStatus']
X=pre_data.drop(columns='LoanStatus')

# Split into train / test (80/20), then carve a validation split (20%) out of
# the training part. `train_test_split` is imported in the first cell.
# A fixed random_state keeps every score in this notebook reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=seed)
train_X, valid_X, train_y, valid_y = train_test_split(
    train_X, train_y, test_size=0.2, shuffle=True, random_state=seed)

In [21]:

# Fixed hyperparameters for the baseline model.
# NOTE(review): `metric` is not a documented XGBoost parameter name; the
# evaluation metric actually used is the `eval_metric` passed to fit().
# Confirm whether this key is needed.
params = dict(
    metric='error',
    objective='binary:logistic',
    n_estimators=50000,
    booster='gbtree',
    learning_rate=0.01,
    min_child_weight=1,
    max_depth=5,
    random_state=seed,
    colsample_bytree=1,
    subsample=1,
)

ベースラインのモデル訓練

In [44]:
# Train the baseline classifier with the fixed `params`.
cls = xgb.XGBClassifier()
cls.set_params(**params)
# Early stopping monitors the validation split, so the test split stays
# untouched for the final evaluation.
cls.fit(train_X,
        train_y,
        early_stopping_rounds=50,
        eval_set=[(valid_X, valid_y)],
        eval_metric='error',
        verbose=1)

### GridSearch

In [69]:
# Grid of candidate values. Single-element lists are fixed settings; the
# two-element lists are the dimensions actually searched (2*2*2*2 = 16 combos).
cv_params = {'metric':['error'],
             'objective':['binary:logistic'],
             'n_estimators':[5],
             'booster': ['gbtree'],
             'learning_rate':[0.01],
             'min_child_weight':[1,5],
             'max_depth':[1,3],
             'colsample_bytree':[0.5,1.0],
             'subsample':[0.5,1.0]
            }

cls = xgb.XGBClassifier()
cls_grid = GridSearchCV(cls, cv_params,cv=KFold(2), scoring='accuracy')
# Early stopping monitors the validation split rather than the test split,
# so the test accuracy reported afterwards is not influenced by the search.
cls_grid.fit(train_X,
             train_y,
             early_stopping_rounds=50,
             eval_set=[(valid_X, valid_y)],
             eval_metric='error',
             verbose=0)

In [49]:
# Best parameter combination and its mean cross-validated accuracy.
best_grid_params = cls_grid.best_params_
best_grid_score = cls_grid.best_score_
print(best_grid_params)
print(best_grid_score)

{'booster': 'gbtree', 'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'metric': 'error', 'min_child_weight': 1, 'n_estimators': 5, 'objective': 'binary:logistic', 'subsample': 0.5}
0.9177818279675115


In [52]:
# Accuracy of the best grid-search model on the test split.
best_grid_model = cls_grid.best_estimator_
pred = best_grid_model.predict(test_X)
grid_score = accuracy_score(y_true=test_y, y_pred=pred)
grid_score

0.9175473618287145

In [54]:
# Confusion matrix of the grid-search predictions on the test split.
confusion_matrix(y_true=test_y, y_pred=pred)

array([[39464,  1933],
       [ 2280,  7419]], dtype=int64)

### ランダムサーチ（RandomizedSearchCV）


In [45]:
# Candidate values for the randomized search. Single-element lists are fixed
# settings; the ten-element lists are sampled from.
# `booster` is the XGBoost parameter name, matching the other cells.
cv_params = {'metric':['error'],
             'objective':['binary:logistic'],
             'n_estimators':[50000],
             'random_state':[seed],
             'booster': ['gbtree'],
             'learning_rate':[0.01],
             'min_child_weight':list(range(1, 11)),
             'max_depth':list(range(1, 11)),
             'colsample_bytree':[i / 10 for i in range(1, 11)],
             'subsample':[i / 10 for i in range(1, 11)],
            }

cls = xgb.XGBClassifier()
cls_rdn = RandomizedSearchCV(cls,
                             cv_params,
                             cv=KFold(2),
                             random_state=seed,
                             n_iter=30,
                             scoring='accuracy')
# Early stopping monitors the validation split rather than the test split,
# so the test accuracy reported afterwards is not influenced by the search.
cls_rdn.fit(train_X,
            train_y,
            early_stopping_rounds=50,
            eval_set=[(valid_X, valid_y)],
            eval_metric='error',
            verbose=0)

In [None]:
# Accuracy of the best randomized-search model on the test split.
# The split variables in this notebook are named test_X / test_y.
pred_3 = cls_rdn.best_estimator_.predict(test_X)
rdn_score = accuracy_score(test_y, pred_3)
rdn_score

### ベイズ最適化（BayesianOptimization）

In [39]:
from bayes_opt import BayesianOptimization

In [43]:
def xgb_evaluate(min_child_weight, subsample, colsample_bytree, max_depth):
    """Objective function for the Bayesian optimizer.

    Trains an XGBClassifier on the training split with the proposed
    hyperparameters and returns its accuracy on the validation split.
    The test split is deliberately not used here, so it stays unseen
    until the final evaluation.

    Args:
        min_child_weight: proposed value; truncated to int before use.
        subsample: proposed row subsampling ratio.
        colsample_bytree: proposed column subsampling ratio.
        max_depth: proposed tree depth; truncated to int before use.

    Returns:
        Accuracy of the fitted model on (valid_X, valid_y).
    """
    params = {'metric': 'error',
              'objective':'binary:logistic',
              'n_estimators':50000,
              'random_state':42,
              'booster':'gbtree',
              'learning_rate':0.01,
              # The optimizer proposes floats; these two are used as integers.
              'min_child_weight': int(min_child_weight),
              'max_depth': int(max_depth),
              'colsample_bytree': colsample_bytree,
              'subsample': subsample,
             }

    cls = xgb.XGBClassifier()
    cls.set_params(**params)
    cls.fit(train_X,
            train_y,
            early_stopping_rounds=50,
            eval_set=[(valid_X, valid_y)],
            eval_metric='error',
            verbose=0)

    pred = cls.predict(valid_X)
    score = accuracy_score(valid_y, pred)
    return score

ベイズ最適化ではxgb_evaluate関数が戻すscore（正解率の値）の最大化を目的としてハイパーパラメータの値を探索

In [41]:
# Search range (lower, upper) for each hyperparameter passed to xgb_evaluate.
search_bounds = {
    'min_child_weight': (1, 20),
    'subsample': (0.1, 1),
    'colsample_bytree': (0.1, 1),
    'max_depth': (1, 50),
}
xgb_bo = BayesianOptimization(xgb_evaluate, search_bounds, random_state=10)

maximizeメソッドを実行してベイズ最適化によるハイパーパラメータの検証を行う。  
ベイズ最適化も繰り返しの処理による最適化の手法。  
繰り返し回数はn_iter引数で指定することが可能。  
50回の繰り返し処理で実行する。  
init_points引数はランダムな探索を何回行うのかを指定する引数

In [46]:
# Run the optimization: 15 initial random exploration points, then 50
# optimization iterations. `acq='ei'` selects the acquisition function.
# NOTE(review): newer bayes_opt releases may no longer accept `acq` in
# maximize() — confirm against the installed version.
xgb_bo.maximize(init_points=15, n_iter=50, acq='ei')

max属性は最も評価スコアが高かった結果を取得することが可能  
ベイズ最適化により得られた最もスコアの良かったハイパーパラメータの値を変数optimized_paramsへ格納

In [None]:
# Best hyperparameters found by the optimizer (copied so the casts below do
# not touch the optimizer's own result).
optimized_params = dict(xgb_bo.max['params'])
# xgb_evaluate truncates both of these to int before training, so apply the
# same cast here; otherwise the final model would be trained with different
# values than the ones that produced the best score.
optimized_params['min_child_weight'] = int(optimized_params['min_child_weight'])
optimized_params['max_depth'] = int(optimized_params['max_depth'])
optimized_params

metricやobjectiveは固定されたハイパーパラメータですので、変数fixed_paramsへ格納

In [None]:
# Hyperparameters that are held constant for the final model.
fixed_params = dict(
    metric='error',
    objective='binary:logistic',
    n_estimators=50000,
    random_state=seed,
    booster='gbtree',
    learning_rate=0.01,
)

新たにXGBClassifierのインスタンスを生成して、ベイズ最適化で得られたハイパーパラメータの値を使いモデル訓練

In [None]:
# Train the final model with the fixed settings plus the optimized values.
cls = xgb.XGBClassifier()
cls.set_params(**fixed_params, **optimized_params)
# The split variables in this notebook are train_X / train_y (training) and
# valid_X / valid_y (validation). Early stopping monitors the validation
# split so the test split remains unseen until evaluation.
cls.fit(train_X,
        train_y,
        early_stopping_rounds=50,
        eval_set=[(valid_X, valid_y)],
        eval_metric='error',
        verbose=0)

テストデータの特徴量から推測結果を算出して、正解率と混同行列を出力

In [None]:
# Accuracy of the Bayesian-optimized model on the test split.
# The split variables in this notebook are named test_X / test_y.
# NOTE(review): the result is stored in `baseline` although it is the tuned
# model's score; the name is kept unchanged here.
pred_4 = cls.predict(test_X)
baseline = accuracy_score(test_y, pred_4)
baseline

In [None]:
# Confusion matrix of the final model's predictions on the test split
# (the notebook's test labels are named test_y).
confusion_matrix(test_y, pred_4)