# Stacking

http://segafreder.hatenablog.com/entry/2016/05/26/232728  
https://github.com/ikedaosushi/python-sandbox/blob/master/ml/stacking/Stacking.ipynb

In [28]:
# !jupyter nbconvert --to python Stacking.ipynb

[NbConvertApp] Converting notebook Stacking.ipynb to python
[NbConvertApp] Writing 6425 bytes to Stacking.py


In [1]:
# Ignore all warnings for the rest of the session.
# Note: this also hides deprecation and convergence warnings raised by the libraries.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

# from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier

from sklearn.manifold import TSNE

from sklearn.utils.validation import check_random_state

# from keras.layers import Dense
# from keras.layers import Dropout
# from keras.models import Sequential

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

# Magic command that renders matplotlib figures inline in the notebook
%matplotlib inline

In [3]:
def importances(model, feature_names=None, top_n=20):
    """Plot the largest feature importances of a fitted model.

    Draws a horizontal bar chart of the ``top_n`` highest values found in
    ``model.feature_importances_``, in ascending order of importance, and
    displays it with ``plt.show()``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence, optional
        Tick labels aligned with ``model.feature_importances_``. When
        omitted, the notebook-level ``feature_X`` is used if it exists
        (the original behaviour), then ``model.feature_names_in_``, and
        finally the positional feature indices.
    top_n : int, default 20
        Number of features to display.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    scores = np.asarray(model.feature_importances_)

    # Resolve the tick labels. The original read a global `feature_X`
    # unconditionally, which raises NameError when that name was never defined.
    if feature_names is None:
        feature_names = globals().get('feature_X')
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = np.arange(len(scores))
    labels = np.asarray(feature_names)

    # Same selection as argsort(scores)[-top_n:], but also well-defined for top_n == 0.
    top = np.argsort(scores)[::-1][:top_n][::-1]
    positions = range(len(top))

    plt.barh(positions, scores[top], align='center')
    plt.yticks(positions, labels[top])
    plt.title('decision tree feature importance')
    plt.xlabel('feature importance')
    plt.ylabel('variable')
    plt.show()

In [4]:
# Load the target variable (y) and the explanatory variables (X).
# NOTE: read_pickle unpickles arbitrary objects; only load files from a trusted source.
y=pd.read_pickle('../data/feature/pre_feature_data_y.pickle')
X=pd.read_pickle('../data/feature/pre_feature_data_X.pickle')
# submit_X=pd.read_pickle('../data/feature/test_feature_data_X.pickle')

# Split into training and test data (80% / 20%).
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts; 1 matches the seed used for the meta model.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

In [12]:
# Load the base models of the stack from their pickles in ../data/models/.
# Only models 2 (LGBM) and 5 (RFM) are enabled; the commented-out lines are
# other candidates that can be re-enabled by uncommenting them here and in the
# prediction cells.
# NOTE: read_pickle unpickles arbitrary objects; only load files from a trusted source.
# base_model_1 = pd.read_pickle("../data/models/model_GS_XGB.pickle")
base_model_2 = pd.read_pickle("../data/models/model_GS_LGBM.pickle")
# base_model_3 = pd.read_pickle("../data/models/model_GS_ExTree.pickle")
# base_model_4 = pd.read_pickle("../data/models/model_GS_GB.pickle")
base_model_5 = pd.read_pickle("../data/models/model_GS_RFM.pickle")
# base_model_6 = pd.read_pickle("../data/models/model_GS_Ridge.pickle")
# base_model_7 = pd.read_pickle("../data/models/model_GS_KN.pickle")
# base_model_8 = pd.read_pickle("../data/models/model_GS_TSNE.pickle")
# base_model_9 = NN()

In [13]:
# base_model_2= LGBMClassifier()

# base_model_5= RandomForestClassifier()

In [14]:
# Base-model class probabilities on the training split; these become the
# training features of the meta model.
# NOTE(review): the base models are loaded already fitted. If they were trained
# on rows that are also in train_X, these are in-sample predictions and the
# meta model will overfit to them -- confirm, or switch to out-of-fold
# predictions (e.g. sklearn.model_selection.cross_val_predict).
# base_pred_1 = base_model_1.predict_proba(train_X)
base_pred_2 = base_model_2.predict_proba(train_X)
# base_pred_3 = base_model_3.predict_proba(train_X)
# base_pred_4 = base_model_4.predict_proba(train_X)
base_pred_5 = base_model_5.predict_proba(train_X)
# base_pred_6 = base_model_6.predict_proba(train_X)
# base_pred_7 = base_model_7.predict_proba(train_X)
# base_pred_8 = base_model_8.predict_proba(train_X)
# base_pred_9 = base_model_9.predict_proba(train_X)

In [34]:
# Base-model class probabilities on the test split; these become the
# features the meta model is evaluated on.
# test_pred_1 = base_model_1.predict_proba(test_X)
test_pred_2 = base_model_2.predict_proba(test_X)
# test_pred_3 = base_model_3.predict_proba(test_X)
# test_pred_4 = base_model_4.predict_proba(test_X)
test_pred_5 = base_model_5.predict_proba(test_X)
# test_pred_6 = base_model_6.predict_proba(test_X)
# test_pred_7 = base_model_7.predict_proba(test_X)
# test_pred_8 = base_model_8.predict_proba(test_X)
# test_pred_9 = base_model_9.predict_proba(test_X)

In [35]:
# Meta-model training features: the probability columns of every active base
# model, placed side by side (model 2 first, then model 5).
stack_train = np.concatenate([base_pred_2, base_pred_5], axis=1)

# Meta-model test features, assembled in the same column order.
stack_test = np.concatenate([test_pred_2, test_pred_5], axis=1)

In [36]:
# Inspect the stacked test features: the probability columns of base model 2
# followed by those of base model 5.
stack_test

array([[9.99961715e-01, 3.82852985e-05, 0.00000000e+00, 1.00000000e+00],
       [9.95420346e-01, 4.57965445e-03, 0.00000000e+00, 1.00000000e+00],
       [9.97855694e-01, 2.14430625e-03, 0.00000000e+00, 1.00000000e+00],
       ...,
       [9.75688172e-01, 2.43118285e-02, 0.00000000e+00, 1.00000000e+00],
       [9.99793171e-01, 2.06829327e-04, 0.00000000e+00, 1.00000000e+00],
       [9.99419937e-01, 5.80063407e-04, 0.00000000e+00, 1.00000000e+00]])

In [20]:
# NOTE(review): not referenced by any of the visible cells; `seed` (defined in
# the next cell) carries the same value and is the one actually used.
random_state = 1

In [21]:
"""Grid search over the hyperparameters max_depth (tree depth) and
n_estimators (number of decision trees).
Prepare the parameter values used by the grid search.
"""
# Seed passed to both the meta model and the KFold splitter.
seed=1
# Candidate grid (currently disabled):
# params = {
#     'max_depth': [10, 20, 30, 50],
#     'n_estimators': [10, 100, 1000]}

In [22]:
# Instantiate the meta model (LightGBM classifier with a fixed seed).
# Nothing is trained here; fitting happens through the grid search below.
meta_model = LGBMClassifier(random_state=seed)

In [23]:
# Hyperparameter candidates for the meta model:
# 4 max_depth values x 3 n_estimators values = 12 candidates.
# GridSearchCV requires `param_grid`; constructing it without one raises a
# TypeError, so the grid is defined and passed explicitly.
params = {
    'max_depth': [10, 20, 30, 50],
    'n_estimators': [10, 100, 1000]}

# Exhaustive search with shuffled 5-fold cross-validation, scored by binary F1.
grid = GridSearchCV(estimator=meta_model,
    param_grid=params,
    n_jobs=1,
    cv=KFold(5,shuffle=True, random_state=seed),
    scoring='f1',
    verbose=3,
    return_train_score=True)

In [24]:
# Run the grid search on the stacked training features.
# Earlier variant that fitted the meta model directly:
# meta_model.fit(stacked_predictions, valid_y)
grid.fit(stack_train, train_y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ..................max_depth=10, n_estimators=10; total time=   0.7s
[CV 2/5] END ..................max_depth=10, n_estimators=10; total time=   0.7s
[CV 3/5] END ..................max_depth=10, n_estimators=10; total time=   0.7s
[CV 4/5] END ..................max_depth=10, n_estimators=10; total time=   0.6s
[CV 5/5] END ..................max_depth=10, n_estimators=10; total time=   0.6s
[CV 1/5] END .................max_depth=10, n_estimators=100; total time=   6.6s
[CV 2/5] END .................max_depth=10, n_estimators=100; total time=   6.6s
[CV 3/5] END .................max_depth=10, n_estimators=100; total time=   6.6s
[CV 4/5] END .................max_depth=10, n_estimators=100; total time=   6.7s
[CV 5/5] END .................max_depth=10, n_estimators=100; total time=   6.7s
[CV 1/5] END ................max_depth=10, n_estimators=1000; total time= 1.1min
[CV 2/5] END ................max_depth=10, n_est

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=1,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
           

In [25]:
# Show the estimator, with its hyperparameters, that the grid search selected.
print(grid.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=16, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [32]:
# Use the best estimator found by the grid search as the final meta model
meta_model = grid.best_estimator_

In [27]:
# Final result: meta-model predictions for the test split, computed from the
# stacked base-model probabilities.
pred = meta_model.predict(stack_test)

In [28]:
# f1_score expects (y_true, y_pred): ground-truth labels first, predictions second.
# (With the default binary averaging the value is the same either way, but the
# swapped order gives wrong results as soon as another `average` or `labels` is used.)
print ("f1_score of meta model: {:.4f}".format(f1_score(test_y, pred)) )

f1_score of meta model: 0.3074


In [29]:
# Persist the selected meta model to disk as a pickle
with open('../data/models/model_Stacking.pickle', 'wb') as model_file:
    pickle.dump(meta_model, model_file)