# Stacking

In [33]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Data preparation: load the Titanic dataset and discard every row that has a missing value
df = sns.load_dataset('titanic').dropna()
# Build the target y and the feature matrix X.
# 'alive' is excluded from the features together with the target column
# (presumably because it encodes the same outcome as 'survived' -- confirm against the dataset).
y = df['survived']
X = df.drop(columns=['survived', 'alive'])
# Encode every feature column as ordinal codes;
# set_output makes transform return a DataFrame instead of a NumPy array
oe = OrdinalEncoder().set_output(transform='pandas')
X = oe.fit_transform(X)
# Hold out 30% of the rows as test data (70:30 train/test split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [30]:
class StackingClassifierCV():
    """Two-layer stacking classifier trained on out-of-fold predictions.

    Each first-layer model produces positive-class probabilities for the
    validation part of every CV fold; those probabilities become the input
    features of the second-layer ``final_estimator``.

    Parameters
    ----------
    estimators : list of (name, model) pairs
        First-layer models. Each model must implement ``fit(X, y)`` and
        ``predict_proba(X)`` returning an array whose column 1 is the
        positive-class probability. The models are fitted in place.
    final_estimator : object
        Second-layer model implementing ``fit`` and ``predict_proba``.
    cv : object
        Splitter exposing ``split(X, y)`` that yields
        ``(train_idx, val_idx)`` positional index pairs.
    """

    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv

    def fit(self, X, y):
        """Fit the first-layer models and then the final estimator.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (rows are selected with ``.iloc``).
        y : pandas.Series
            Training labels (rows are selected with ``.iloc``).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``estimators`` is empty.
        """
        if not self.estimators:
            raise ValueError("estimators must contain at least one (name, model) pair")

        # Materialize the folds once and reuse them for every model: a
        # splitter that shuffles without a fixed seed yields different folds
        # on each split() call, which would leave the rows of the stacked
        # features (and the labels) referring to different samples.
        # y is forwarded so that stratified splitters can be used as well.
        folds = list(self.cv.split(X, y))

        # Labels in the same (fold-by-fold) order as the stacked predictions.
        new_y = np.concatenate([np.asarray(y.iloc[val_idx]) for _, val_idx in folds])

        # First layer: collect out-of-fold positive-class probabilities.
        pred_features = {}
        for model_name, model in self.estimators:
            fold_preds = []
            for train_idx, val_idx in folds:
                model.fit(X.iloc[train_idx], y.iloc[train_idx])
                fold_preds.append(model.predict_proba(X.iloc[val_idx])[:, 1])
            pred_features[model_name] = np.concatenate(fold_preds)
            # Refit on the full training data; this is the model that
            # predict_proba() uses for unseen samples.
            model.fit(X, y)

        # Second layer: learn from the first-layer predictions.
        new_X = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_X, new_y)
        return self

    def predict_proba(self, X):
        """Return the final estimator's class probabilities for ``X``.

        The fitted first-layer models generate one feature column each
        (their positive-class probability), in the same order as in ``fit``.
        """
        # First layer: generate the features (predictions).
        pred_features = {
            model_name: model.predict_proba(X)[:, 1]
            for model_name, model in self.estimators
        }
        new_X = pd.DataFrame(pred_features)
        return self.final_estimator.predict_proba(new_X)

In [32]:
# 5-fold shuffled cross-validation with a fixed seed, used to produce the
# out-of-fold predictions that train the second layer
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# Second-layer model
final_estimator = LogisticRegression()
# First-layer models: a random forest and a k-nearest-neighbours classifier
base_estimators = [
    ('rf', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
]
stacking_cv = StackingClassifierCV(
    estimators=base_estimators,
    final_estimator=final_estimator,
    cv=cv,
)
# Train on the training split, then predict class probabilities for the test split
stacking_cv.fit(X_train, y_train)
y_pred_stacking_cv = stacking_cv.predict_proba(X_test)

In [34]:
# ROC AUC on the test split, scored with the positive-class (column 1) probabilities
stacking_cv_auc = roc_auc_score(y_test, y_pred_stacking_cv[:, 1])
print(f"stackingCV AUC: {stacking_cv_auc}")

stackingCV AUC: 0.8181089743589745
