# Stacking
　スタッキングの実装例.

## Import

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Setting

In [2]:
# Relative path of the directory that holds the input data sets
input_path = "../input_data/"

# Show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

## Read Data and Make Data

In [3]:
# Read the bank training CSV and dummy-encode it in a single expression.
# drop_first=True keeps k-1 dummy columns for a k-level category.
train = pd.get_dummies(
    pd.read_csv(
        input_path + "bank/train.csv",
        quotechar = "\"",
        header = 0,
        sep = ","
    ),
    drop_first = True
)

## Hold Out
　投稿せずに, 精度を確かめるためにホールドアウト法で検証データを作成.

In [4]:
# Hold-out split: 30% of the rows are set aside as validation data.
# The target is passed as a one-column DataFrame; the seed is fixed for reproducibility.
train_x, valid_x, train_y, valid_y = train_test_split(
    train.drop("y", axis = 1),
    train.loc[:, ["y"]],
    random_state = 17,
    test_size = 0.3
)

## 決定木構築

In [5]:
# Single decision tree used as the first base model
tree_model = DecisionTreeClassifier(
    criterion = "gini",             # use "entropy" for the entropy criterion
    splitter = "best",              # use "random" to choose splits at random
    max_depth = 7,                  # maximum depth of the tree
    min_samples_split = 20,         # minimum number of samples required to split a node
    random_state = 17               # fixed seed for reproducibility
)
# Fit on the 1-D target column, the same way the other models in this notebook are
# fitted; this also keeps the fit single-output if extra columns are added to train_y.
tree_model.fit(train_x, train_y["y"])

# In-sample predictions on the building data (column 1 of predict_proba)
pred_train_tree = tree_model.predict_proba(train_x)[:, 1]

　検証データでAUCの確認.

In [6]:
# Score the validation rows with the tree, then display the validation AUC
pred_valid_tree = tree_model.predict_proba(valid_x)[:, 1]
roc_auc_score(y_true = valid_y, y_score = pred_valid_tree)

0.8283425547241918

## ロジスティック回帰構築

In [7]:
# Logistic regression (liblinear solver) used as the second base model;
# fit() returns the estimator itself, so construction and fitting are chained.
Logi_model = LogisticRegression(solver = "liblinear").fit(train_x, train_y["y"])

# In-sample predictions on the building data (column 1 of predict_proba)
pred_train_logi = Logi_model.predict_proba(train_x)[:, 1]

　検証データでAUCの確認.

In [8]:
# Score the validation rows with the logistic regression, then display the validation AUC
pred_valid_logi = Logi_model.predict_proba(valid_x)[:, 1]
roc_auc_score(y_true = valid_y, y_score = pred_valid_logi)

0.8978653340492817

## 比較用：平均
　検証データでAUCの確認.

In [9]:
# Baseline ensemble: unweighted mean of the two base-model predictions on the validation rows
pred_mean = 0.5 * (pred_valid_tree + pred_valid_logi)
roc_auc_score(y_true = valid_y, y_score = pred_mean)

0.901796224660881

## 比較用：メタモデル
　メタモデルはロジスティック回帰を使用. ここでは比較のため, ベースモデルが学習に使った構築データ自身への予測値(インサンプル予測)を, そのままメタモデルの説明変数にしている.

In [10]:
# Meta-model features: one column per base model, built directly from the
# in-sample (building data) and validation predictions
m_train_x = pd.DataFrame(
    {
        "pred_tree": pred_train_tree,
        "pred_logi": pred_train_logi
    }
)

m_valid_x = pd.DataFrame(
    {
        "pred_tree": pred_valid_tree,
        "pred_logi": pred_valid_logi
    }
)

# Fit the meta-model (logistic regression) on the base-model predictions
meta_model_1 = LogisticRegression(solver = "liblinear")
meta_model_1.fit(m_train_x, train_y["y"])

　検証データでAUCの確認.

In [11]:
# Apply the meta-model to the validation features, then display the validation AUC
pred_valid_m = meta_model_1.predict_proba(m_valid_x)[:, 1]
roc_auc_score(y_true = valid_y, y_score = pred_valid_m)

0.8978409660457104

## Stackingの実装

In [12]:
# Number of folds used to produce out-of-fold predictions for the meta-model
K = 5

# Seed NumPy's global generator so the fold assignment is reproducible
np.random.seed(seed = 17)

# Give every building row a random fold id in [0, K). `assign` returns new frames and
# the names are rebound, so the frames produced by the hold-out split are not written
# into in place.
train_x = train_x.assign(
    cv_group = np.random.randint(low = 0, high = K, size = train_x.shape[0]).tolist()
)
# The fold ids are copied onto the target frame by index alignment
train_y = train_y.assign(cv_group = train_x["cv_group"])

# Accumulators: out-of-fold predictions and targets for the building rows, and the
# running sum of every fold model's predictions on the validation rows
pred_train_tree = list()
pred_train_logi = list()
pred_valid_tree = np.zeros(valid_x.shape[0])
pred_valid_logi = np.zeros(valid_x.shape[0])
m_train_y = list()

# Cross-validation
for i in range(K):
    # Rows of fold i are held out; all remaining rows are used for fitting.
    # The helper column is dropped so it is never used as a feature or a target.
    held_out_x = train_x["cv_group"] == i
    held_out_y = train_y["cv_group"] == i
    train_x_tmp = train_x[~held_out_x].drop(columns = "cv_group")
    train_y_tmp = train_y[~held_out_y].drop(columns = "cv_group")
    valid_x_tmp = train_x[held_out_x].drop(columns = "cv_group")
    valid_y_tmp = train_y[held_out_y].drop(columns = "cv_group")

    # Decision tree for this fold
    tree_model_tmp = DecisionTreeClassifier(
        criterion = "gini",             # use "entropy" for the entropy criterion
        splitter = "best",              # use "random" to choose splits at random
        max_depth = 7,                  # maximum depth of the tree
        min_samples_split = 20,         # minimum number of samples required to split a node
        random_state = 17               # fixed seed for reproducibility
    )
    # Fit on the 1-D target column, consistent with the logistic regression below
    tree_model_tmp.fit(train_x_tmp, train_y_tmp["y"])

    # Logistic regression for this fold
    Logi_model_tmp = LogisticRegression(solver = "liblinear")
    Logi_model_tmp.fit(train_x_tmp, train_y_tmp["y"])

    # Predictions and targets for the rows that were NOT used to fit this fold's models.
    # All three lists are extended in the same fold order, so their rows stay aligned.
    pred_train_tree.extend(tree_model_tmp.predict_proba(valid_x_tmp)[:, 1].tolist())
    pred_train_logi.extend(Logi_model_tmp.predict_proba(valid_x_tmp)[:, 1].tolist())
    m_train_y.extend(valid_y_tmp["y"].tolist())

    # Add this fold's predictions on the validation rows to the running sums
    pred_valid_tree = pred_valid_tree + tree_model_tmp.predict_proba(valid_x)[:, 1]
    pred_valid_logi = pred_valid_logi + Logi_model_tmp.predict_proba(valid_x)[:, 1]


# Average the validation predictions over the K fold models and store them as lists
pred_valid_tree = (pred_valid_tree / K).tolist()
pred_valid_logi = (pred_valid_logi / K).tolist()


# Meta-model features and target, built from the out-of-fold predictions
m_train_x = pd.concat(
    [
        pd.DataFrame(data = pred_train_tree, columns = ["pred_tree"]),
        pd.DataFrame(data = pred_train_logi, columns = ["pred_logi"])
    ],
    axis = 1
)
m_train_y = pd.DataFrame(data = m_train_y, columns = ["y"])


# Fit the meta-model (logistic regression) on the out-of-fold predictions
meta_model_2 = LogisticRegression(solver = "liblinear")
meta_model_2.fit(m_train_x, m_train_y["y"])

## 検証データ適用方法1
　クロスバリデーションの各グループで構築したK個のベースモデルによる検証データ予測値の平均を, メタモデルの説明変数として使用.

In [13]:
# Meta-model features for the validation rows: the fold-averaged base-model predictions
m_valid_x_1 = pd.DataFrame(
    {
        "pred_tree": pred_valid_tree,
        "pred_logi": pred_valid_logi
    }
)

# Apply the stacking meta-model
pred_valid_m_1 = meta_model_2.predict_proba(m_valid_x_1)[:, 1]

# Display the validation AUC
roc_auc_score(y_true = valid_y, y_score = pred_valid_m_1)

0.9033122316996293

## 検証データ適用方法2
　構築データ全体で構築し直したベースモデルの検証データ予測値を, メタモデルの説明変数として使用.

In [14]:
# Remove the fold-id helper column so it is not used as a feature below.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
train_x = train_x.drop(columns = "cv_group", errors = "ignore")

In [15]:
# Decision tree: refit on the whole building data set
tree_model_all = DecisionTreeClassifier(
    criterion = "gini",             # use "entropy" for the entropy criterion
    splitter = "best",              # use "random" to choose splits at random
    max_depth = 7,                  # maximum depth of the tree
    min_samples_split = 20,         # minimum number of samples required to split a node
    random_state = 17               # fixed seed, as for the other trees in this notebook
)
tree_model_all = tree_model_all.fit(train_x, train_y["y"])

# Logistic regression: refit on the whole building data set
Logi_model_all = LogisticRegression(solver = "liblinear")
Logi_model_all.fit(train_x, train_y["y"])

# Predictions of the refitted base models on the validation rows
pred_valid_tree_all = tree_model_all.predict_proba(valid_x)[:, 1]
pred_valid_logi_all = Logi_model_all.predict_proba(valid_x)[:, 1]

# Meta-model features for the validation rows
m_valid_x_2 = pd.concat(
    [
        pd.DataFrame(data = pred_valid_tree_all, columns = ["pred_tree"]),
        pd.DataFrame(data = pred_valid_logi_all, columns = ["pred_logi"])
    ],
    axis = 1
)

# Apply the stacking meta-model
pred_valid_m_2 = meta_model_2.predict_proba(m_valid_x_2)[:, 1]

# Display the validation AUC
roc_auc_score(valid_y, pred_valid_m_2)

0.9019099900740037

## 結果まとめ

|手法|AUC|
| --- | --- |
|決定木単体             |  0.8283425547241918  |
|ロジスティック回帰単体 |  0.8978653340492817  |
|単純平均               |  0.901796224660881  |
|メタモデル             |  0.8978409660457104  |
|スタッキング(適用方法1)|  0.9033122316996293  |
|スタッキング(適用方法2)|  0.9019099900740037  |