# Stacking
　スタッキングの実装例. 適用方法1で投稿ファイル作成まで.

## Import

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Setting

In [2]:
# Directory that holds the input data sets (relative path)
input_path = "../input_data/"

# Let pandas display up to 50 columns of a DataFrame
pd.set_option(
    "display.max_columns",
    50,
)

## Read Data and Make Data

In [3]:
# Parsing options shared by the train and test CSV files
csv_options = {
    "sep": ",",
    "header": 0,
    "quotechar": "\"",
}

# Explanatory variables fed to the base models
feature_cols = ["age", "duration", "campaign"]

# Training data: features and the target column "y" (kept as a DataFrame)
train = pd.read_csv(input_path + "bank/train.csv", **csv_options)
train_x = train[feature_cols].copy()
train_y = train[["y"]].copy()

# Test data: same feature columns as the training data
test = pd.read_csv(input_path + "bank/test.csv", **csv_options)
test_x = test[feature_cols].copy()

## Stackingの実装

In [4]:
# Number of cross-validation groups the training data is split into
K = 5

# Fix the global NumPy seed so the group assignment is reproducible
np.random.seed(17)

# Draw one integer in [0, K) per training row and use it as the group id
n_train_rows = len(train_x)
fold_ids = np.random.randint(0, K, n_train_rows)
train_x["cv_group"] = fold_ids.tolist()

# The target frame carries the same group id as the feature frame
train_y["cv_group"] = train_x["cv_group"]

# Accumulators filled inside the cross-validation loop:
#   pred_train_* : predictions for the held-out training rows, fold by fold
#   pred_test_*  : running sum of the test-set predictions of every fold model
#   m_train_y    : targets of the held-out rows, in the same order as pred_train_*
pred_train_tree = []
pred_train_logi = []
pred_test_tree = np.zeros(test_x.shape[0])
pred_test_logi = np.zeros(test_x.shape[0])
m_train_y = []

# Cross-validation: group i is held out, the remaining groups are used to fit
for i in range(K):
    # Row masks selecting the held-out group
    held_out_x = train_x["cv_group"] == i
    held_out_y = train_y["cv_group"] == i

    # Split and drop the helper column so it is not used as a feature/target
    train_x_tmp = train_x[~held_out_x].drop(columns="cv_group")
    train_y_tmp = train_y[~held_out_y].drop(columns="cv_group")
    valid_x_tmp = train_x[held_out_x].drop(columns="cv_group")
    valid_y_tmp = train_y[held_out_y].drop(columns="cv_group")

    # Decision tree base model
    tree_model_tmp = DecisionTreeClassifier(
        criterion="gini",        # use "entropy" for the entropy criterion
        splitter="best",         # use "random" for random splits
        max_depth=7,             # maximum depth of the tree
        min_samples_split=20,    # minimum number of samples needed to split
        random_state=17,
    )
    tree_model_tmp.fit(train_x_tmp, train_y_tmp)

    # Logistic regression base model
    Logi_model_tmp = LogisticRegression(solver="liblinear")
    Logi_model_tmp.fit(train_x_tmp, train_y_tmp["y"])

    # Predictions (second predict_proba column) and targets for the rows
    # that were not used to fit this fold's models
    pred_train_tree += tree_model_tmp.predict_proba(valid_x_tmp)[:, 1].tolist()
    pred_train_logi += Logi_model_tmp.predict_proba(valid_x_tmp)[:, 1].tolist()
    m_train_y += valid_y_tmp["y"].tolist()

    # Add this fold's test-set predictions to the running sums
    pred_test_tree += tree_model_tmp.predict_proba(test_x)[:, 1]
    pred_test_logi += Logi_model_tmp.predict_proba(test_x)[:, 1]

# Average the test-set predictions over the K fold models, then store as lists
pred_test_tree = (pred_test_tree / K).tolist()
pred_test_logi = (pred_test_logi / K).tolist()


# Meta-model inputs: one row per held-out prediction, one column per base model
m_train_x = pd.DataFrame(
    list(zip(pred_train_tree, pred_train_logi)),
    columns=["pred_tree", "pred_logi"],
)
# Targets in the same row order as m_train_x
m_train_y = pd.DataFrame({"y": m_train_y})


# Fit the meta-model (logistic regression) on the base-model predictions
meta_model_2 = LogisticRegression(solver="liblinear")
meta_model_2.fit(m_train_x, m_train_y["y"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

## 検証データ適用方法1

In [5]:
# Meta-model inputs for the test set: the fold-averaged base-model predictions,
# with the same column names and order used when fitting the meta-model
m_test_x_1 = pd.DataFrame(
    list(zip(pred_test_tree, pred_test_logi)),
    columns=["pred_tree", "pred_logi"],
)

# Apply the meta-model and keep the second predict_proba column
meta_proba = meta_model_2.predict_proba(m_test_x_1)
pred_valid_m_1 = meta_proba[:, 1]

In [6]:
# Submission frame: the test "id" column plus the meta-model prediction
submit_ens = test[["id"]].copy()
submit_ens["pred"] = pred_valid_m_1

## Output

In [7]:
# Write the submission as comma-separated values without index or header row
submit_file = "../submit/submit_tree_logi_ens_20201022.csv"
submit_ens.to_csv(submit_file, sep=",", index=False, header=False)