# Bagging
　決定木を弱学習器としたバギングの実装例. 単一の決定木と, ブートストラップ標本で学習した複数の決定木の予測平均を, 検証データのAUCで比較する.

## Import

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Setting

In [2]:
# Relative path of the directory that holds the input data sets
input_path = "../input_data/"

# Show at most 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

## Read Data and Make Data

In [3]:
# Load the training CSV and one-hot encode it in a single chain.
# drop_first=True keeps k-1 dummy columns for a categorical variable with k levels.
train = (
    pd.read_csv(input_path + "bank/train.csv", sep = ",", header = 0, quotechar = "\"")
    .pipe(pd.get_dummies, drop_first = True)
)

## Hold Out

In [4]:
# Hold-out split: 30% of the rows go to validation, the rest to training.
train_x, valid_x, train_y, valid_y = train_test_split(
    train.drop("y", axis = 1),      # features: every column except the target "y"
    train.loc[:, ["y"]],            # target, kept as a one-column DataFrame
    test_size = 0.3,                # share of rows used for validation
    random_state = 17,              # fixed seed so the split is reproducible
)

In [5]:
# Baseline model: a single decision tree fitted on the whole training split.
tree_model = DecisionTreeClassifier(
    random_state = 17,              # fixed seed so the baseline fit is reproducible on re-run
    criterion = "gini",             # use "entropy" for the entropy criterion
    splitter = "best",              # use "random" to pick splits at random
    max_depth = 7,                  # maximum depth of the tree
    min_samples_split = 10          # minimum number of samples required to split a node
)
# fit() returns the estimator; assigning it keeps the cell from echoing the model repr
tree_model = tree_model.fit(train_x, train_y)

In [6]:
# Score the validation rows with the single tree, using column 1 of predict_proba
# (the second entry of tree_model.classes_ -- presumably the positive label; TODO confirm).
pred = tree_model.predict_proba(valid_x)[:, 1]

# Validation AUC of the single tree (displayed as the cell output)
roc_auc_score(y_true = valid_y, y_score = pred)

0.8197424518306571

## Bagging

In [7]:
# Number of models (bootstrap rounds) in the ensemble
L = 10

# Fraction of the training rows drawn for each round, and the resulting sample size
rate_M = 0.1
M = int(train_x.shape[0] * rate_M)

# Validation AUC of each individual tree, and the running sum of predicted probabilities
auc = list()
pred_all = np.zeros(valid_x.shape[0])

# Bagging
for i in range(L):

    # Draw M row positions with replacement (bootstrap sample).
    # A local RandomState seeded with 17 * i produces the same draws as
    # np.random.seed(17 * i) followed by np.random.randint, but does not
    # reseed the global NumPy random state as a side effect.
    rng = np.random.RandomState(17 * i)
    row = rng.randint(0, train_x.shape[0], M)
    train_x_tmp = train_x.iloc[row]
    train_y_tmp = train_y.iloc[row]

    # Fit one decision tree on the bootstrap sample
    tree_model_tmp = DecisionTreeClassifier(
        random_state = 17,
        criterion = "gini",             # use "entropy" for the entropy criterion
        splitter = "best",              # use "random" to pick splits at random
        max_depth = 7,                  # maximum depth of the tree
        min_samples_split = 10          # minimum number of samples required to split a node
    )
    tree_model_tmp.fit(train_x_tmp, train_y_tmp)

    # Score the validation rows. Kept in pred_tmp so the single-tree `pred`
    # computed earlier in the notebook is not overwritten.
    pred_tmp = tree_model_tmp.predict_proba(valid_x)[:, 1]
    pred_all = pred_all + pred_tmp

    # AUC of this individual tree
    auc.append(roc_auc_score(valid_y, pred_tmp))


# Average the summed probabilities over the L trees
pred_all = pred_all / L

　各ブートストラップ標本で学習した決定木ごとの, 検証データに対するAUC.

In [8]:
# Validation AUC of each individual tree, in the order the trees were built
auc

[0.7227544812614378,
 0.7573628139074685,
 0.7689531159611288,
 0.7254960979472624,
 0.7522692523089042,
 0.7938737973885573,
 0.7683571812939092,
 0.7815019168534881,
 0.7297783780541471,
 0.7429096319076081]

　全決定木の予測確率を平均した (バギング後の) 予測の, 検証データに対するAUC.

In [9]:
# Validation AUC of the ensemble score pred_all
roc_auc_score(y_true = valid_y, y_score = pred_all)

0.8860070560510223