In [1]:
import pandas as pd
import scipy as sc
import numpy as np
import sklearn
import pickle
import pathlib as Path
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
# Apply seaborn's styling with the IPAexGothic font
# (presumably so Japanese text renders in plots — TODO confirm).
sb.set(font='IPAexGothic')

import multiprocessing
import itertools
import collections
import datetime
import gc

from tqdm._tqdm_notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show up to 50 columns when a DataFrame is rendered.
pd.options.display.max_columns = 50

# Input / output directories, relative to the notebook's working directory.
# `Path` is the pathlib module itself here (imported as `import pathlib as Path`).
data_path, result_path = [Path.Path(folder) for folder in ("../data", "../result")]

# Tag used as the leading part of the file name this notebook writes.
prefix = 'ana500'

# データ抽出
> ・基本情報（ana301_all_FTR_prob_df.pkl）<br>
> ・LGBM（ana401_pred_lgbm_df.pkl）<br>
> ・多値ロジスティック回帰（ana402_pred_logit_df.pkl）<br>
> ・K近傍法（ana403_pred_kmeans_df.pkl）※ファイル名・変数名は kmeans。手法名（k近傍法 / k-means）は要確認 <br>
> ・SVM（ana404_pred_svm_df.pkl） <br>

In [2]:
# Load the base table and the four base-model prediction tables from data_path.
# The file names are listed in the same order as the variables they are bound to:
#   base_df   - basic information
#   lgbm_df   - LightGBM predictions
#   logit_df  - multinomial logistic regression predictions
#   kmeans_df - labelled "k-nearest neighbours" in the original note, although the
#               file name says kmeans (TODO confirm which method produced it)
#   svm_df    - SVM predictions
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
base_df, lgbm_df, logit_df, kmeans_df, svm_df = [
    pd.read_pickle(data_path / file_name)
    for file_name in (
        'ana301_all_FTR_prob_df.pkl',
        'ana401_pred_lgbm_df.pkl',
        'ana402_pred_logit_df.pkl',
        'ana403_pred_kmeans_df.pkl',
        'ana404_pred_svm_df.pkl',
    )
]

# 　

# データ結合

In [3]:
# Status columns only (id, target y, train flag), copied so base_df stays untouched.
all_df = base_df.loc[:, ['id', 'y', 'flg_train']].copy()

In [4]:
# Attach each base model's 'pred' column under its own name.
# pandas aligns every Series to all_df on the row index.
all_df = all_df.assign(
    pred_lgbm=lgbm_df['pred'],      # LightGBM
    pred_logit=logit_df['pred'],    # logistic regression
    pred_kmeans=kmeans_df['pred'],  # kmeans
    pred_svm=svm_df['pred'],        # SVM
)

In [5]:
# Inspect the combined table: status columns plus one prediction column per model.
all_df

Unnamed: 0,id,y,flg_train,pred_lgbm,pred_logit,pred_kmeans,pred_svm
0,0,0.0,1,0.0,0.0,5,0.0
1,1,1.0,1,0.0,1.0,5,0.0
2,2,0.0,1,0.0,0.0,3,0.0
3,3,2.0,1,2.0,0.0,3,0.0
4,4,4.0,1,4.0,4.0,3,4.0
...,...,...,...,...,...,...,...
54203,33803,,0,0.0,0.0,1,0.0
54204,33804,,0,1.0,1.0,6,0.0
54205,33805,,0,2.0,0.0,6,0.0
54206,33806,,0,1.0,4.0,4,0.0


# 　

# 予測結果の統合
> ・最頻値 <br>
> ・再予測（LGBM）<br>

## 1．最頻値

In [6]:
# New frame based on the combined table, with the LightGBM prediction added a
# second time as pred_lgbm_2 so that LightGBM carries double weight.
mode_df = all_df.assign(pred_lgbm_2=lgbm_df['pred'])

In [7]:
# Explanatory variables: every column whose name contains 'pred' (predictions only).
cols = mode_df.filter(like='pred', axis=1).columns.tolist()
cols

['pred_lgbm', 'pred_logit', 'pred_kmeans', 'pred_svm', 'pred_lgbm_2']

In [8]:
# Row-wise most frequent prediction across the columns in `cols`.
# DataFrame.mode may return several columns when a row has tied values; only the
# first column is kept (presumably the smallest tied value — verify against pandas docs).
# NOTE(review): the displayed pred_kmeans values (e.g. 5, 6) do not obviously share
# the label space of the other predictions — confirm it belongs in this vote.
mode = mode_df.loc[:, cols].mode(axis=1).iloc[:, 0]

# 　

## 2．スタッキング（LGBM）

### 下準備

In [9]:
# Independent (deep) copy of the combined table for the stacking step.
stack_df = all_df.copy(deep=True)

In [10]:
# Training rows are the ones whose flg_train equals 1.
train_df = stack_df.loc[stack_df['flg_train'].eq(1)]
train_df.shape

(20400, 7)

In [11]:
# Target variable: the y column of the training rows.
y = train_df.loc[:, 'y']

# Explanatory variables: the columns whose name contains 'pred'.
X = train_df.loc[:, train_df.columns.str.contains('pred', regex=False)]

In [12]:
# LGBM
import lightgbm as lgbm

# CV
from sklearn.model_selection import GridSearchCV

### モデル推定

In [13]:
# Hyper-parameter grid searched for the stacking model.
# NOTE(review): a tree of depth d has at most 2**d leaves, so presumably several
# max_depth / num_leaves pairs below train identical models — confirm against the
# LightGBM documentation before trimming the grid.
pram_grid = dict(
    max_depth=list(range(3, 6)),   # 3, 4, 5
    n_estimators=[1000],
    num_leaves=[8, 16, 32, 64],    # 2**3 .. 2**6
    random_state=[123],            # fixed seed
)

In [14]:
# Grid search over pram_grid with a default LGBMClassifier, scored by 10-fold CV.
model = GridSearchCV(estimator=lgbm.LGBMClassifier(), param_grid=pram_grid, cv=10)

In [15]:
%%time
# Fit the grid search on the training features X and target y.
# The grid holds 3 max_depth x 4 num_leaves values = 12 candidates, each run with cv=10.
model.fit(X, y)

Wall time: 8min 1s


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5], 'n_estimators': [1000],
          

In [16]:
# Show the estimator selected by the grid search, with its parameters.
model.best_estimator_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=3,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=8, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
# Scores of the selected model.
# Mean cross-validated score of the best parameter combination: this is the
# figure to use as the performance estimate.
model.best_score_

# Score on the same rows the model was fitted on. It is optimistic by
# construction and is shown only to gauge the gap to the cross-validated score.
model.best_estimator_.score(X, y)

0.8647058823529412

0.9701960784313726

### モデル予測

In [18]:
# Predict for every row of stack_df with the selected model.
# The features are picked via X.columns so they are exactly the columns, in the
# same order, that the model was fitted on, instead of being re-derived from a
# second name pattern that could drift from the one used for training.
# NOTE: rows with flg_train == 1 were used for fitting, so their predictions are in-sample.
stck = model.best_estimator_.predict(stack_df.loc[:, X.columns])

# 　

# データ結合

In [19]:
# Final table: the identifying columns plus the two ensemble outputs.
result_df = all_df.loc[:, ['id', 'flg_train']].assign(
    mode=mode,  # row-wise mode of the predictions (Series, aligned on the index)
    stck=stck,  # stacking model predictions
)

In [20]:
# Inspect the final table: id, flg_train and the two ensemble predictions.
result_df

Unnamed: 0,id,flg_train,mode,stck
0,0,1,0.0,0.0
1,1,1,0.0,0.0
2,2,1,0.0,0.0
3,3,1,0.0,2.0
4,4,1,4.0,4.0
...,...,...,...,...
54203,33803,0,0.0,0.0
54204,33804,0,1.0,1.0
54205,33805,0,0.0,2.0
54206,33806,0,1.0,1.0


# 　

# データ保存

In [21]:
# Persist the ensemble predictions as "<prefix>_pred_stacking_df.pkl" under data_path.
result_df.to_pickle(data_path.joinpath("%s_pred_stacking_df.pkl" % prefix))