## Stacking

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('input/sample-data/train.csv')

data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.sort_values('date').reset_index()

train_x, test_x = train_test_split(data, train_size=0.8, shuffle=False)

train_y = train_x['target']
train_x = train_x.drop('target', axis=1)

test_y = test_x['target']
test_x = test_x.drop('target', axis=1)

train_x = train_x.select_dtypes(include=[float, int])
test_x = test_x.select_dtypes(include=[float, int])

In [2]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

from models import Model1Xgb, Model1NN, Model2Linear

def predict_cv(model, train_x, train_y, test_x):
    preds = []
    preds_test = []
    va_idxes = []
    
    kf = KFold(n_splits=4, shuffle=True, random_state=71)
    
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
    
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    
    preds_test = np.mean(preds_test, axis=0)
    return pred_train, preds_test

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/usr/local/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ['/usr/local/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: cannot open shared object file: No such file or directory']


In [6]:
# 1層目のモデル
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)
model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(model_1a, train_x, train_y, test_x)

Parameters: { "silent" } are not used.

[0]	train-logloss:0.54152	eval-logloss:0.54981
[1]	train-logloss:0.45050	eval-logloss:0.46866
[2]	train-logloss:0.39165	eval-logloss:0.41922
[3]	train-logloss:0.35046	eval-logloss:0.38942
[4]	train-logloss:0.32201	eval-logloss:0.36968
[5]	train-logloss:0.29800	eval-logloss:0.35564
[6]	train-logloss:0.27455	eval-logloss:0.34026
[7]	train-logloss:0.25541	eval-logloss:0.33181
[8]	train-logloss:0.23857	eval-logloss:0.32133
[9]	train-logloss:0.22593	eval-logloss:0.31467
Parameters: { "silent" } are not used.

[0]	train-logloss:0.53817	eval-logloss:0.55281
[1]	train-logloss:0.44917	eval-logloss:0.47516
[2]	train-logloss:0.39010	eval-logloss:0.42682
[3]	train-logloss:0.34630	eval-logloss:0.39108
[4]	train-logloss:0.31644	eval-logloss:0.36876
[5]	train-logloss:0.28724	eval-logloss:0.35166
[6]	train-logloss:0.26847	eval-logloss:0.34006
[7]	train-logloss:0.25009	eval-logloss:0.32828
[8]	train-logloss:0.23800	eval-logloss:0.32067
[9]	train-logloss:0.22396	e

In [9]:
# 1層目のモデルの評価
print(f'logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}')
print(f'logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}')

logloss: 0.3046
logloss: 0.3046


In [12]:
# 予測値を特徴量としてデータフレームを作成
train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a, 'pred_1b': pred_train_1b})
test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a, 'pred_1b': pred_test_1b})

In [15]:
# 2層目のモデル
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f'logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}')

logloss: 0.3017
