In [39]:
import numpy as np
import pandas as pd
from typing import Union

In [40]:
train = pd.read_csv("../data/train_preprocessed.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]

test_x = pd.read_csv("../data/test_preprocessed.csv")
test_x

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_5,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth
0,49,1,187.431987,81.008363,1,1000000,302,212,1,10,...,0,1,0,1,0,0,2016,12,6,24204
1,79,1,171.632630,71.067812,6,2000,197,469,0,14,...,0,0,0,0,1,1,2016,9,3,24201
2,78,0,163.543983,64.032098,0,4000000,247,225,2,17,...,0,1,0,1,0,0,2015,4,10,24184
3,26,1,150.391858,52.322910,2,1000000,108,228,0,15,...,0,0,1,0,0,0,2016,4,17,24196
4,14,1,165.835167,67.008154,2,4000000,181,90,2,11,...,0,0,0,1,0,0,2015,1,26,24181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,21,1,185.174944,62.893499,8,3000,277,173,2,11,...,0,0,0,1,0,0,2015,3,11,24183
9996,34,1,157.581442,58.889901,8,2000,184,206,3,18,...,0,1,0,1,0,0,2016,3,27,24195
9997,36,1,177.676066,85.277018,2,6000000,443,191,1,11,...,0,0,0,1,1,0,2016,3,16,24195
9998,18,1,166.757782,64.254215,3,6000000,267,193,0,19,...,0,0,0,1,0,0,2015,6,17,24186


In [41]:
# モデルのスタッキング
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

from models import Model2Linear, ModleGBDT

In [42]:
def predict_cv(model:Union[ModleGBDT, Model2Linear], train_x:pd.DataFrame, train_y:pd.DataFrame, test_x:pd.DataFrame):
    """
    学習データに対する目的変数を知らない学習、予測を行い、テストデータに対する予測値を返す関数
    """
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=71)

    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

        model.fit(tr_x, tr_y, va_x, va_y)

        pred = model.predict(va_x)
        preds.append(pred)

        pred_test = model.predict(test_x)
        preds_test.append(pred_test)

        va_idxes.append(va_idx)

        print(preds_test)
    
    # validationに対する予測値を連結、元の順序に並べなおす
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)

    order = np.argsort(va_idxes)
    pred_train = preds[order]

    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [43]:
model_1a = ModleGBDT()
pred_train_1a, pred_test_1a = predict_cv(
    model=model_1a, train_x=train_x, train_y=train_y, test_x=test_x
)

print(f'logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}')

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[1]	train's binary_logloss: 0.454308	valid's binary_logloss: 0.465515
[2]	train's binary_logloss: 0.429565	valid's binary_logloss: 0.443444
[3]	train's binary_logloss: 0.410077	valid's binary_logloss: 0.425543
[4]	train's binary_logloss: 0.39358	valid's binary_logloss: 0.410625
[5]	train's binary_logloss: 0.379354	valid's binary_logloss: 0.397666
[6]	train's binary_logloss: 0.365913	valid's binary_logloss: 0.387422
[7]	train's binary_logloss: 0.354309	valid's binary_logloss: 0.376037
[8]	train's binary_logloss: 0.344354	valid's binary_logloss: 0.366734
[9]	train's binary_logloss: 0.334834	valid's binary_logloss: 0.35898
[10]	train's binary_logloss: 0.326209	valid's binary_logloss: 0.351612
[11]	train's binary_logloss: 0.317809	valid's binary_logloss: 0.34563
[12]	train's binary_logloss: 0.310845	valid's binary_logloss

In [44]:
pred_train_1a

array([0.0168521 , 0.08079163, 0.73945262, ..., 0.00476939, 0.00251913,
       0.00287568])

In [45]:
train_x_2 = pd.DataFrame({"pred_1a":pred_train_1a})
test_x_2 = pd.DataFrame({"pred_1a":pred_test_1a})

train_x_2

Unnamed: 0,pred_1a
0,0.016852
1,0.080792
2,0.739453
3,0.002048
4,0.446095
...,...
9995,0.029752
9996,0.273593
9997,0.004769
9998,0.002519


In [46]:
model_2 = Model2Linear()

pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)

f'logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}'

[array([0.1099099 , 0.05753015, 0.02990078, ..., 0.94392942, 0.02715792,
       0.42033222])]
[array([0.1099099 , 0.05753015, 0.02990078, ..., 0.94392942, 0.02715792,
       0.42033222]), array([0.11179359, 0.05818056, 0.03004526, ..., 0.94748128, 0.02726274,
       0.42940997])]
[array([0.1099099 , 0.05753015, 0.02990078, ..., 0.94392942, 0.02715792,
       0.42033222]), array([0.11179359, 0.05818056, 0.03004526, ..., 0.94748128, 0.02726274,
       0.42940997]), array([0.11131275, 0.05858234, 0.03060699, ..., 0.9428351 , 0.02782013,
       0.4207574 ])]
[array([0.1099099 , 0.05753015, 0.02990078, ..., 0.94392942, 0.02715792,
       0.42033222]), array([0.11179359, 0.05818056, 0.03004526, ..., 0.94748128, 0.02726274,
       0.42940997]), array([0.11131275, 0.05858234, 0.03060699, ..., 0.9428351 , 0.02782013,
       0.4207574 ]), array([0.10544075, 0.05477073, 0.02827816, ..., 0.94351755, 0.02566066,
       0.41242049])]


'logloss: 0.2290'