In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold, train_test_split

from models import model_cat1, model_cat2
from models import model_xgb1, model_xgb2
from models import model_lgbm1, model_lgbm2, model_lgbm3
from models import model_rf1, model_rf2
from models import model_ridge
from models import model_knr1, model_knr2
from models import model_lr

### データの前処理

In [None]:
# Load the competition files from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Drop the training row with id 170514 and renumber the index from 0.
# NOTE(review): the reason for excluding this row is not recorded here.
train = train.loc[train["id"] != 170514].reset_index(drop=True)

# Feature columns: every column of train except the id and the target.
features = train.columns.drop(["id", "target"])

# Keep the ids and the target as separate Series, then narrow both frames
# down to the feature columns only.
train_id, train_target = train["id"], train["target"]
test_id = test["id"]
train, test = train[features], test[features]

In [None]:
# Cluster-label feature engineering: fit a 2-cluster KMeans on the training
# features only, then label the rows of both train and test with it.
km = KMeans(n_clusters=2, random_state=0).fit(train)
km_train, km_test = (
    pd.DataFrame(km.predict(frame), columns=["km_cluster"])
    for frame in (train, test)
)

# Append the cluster label as a new right-most column of each frame.
# Note: train / test are reassigned here, so re-running this cell would add
# a second "km_cluster" column.
train = pd.concat([train, km_train], axis=1)
test = pd.concat([test, km_test], axis=1)

In [None]:
## Working copies for the tree-based models, so the class columns added
## below do not modify train / test themselves.
train_dt, test_dt = train.copy(), test.copy()

# Gaussian-mixture labelling of a single feature (feature engineering)
def get_gmm_class_feature(feat, train_df, test_df, n):
    """Add a ``{feat}_class`` column holding each row's mixture component.

    A GaussianMixture with ``n`` components (random_state=1234) is fitted on
    the ``feat`` column of ``train_df`` only; the fitted model then labels the
    rows of both frames. Both DataFrames are modified in place and nothing is
    returned.
    """
    class_col = f'{feat}_class'
    # The estimator is fed a single-column 2-D array, hence reshape(-1, 1).
    train_values = train_df[feat].values.reshape(-1, 1)

    mixture = GaussianMixture(n_components=n, random_state=1234)
    mixture.fit(train_values)

    train_df[class_col] = mixture.predict(train_values)
    test_df[class_col] = mixture.predict(test_df[feat].values.reshape(-1, 1))

# Number of mixture components used for each continuous feature.
# (How these counts were chosen is not shown in this notebook.)
gmm_components = {
    'cont1': 4,
    'cont2': 10,
    'cont3': 6,
    'cont4': 4,
    'cont5': 3,
    'cont6': 2,
    'cont7': 3,
    'cont8': 4,
    'cont9': 4,
    'cont10': 8,
    'cont11': 5,
    'cont12': 4,
    'cont13': 6,
    'cont14': 6,
}

# Adds one `<feature>_class` column per entry to train_dt / test_dt, in place.
for feat_name, n_components in gmm_components.items():
    get_gmm_class_feature(feat_name, train_dt, test_dt, n_components)

In [None]:
# Build the frames for the linear-regression and k-nearest-neighbour models:
# one-hot encode the GMM class columns of train_dt / test_dt.
class_cols = ["cont{}_class".format(i) for i in range(1, 15)]

# Encode train and test together so that both frames get exactly the same
# dummy columns (and the same dropped baseline level), even if some class
# never occurs in one of the two frames.
n_train_rows = len(train_dt)
stacked = pd.concat([train_dt, test_dt], axis=0, ignore_index=True)

# `columns=` prefixes every dummy with its source column (e.g. "cont1_class_1"),
# so the names stay unique strings instead of repeating the bare labels
# 1, 2, 3, ... once per feature.
stacked = pd.get_dummies(stacked, columns=class_cols, drop_first=True)

# Split back into the two original row ranges, each with a fresh 0..n-1 index.
train_l = stacked.iloc[:n_train_rows].reset_index(drop=True)
test_l = stacked.iloc[n_train_rows:].reset_index(drop=True)
del stacked

In [None]:
# Hold-out split: 25% of the rows go to validation. The tree-model frame and
# the one-hot frame are split with the same test_size and random_state.
X_train_dt, X_val_dt, y_train_dt, y_val_dt = train_test_split(
    train_dt,
    train_target,
    test_size=0.25,
    random_state=1234,
)
X_train_l, X_val_l, y_train_l, y_val_l = train_test_split(
    train_l,
    train_target,
    test_size=0.25,
    random_state=1234,
)

### スタッキング1層目

In [None]:
# Helper for the stacking layers: cross-validated predictions for one model
def predict_cv(model, train, train_target, test, n_splits=4, random_state=1234):
    """Return out-of-fold predictions for ``train`` and averaged predictions for ``test``.

    The rows of ``train`` are split with a shuffled ``KFold``. For every fold
    the model is refitted on the training part and then predicts both the
    held-out part and the whole ``test`` frame.

    Parameters
    ----------
    model : object
        Must provide ``fit(X_train, y_train, X_val, y_val)`` and ``predict(X)``.
        The same instance is refitted on every fold.
    train : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    train_target : pandas.Series
        Target values, positionally aligned with ``train``.
    test : pandas.DataFrame
        Test features passed to ``model.predict`` after each fit.
    n_splits : int, default 4
        Number of folds.
    random_state : int, default 1234
        Seed for the shuffled fold assignment.

    Returns
    -------
    pred_train : numpy.ndarray
        Held-out predictions, reordered to match the row order of ``train``.
    preds_test : numpy.ndarray
        Mean over the folds of the predictions for ``test``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_chunks = []      # held-out predictions, one array per fold
    test_preds = []      # predictions for `test`, one array per fold
    val_positions = []   # row positions of each fold's held-out part

    # Fit and predict fold by fold, remembering which rows were held out.
    for train_idx, val_idx in kf.split(train):
        X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]
        y_train, y_val = train_target.iloc[train_idx], train_target.iloc[val_idx]
        model.fit(X_train, y_train, X_val, y_val)
        oof_chunks.append(model.predict(X_val))
        test_preds.append(model.predict(test))
        val_positions.append(val_idx)

    # The chunks are in fold order; argsort of the held-out positions puts
    # the concatenated predictions back into the original row order.
    val_positions = np.concatenate(val_positions)
    oof = np.concatenate(oof_chunks, axis=0)
    pred_train = oof[np.argsort(val_positions)]

    # Average the per-fold test predictions.
    preds_test = np.mean(test_preds, axis=0)
    return pred_train, preds_test

In [None]:
# First-layer helper: build one model and collect its cross-validated predictions
def _fit_first_layer(model_factory):
    """Instantiate ``model_factory`` and run ``predict_cv`` on train_dt / test_dt.

    Returns the model instance, its out-of-fold predictions for train_dt and
    its fold-averaged predictions for test_dt.
    """
    first_layer_model = model_factory()
    oof_pred, test_pred = predict_cv(first_layer_model, train_dt, train_target, test_dt)
    return first_layer_model, oof_pred, test_pred


# catboost1 (model tuned with optuna)
m_cat1, pred_train_cat1, pred_test_cat1 = _fit_first_layer(model_cat1)

# catboost2
m_cat2, pred_train_cat2, pred_test_cat2 = _fit_first_layer(model_cat2)

# xgboost1 (model tuned with optuna)
m_xgb1, pred_train_xgb1, pred_test_xgb1 = _fit_first_layer(model_xgb1)

# xgboost2
m_xgb2, pred_train_xgb2, pred_test_xgb2 = _fit_first_layer(model_xgb2)

# lightgbm1 (model tuned with optuna)
m_lgbm1, pred_train_lgbm1, pred_test_lgbm1 = _fit_first_layer(model_lgbm1)

# lightgbm2
m_lgbm2, pred_train_lgbm2, pred_test_lgbm2 = _fit_first_layer(model_lgbm2)

# lightgbm3
m_lgbm3, pred_train_lgbm3, pred_test_lgbm3 = _fit_first_layer(model_lgbm3)

# RandomForest1 (model tuned with optuna)
m_rf1, pred_train_rf1, pred_test_rf1 = _fit_first_layer(model_rf1)

# RandomForest2
m_rf2, pred_train_rf2, pred_test_rf2 = _fit_first_layer(model_rf2)

# NOTE(review): Ridge and the two KNeighbors models below are trained on
# train_dt / test_dt, not on the one-hot train_l / test_l frames that were
# prepared for linear and nearest-neighbour models - confirm this is intended.

# Ridge regression
m_ridge, pred_train_ridge, pred_test_ridge = _fit_first_layer(model_ridge)

# KNeighbors1 (hyperparameters tuned with a for loop)
m_knr1, pred_train_knr1, pred_test_knr1 = _fit_first_layer(model_knr1)

# KNeighbors2
m_knr2, pred_train_knr2, pred_test_knr2 = _fit_first_layer(model_knr2)



### スタッキング2層目

In [None]:
# Second-layer inputs: one column per first-layer model, holding its
# out-of-fold predictions (train_2) or its fold-averaged test predictions (test_2).
first_layer_preds = [
    ("catboost1", pred_train_cat1, pred_test_cat1),
    ("catboost2", pred_train_cat2, pred_test_cat2),
    ("xgboost1", pred_train_xgb1, pred_test_xgb1),
    ("xgboost2", pred_train_xgb2, pred_test_xgb2),
    ("LightGBM1", pred_train_lgbm1, pred_test_lgbm1),
    ("LightGBM2", pred_train_lgbm2, pred_test_lgbm2),
    ("LightGBM3", pred_train_lgbm3, pred_test_lgbm3),
    ("RandomForest1", pred_train_rf1, pred_test_rf1),
    ("RandomForest2", pred_train_rf2, pred_test_rf2),
    ("Ridge", pred_train_ridge, pred_test_ridge),
    ("knr1", pred_train_knr1, pred_test_knr1),
    ("knr2", pred_train_knr2, pred_test_knr2),
]

# Both frames share the same column names in the same order.
train_2 = pd.DataFrame({name: oof_pred for name, oof_pred, _ in first_layer_preds})
test_2 = pd.DataFrame({name: test_pred for name, _, test_pred in first_layer_preds})

In [None]:
# Second-layer prediction: run model_lr through the same cross-validation
# helper, using the first-layer predictions as its features.
m_lr = model_lr()
pre_train_2_lr, pred_test_2_lr = predict_cv(
    m_lr,
    train_2,
    train_target,
    test_2,
)

In [None]:
# Convert to the required submission layout: test ids as the index, the
# second-layer predictions as the "target" column, then write the CSV.
df = (
    pd.concat([test_id, pd.DataFrame(pred_test_2_lr)], axis=1)
    .rename(columns={0: "target"})
    .set_index("id")
)
df.to_csv("my_submission.csv", header=True)