In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, preprocessing, model_selection
from sklearn import ensemble

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge

from sklearn.neighbors import KNeighborsRegressor

from rgf.sklearn import RGFRegressor

from sklearn.metrics import mean_squared_error



import os
import warnings
# Silence every warning raised from here on (including deprecation warnings).
warnings.filterwarnings('ignore')

# Let pandas render up to 300 columns / 300 rows before truncating output.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, RNN, GRU, LeakyReLU
from keras.metrics import RootMeanSquaredError
from keras.optimizers import adam_v2
from keras.optimizers import adadelta_v2
from keras.optimizers import adagrad_v2
from keras.optimizers import adamax_v2
from keras.optimizers import nadam_v2
from keras.layers import BatchNormalization
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping
from keras.initializers import RandomUniform
from keras.initializers import Zeros
from keras.initializers import HeNormal
from keras.initializers import HeUniform
from keras.initializers import GlorotUniform

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sample = map(
    pd.read_csv,
    ('../input/train.csv', '../input/test.csv', '../input/submit_sample.csv'),
)

In [None]:
# Keep the row identifiers aside, then remove them from both frames.
train_id, test_id = train['id'], test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# 'pm25_mid' is the regression target; every other column is a feature.
train_y = train['pm25_mid']
train_x = train.drop(columns=['pm25_mid'])
test_x = test.copy()

In [None]:
# Drop 'City': its categories do not overlap at all between train and test.
train_x = train_x.drop(columns=['City'])
test_x = test_x.drop(columns=['City'])

## 決定木の前処理

In [None]:
# One-hot encode the categorical columns.
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different columns. Align test to
# the train layout: dummies missing from test become all-zero columns and
# dummies unseen in train are dropped. This is a no-op when the columns match.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [None]:
# Adopted features: the four arithmetic combinations of longitude and latitude,
# added in place to both the train and the test frame.
for _frame in (train_x, test_x):
    _lon, _lat = _frame['lon'], _frame['lat']
    _frame['lon+lat'] = _lon + _lat
    _frame['lon-lat'] = _lon - _lat
    _frame['lon*lat'] = _lon * _lat
    _frame['lon/lat'] = _lon / _lat

In [None]:
# Convert both feature tables to NumPy arrays (predict_cv indexes rows by position).
train_x, test_x = np.array(train_x), np.array(test_x)

## 決定木の前処理_1

In [None]:
# One-hot encode the categorical columns.
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different columns. Align test to
# the train layout: dummies missing from test become all-zero columns and
# dummies unseen in train are dropped. This is a no-op when the columns match.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [None]:
# Convert both feature tables to NumPy arrays (predict_cv indexes rows by position).
train_x, test_x = np.array(train_x), np.array(test_x)

## 決定木の前処理_2

In [None]:
# One-hot encode the categorical columns.
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different columns. Align test to
# the train layout: dummies missing from test become all-zero columns and
# dummies unseen in train are dropped. This is a no-op when the columns match.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [None]:
# Polynomial feature creation.
# Remember the column names first: StandardScaler.transform returns bare arrays.
train_x_columns = train_x.columns

# Standardise using statistics fitted on the training set only.
scaler = StandardScaler()
scaler.fit(train_x)
train_x = pd.DataFrame(scaler.transform(train_x), columns=train_x_columns)
test_x = pd.DataFrame(scaler.transform(test_x), columns=train_x_columns)

# Rank the features with a default LightGBM model.
model = LGBMRegressor(random_state=0, verbose=0).fit(train_x, train_y)
importances = pd.DataFrame(
    model.feature_importances_, index=train_x.columns, columns=['importance']
).sort_values(by='importance', ascending=False)

# Only the ten most important features are expanded.
top_features = importances.index[:10]
poly_features = train_x[top_features]
poly_features_test = test_x[top_features]

# Degree-3 expansion (powers and interactions), fitted on the training rows.
poly_transformer = PolynomialFeatures(degree=3, interaction_only=False)
poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

# Column names of the expanded matrix. `get_feature_names` no longer exists in
# recent scikit-learn releases, so prefer `get_feature_names_out` and fall back
# to the old method only on versions that predate it.
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_columns = list(poly_transformer.get_feature_names_out(top_features))
else:
    poly_columns = list(poly_transformer.get_feature_names(top_features))

# Append the polynomial features to the original frames.
poly_features = pd.DataFrame(poly_features, columns=poly_columns)
poly_features_test = pd.DataFrame(poly_features_test, columns=poly_columns)

train_x = pd.concat([train_x, poly_features], axis=1)
test_x = pd.concat([test_x, poly_features_test], axis=1)

# The degree-1 terms repeat the original columns; keep the first occurrence only.
train_x = train_x.loc[:, ~train_x.columns.duplicated()]
test_x = test_x.loc[:, ~test_x.columns.duplicated()]

In [None]:
# Convert both feature tables to NumPy arrays (predict_cv indexes rows by position).
train_x, test_x = np.array(train_x), np.array(test_x)

## ニューラルネットワークの前処理

In [None]:
# Columns that are left untouched; every other column is clipped.
non_clip = ['year', 'month', 'day', 'Country', 'lat', 'lon']
train_clip_x = train_x.drop(columns=non_clip)
test_clip_x = test_x.drop(columns=non_clip)

# Per-column 1st / 99th percentiles, computed on the training rows only.
p01, p99 = (train_clip_x.quantile(q) for q in (0.01, 0.99))

# Clamp both sets to the training percentiles.
train_clip_x = train_clip_x.clip(lower=p01, upper=p99, axis=1)
test_clip_x = test_clip_x.clip(lower=p01, upper=p99, axis=1)

# Reassemble: untouched columns first, clipped columns after.
train_x = pd.concat([train_x[non_clip], train_clip_x], axis=1)
test_x = pd.concat([test_x[non_clip], test_clip_x], axis=1)

In [None]:
# One-hot encode the categorical columns.
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different columns. Align test to
# the train layout: dummies missing from test become all-zero columns and
# dummies unseen in train are dropped. This is a no-op when the columns match.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [None]:
# Yeo-Johnson power transform (the PowerTransformer default), fitted on the
# training set and then applied to both sets.
scaler = PowerTransformer()
scaler.fit(train_x)
train_x, test_x = scaler.transform(train_x), scaler.transform(test_x)

# モデルの定義

### 決定木

##### ExtraTreesRegressor

In [None]:
# Model backed by ExtraTreesRegressor
class ModelETR:
    """ExtraTreesRegressor behind the notebook's common fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train 500 extra-trees on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        self.model = ExtraTreesRegressor(n_estimators=500, random_state=0)
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

#### RandomForestRegressor

In [None]:
# Model backed by RandomForestRegressor
class ModelRFR:
    """RandomForestRegressor behind the notebook's common fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a default random forest (random_state=0) on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        forest = RandomForestRegressor(random_state=0)
        self.model = forest
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

#### GradientBoostingRegressor

In [None]:
# Model backed by GradientBoostingRegressor
class ModelGBR:
    """GradientBoostingRegressor behind the notebook's common fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train the gradient-boosting model on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        hyper_params = {
            'learning_rate': 0.01,
            'n_estimators': 150,
            'subsample': 1.0,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_depth': 15,
            'random_state': 0,
        }
        self.model = GradientBoostingRegressor(**hyper_params)
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### CatBoostRegressor

In [None]:
# Model backed by CatBoostRegressor
class ModelCBR:
    """CatBoostRegressor behind the notebook's common fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a default CatBoost regressor (random_state=0, silent) on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        self.model = CatBoostRegressor(verbose=0, random_state=0)
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### XGBRegressor

In [None]:
# Model backed by XGBRegressor
class ModelXGB:
    """XGBoost regressor (level 1) with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring (va_x, va_y) for early stopping.

        Up to 10000 boosting rounds are allowed; training is asked to stop
        after 40 rounds without improvement on the validation fold.
        """
        hyper_params = {
            'objective': 'reg:squarederror',
            'booster': 'gbtree',
            'random_state': 0,
            'n_estimators': 10000,
            'subsample': 1.0,
            'colsample_bytree': 0.6,
            'reg_alpha': 1e-5,
            'reg_lambda': 1,
            'learning_rate': 0.1,
            'min_child_weight': 2,
            'max_depth': 9,
            'gamma': 0.2,
        }
        self.model = XGBRegressor(**hyper_params)
        validation_sets = [(va_x, va_y)]
        self.model.fit(tr_x, tr_y,
                       early_stopping_rounds=40,
                       eval_set=validation_sets)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### LGBMRegressor

In [None]:
# Model backed by LGBMRegressor (simple engineered features)
class ModelLGB:
    """LightGBM regressor with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring RMSE on (va_x, va_y).

        Up to 10000 boosting rounds are allowed; the early-stopping callback
        is configured with 40 stopping rounds.
        """
        hyper_params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'n_estimators': 10000,
            'reg_alpha': 100,
            'reg_lambda': 0,
            'num_leaves': 550,
            'colsample_bytree': 0.7,
            'subsample': 1.0,
            'subsample_freq': 0,
            'min_child_samples': 40,
            'random_state': 0,
        }
        self.model = LGBMRegressor(**hyper_params)
        self.model.fit(tr_x, tr_y,
                       eval_metric='rmse',
                       eval_set=[(va_x, va_y)],
                       callbacks=[lgb.early_stopping(stopping_rounds=40)])

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

In [None]:
# Model backed by LGBMRegressor (one-hot encoding only)
class ModelLGB_1:
    """LightGBM regressor with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring RMSE on (va_x, va_y).

        Up to 10000 boosting rounds are allowed; the early-stopping callback
        is configured with 40 stopping rounds.
        """
        hyper_params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'n_estimators': 10000,
            'reg_alpha': 100,
            'reg_lambda': 0,
            'num_leaves': 450,
            'colsample_bytree': 1.0,
            'subsample': 1.0,
            'subsample_freq': 0,
            'min_child_samples': 10,
            'random_state': 0,
        }
        self.model = LGBMRegressor(**hyper_params)
        self.model.fit(tr_x, tr_y,
                       eval_metric='rmse',
                       eval_set=[(va_x, va_y)],
                       callbacks=[lgb.early_stopping(stopping_rounds=40)])

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

In [None]:
# Model backed by LGBMRegressor (polynomial features)
class ModelLGB_2:
    """LightGBM regressor with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring RMSE on (va_x, va_y).

        Up to 10000 boosting rounds are allowed; the early-stopping callback
        is configured with 40 stopping rounds.
        """
        hyper_params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'n_estimators': 10000,
            'reg_alpha': 100,
            'reg_lambda': 0,
            'num_leaves': 400,
            'colsample_bytree': 0.9,
            'subsample': 1.0,
            'subsample_freq': 0,
            'min_child_samples': 20,
            'random_state': 0,
        }
        self.model = LGBMRegressor(**hyper_params)
        self.model.fit(tr_x, tr_y,
                       eval_metric='rmse',
                       eval_set=[(va_x, va_y)],
                       callbacks=[lgb.early_stopping(stopping_rounds=40)])

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### RGFRegressor

In [None]:
# Model backed by RGFRegressor
class ModelRGF:
    """RGFRegressor behind the notebook's common fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train an RGFRegressor with default settings on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        regressor = RGFRegressor()
        self.model = regressor
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

### 線形

##### LinearRegression

In [None]:
# LinearRegression
class ModelLR:
    """Ordinary least-squares model behind the notebook's fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Fit LinearRegression on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        regressor = LinearRegression()
        self.model = regressor
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### Ridge

In [None]:
# Ridge
class ModelR:
    """Ridge regression behind the notebook's fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Fit Ridge(random_state=0) on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        regressor = Ridge(random_state=0)
        self.model = regressor
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

##### BayesianRidge

In [None]:
# BayesianRidge
class ModelBR:
    """Bayesian ridge regression behind the notebook's fit/predict interface."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Fit BayesianRidge with default settings on (tr_x, tr_y).

        va_x and va_y exist only for interface compatibility and are not used.
        """
        regressor = BayesianRidge()
        self.model = regressor
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

### ニューラルネットワーク

##### 各パラメータの設定

In [None]:
# Optimizer: Nadam with its default settings.
nadam = nadam_v2.Nadam()

# Early stopping: watches 'val_loss' with a patience of 8 epochs.
early_stopping = EarlyStopping(patience=8, monitor='val_loss')

# Function that returns the learning rate for a given epoch
def scheduler(epoch):
    """Step-decay schedule: return the learning rate to use at `epoch`.

    0.002 up to epoch 9, 0.0002 from epoch 10, 0.00002 from epoch 20 and
    0.000002 from epoch 25 onwards.
    """
    # Check the latest stage first so the first match is the right one.
    if epoch >= 25:
        return 0.000002
    if epoch >= 20:
        return 0.00002
    if epoch >= 10:
        return 0.0002
    return 0.002

# Wrap the schedule function in a Keras callback. This rebinds the name
# `scheduler` from the function to the callback object.
scheduler = LearningRateScheduler(schedule=scheduler, verbose=1)

##### NN_1(512 - 512 - 1)

In [None]:
class ModelNN_1:
    """Fully connected network, 512 - 512 - 1, with dropout and batch norm."""

    def __init__(self):
        # Set by fit(); None until a network has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build and train a fresh network on (tr_x, tr_y).

        (va_x, va_y) is the validation data used by the module-level
        `early_stopping` and `scheduler` callbacks.
        """
        batch_size = 128
        epochs = 30

        model = Sequential()
        # The input width comes from the data actually being fitted, not from
        # the notebook-level `train_x`, so the class also works on other inputs.
        model.add(Dense(512, activation='relu', input_shape=(tr_x.shape[1],),
                        kernel_initializer=HeUniform(),
                        bias_initializer=Zeros())
                  )
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(512, activation='relu',
                        kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(1))

        # A new optimizer per fit: a shared instance would carry its internal
        # state over from one fold / model to the next.
        model.compile(loss='mean_squared_error', optimizer=nadam_v2.Nadam(),
                      metrics=[RootMeanSquaredError()])

        model.fit(tr_x, tr_y,
                  batch_size=batch_size, epochs=epochs,
                  verbose=1, validation_data=(va_x, va_y),
                  callbacks=[early_stopping, scheduler])
        self.model = model

    def predict(self, x):
        """Return the network's predictions for x, flattened to one dimension."""
        pred = self.model.predict(x).reshape(-1)
        return pred

##### NN_2(512 - 256 -1)

In [None]:
class ModelNN_2:
    """Fully connected network, 512 - 256 - 1, with dropout and batch norm."""

    def __init__(self):
        # Set by fit(); None until a network has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build and train a fresh network on (tr_x, tr_y).

        (va_x, va_y) is the validation data used by the module-level
        `early_stopping` and `scheduler` callbacks.
        """
        batch_size = 128
        epochs = 30

        model = Sequential()
        # The input width comes from the data actually being fitted, not from
        # the notebook-level `train_x`, so the class also works on other inputs.
        model.add(Dense(512, activation='relu', input_shape=(tr_x.shape[1],),
                        kernel_initializer=HeUniform(),
                        bias_initializer=Zeros())
                  )
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu',
                        kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(1))

        # A new optimizer per fit: a shared instance would carry its internal
        # state over from one fold / model to the next.
        model.compile(loss='mean_squared_error', optimizer=nadam_v2.Nadam(),
                      metrics=[RootMeanSquaredError()])

        model.fit(tr_x, tr_y,
                  batch_size=batch_size, epochs=epochs,
                  verbose=1, validation_data=(va_x, va_y),
                  callbacks=[early_stopping, scheduler])
        self.model = model

    def predict(self, x):
        """Return the network's predictions for x, flattened to one dimension."""
        pred = self.model.predict(x).reshape(-1)
        return pred

##### NN_3(512 -256 - 256 - 1)

In [None]:
class ModelNN_3:
    """Fully connected network, 512 - 256 - 256 - 1, with dropout and batch norm."""

    def __init__(self):
        # Set by fit(); None until a network has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build and train a fresh network on (tr_x, tr_y).

        (va_x, va_y) is the validation data used by the module-level
        `early_stopping` and `scheduler` callbacks.
        """
        batch_size = 128
        epochs = 30

        model = Sequential()
        # The input width comes from the data actually being fitted, not from
        # the notebook-level `train_x`, so the class also works on other inputs.
        model.add(Dense(512, activation='relu', input_shape=(tr_x.shape[1],),
                        kernel_initializer=HeUniform(),
                        bias_initializer=Zeros())
                  )
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu', kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu', kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(1))

        # A new optimizer per fit: a shared instance would carry its internal
        # state over from one fold / model to the next.
        model.compile(loss='mean_squared_error', optimizer=nadam_v2.Nadam(),
                      metrics=[RootMeanSquaredError()])

        model.fit(tr_x, tr_y,
                  batch_size=batch_size, epochs=epochs,
                  verbose=1, validation_data=(va_x, va_y),
                  callbacks=[early_stopping, scheduler])
        self.model = model

    def predict(self, x):
        """Return the network's predictions for x, flattened to one dimension."""
        pred = self.model.predict(x).reshape(-1)
        return pred

##### NN_4(512 - 384 - 256 - 1)

In [None]:
class ModelNN_4:
    """Fully connected network, 512 - 384 - 256 - 1, with dropout and batch norm."""

    def __init__(self):
        # Set by fit(); None until a network has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build and train a fresh network on (tr_x, tr_y).

        (va_x, va_y) is the validation data used by the module-level
        `early_stopping` and `scheduler` callbacks.
        """
        batch_size = 128
        epochs = 30

        model = Sequential()
        # The input width comes from the data actually being fitted, not from
        # the notebook-level `train_x`, so the class also works on other inputs.
        model.add(Dense(512, activation='relu', input_shape=(tr_x.shape[1],),
                        kernel_initializer=HeUniform(),
                        bias_initializer=Zeros())
                  )
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(384, activation='relu', kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu', kernel_initializer=HeUniform()))
        model.add(Dropout(0.2, seed=0))
        model.add(BatchNormalization())
        model.add(Dense(1))

        # A new optimizer per fit: a shared instance would carry its internal
        # state over from one fold / model to the next.
        model.compile(loss='mean_squared_error', optimizer=nadam_v2.Nadam(),
                      metrics=[RootMeanSquaredError()])

        model.fit(tr_x, tr_y,
                  batch_size=batch_size, epochs=epochs,
                  verbose=1, validation_data=(va_x, va_y),
                  callbacks=[early_stopping, scheduler])
        self.model = model

    def predict(self, x):
        """Return the network's predictions for x, flattened to one dimension."""
        pred = self.model.predict(x).reshape(-1)
        return pred

# アンサンブル

In [None]:
def _take_rows(data, idx):
    """Select rows by position from a NumPy array or a pandas object."""
    # pandas objects are indexed through .iloc so the fold indices are always
    # treated as positions, never as index labels.
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


# Returns out-of-fold predictions for the training data (made without seeing
# the target of the predicted rows) and predictions for the test data.
def predict_cv(model, train_x, train_y, test_x, seed, n_splits=4):
    """Run shuffled K-fold cross-validation with `model`.

    Parameters
    ----------
    model : object providing fit(tr_x, tr_y, va_x, va_y) and predict(x).
    train_x, train_y : training features and target (NumPy array or pandas
        object); rows are selected by position.
    test_x : test features, passed to model.predict unchanged.
    seed : random_state of the KFold shuffling.
    n_splits : number of folds (default 4, the value previously hard-coded).

    Returns
    -------
    (pred_train, preds_test) : out-of-fold predictions in the original row
        order, and the test predictions averaged over the folds.
    """
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Fit on each training split, predict its validation split and the test
    # set, and remember which rows each validation prediction belongs to.
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = _take_rows(train_x, tr_idx), _take_rows(train_x, va_idx)
        tr_y, va_y = _take_rows(train_y, tr_idx), _take_rows(train_y, va_idx)
        model.fit(tr_x, tr_y, va_x, va_y)
        preds.append(model.predict(va_x))
        preds_test.append(model.predict(test_x))
        va_idxes.append(va_idx)

    # Concatenate the validation predictions and restore the original row order.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # Average the per-fold test predictions.
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

## 1層目

#### ModelLGB

In [None]:
# Six repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_lgb = ModelLGB()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
    (pred_train_1d, pred_test_1d),
    (pred_train_1e, pred_test_1e),
    (pred_train_1f, pred_test_1f),
) = [
    predict_cv(model_lgb, train_x, train_y, test_x, cv_seed)
    for cv_seed in (0, 10, 20, 30, 40, 50)
]

In [None]:
# Average the out-of-fold and the test predictions over the six seeds.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c,
               pred_train_1d, pred_train_1e, pred_train_1f)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c,
              pred_test_1d, pred_test_1e, pred_test_1f)
pred_train_1_lgb = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_lgb = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)

#### ModelLGB_1

In [None]:
# Six repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_lgb = ModelLGB_1()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
    (pred_train_1d, pred_test_1d),
    (pred_train_1e, pred_test_1e),
    (pred_train_1f, pred_test_1f),
) = [
    predict_cv(model_lgb, train_x, train_y, test_x, cv_seed)
    for cv_seed in (0, 10, 20, 30, 40, 50)
]

In [None]:
# Average the out-of-fold and the test predictions over the six seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c,
               pred_train_1d, pred_train_1e, pred_train_1f)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c,
              pred_test_1d, pred_test_1e, pred_test_1f)
pred_train_1_lgb_1 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_lgb_1 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('LGB:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_lgb_1, squared=False))

#### ModelLGB_2

In [None]:
# Six repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_lgb = ModelLGB_2()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
    (pred_train_1d, pred_test_1d),
    (pred_train_1e, pred_test_1e),
    (pred_train_1f, pred_test_1f),
) = [
    predict_cv(model_lgb, train_x, train_y, test_x, cv_seed)
    for cv_seed in (0, 10, 20, 30, 40, 50)
]

In [None]:
# Average the out-of-fold and the test predictions over the six seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c,
               pred_train_1d, pred_train_1e, pred_train_1f)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c,
              pred_test_1d, pred_test_1e, pred_test_1f)
pred_train_1_lgb_2 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_lgb_2 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('LGB:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_lgb_2, squared=False))

#### ModelXGB

In [None]:
# Six repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_xgb = ModelXGB()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
    (pred_train_1d, pred_test_1d),
    (pred_train_1e, pred_test_1e),
    (pred_train_1f, pred_test_1f),
) = [
    predict_cv(model_xgb, train_x, train_y, test_x, cv_seed)
    for cv_seed in (60, 70, 80, 90, 100, 110)
]

In [None]:
# Average the out-of-fold and the test predictions over the six seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c,
               pred_train_1d, pred_train_1e, pred_train_1f)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c,
              pred_test_1d, pred_test_1e, pred_test_1f)
pred_train_1_xgb = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_xgb = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('XGB:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_xgb, squared=False))

#### ModelCBR

In [None]:
# Six repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_cbr = ModelCBR()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
    (pred_train_1d, pred_test_1d),
    (pred_train_1e, pred_test_1e),
    (pred_train_1f, pred_test_1f),
) = [
    predict_cv(model_cbr, train_x, train_y, test_x, cv_seed)
    for cv_seed in (120, 130, 140, 150, 160, 170)
]

In [None]:
# Average the out-of-fold and the test predictions over the six seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c,
               pred_train_1d, pred_train_1e, pred_train_1f)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c,
              pred_test_1d, pred_test_1e, pred_test_1f)
pred_train_1_cbr = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_cbr = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('CBR:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_cbr, squared=False))

#### NN_1

In [None]:
# Three repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_nn = ModelNN_1()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
) = [
    predict_cv(model_nn, train_x, train_y, test_x, cv_seed)
    for cv_seed in (180, 190, 200)
]

In [None]:
# Average the out-of-fold and the test predictions over the three seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c)
pred_train_1_nn1 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_nn1 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('NN_1:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_nn1, squared=False))

#### NN_2

In [None]:
# Three repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_nn = ModelNN_2()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
) = [
    predict_cv(model_nn, train_x, train_y, test_x, cv_seed)
    for cv_seed in (210, 220, 230)
]

In [None]:
# Average the out-of-fold and the test predictions over the three seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c)
pred_train_1_nn2 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_nn2 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('NN_2:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_nn2, squared=False))

#### NN_3

In [None]:
# Three repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_nn = ModelNN_3()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
) = [
    predict_cv(model_nn, train_x, train_y, test_x, cv_seed)
    for cv_seed in (240, 250, 260)
]

In [None]:
# Average the out-of-fold and the test predictions over the three seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c)
pred_train_1_nn3 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_nn3 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('NN_3:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_nn3, squared=False))

#### NN_4

In [None]:
# Three repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_nn = ModelNN_4()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
) = [
    predict_cv(model_nn, train_x, train_y, test_x, cv_seed)
    for cv_seed in (270, 280, 290)
]

In [None]:
# Average the out-of-fold and the test predictions over the three seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c)
pred_train_1_nn4 = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_nn4 = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('NN_4:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_nn4, squared=False))

#### ETR

In [None]:
# Single cross-validated run of the extra-trees model (KFold seed 300).
model_etr = ModelETR()
pred_train_1_etr, pred_test_1_etr = predict_cv(
    model=model_etr, train_x=train_x, train_y=train_y, test_x=test_x, seed=300,
)

#### RFR

In [None]:
# Single cross-validated run of the random-forest model (KFold seed 300).
model_rfr = ModelRFR()
# Pass the random-forest wrapper created above. The previous code passed
# `model_etr`, so the "rfr" predictions were a second copy of the ETR ones.
pred_train_1_rfr, pred_test_1_rfr = predict_cv(model_rfr, train_x, train_y, test_x, 300)

#### GBR

In [None]:
# Single cross-validated run of the gradient-boosting model (KFold seed 400),
# followed by its out-of-fold RMSE.
model_gbr = ModelGBR()
pred_train_1_gbr, pred_test_1_gbr = predict_cv(
    model=model_gbr, train_x=train_x, train_y=train_y, test_x=test_x, seed=400,
)
print('GBR:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_gbr, squared=False))

#### RGF(シンプルな特徴量作成) 実行はgoogle colab

In [None]:
# Three repeats of the cross-validated fit; only the KFold shuffling seed changes.
model_rgf = ModelRGF()
(
    (pred_train_1a, pred_test_1a),
    (pred_train_1b, pred_test_1b),
    (pred_train_1c, pred_test_1c),
) = [
    predict_cv(model_rgf, train_x, train_y, test_x, cv_seed)
    for cv_seed in (360, 370, 380)
]

In [None]:
# Average the out-of-fold and the test predictions over the three seeds,
# then report the out-of-fold RMSE.
_train_runs = (pred_train_1a, pred_train_1b, pred_train_1c)
_test_runs = (pred_test_1a, pred_test_1b, pred_test_1c)
pred_train_1_rgf = sum(_train_runs[1:], _train_runs[0]) / len(_train_runs)
pred_test_1_rgf = sum(_test_runs[1:], _test_runs[0]) / len(_test_runs)
print('RGF:', mean_squared_error(y_true=train_y, y_pred=pred_train_1_rgf, squared=False))

## 2層目

In [None]:
# Level-2 inputs: one column per level-1 model, holding its out-of-fold
# predictions (train) or its fold-averaged predictions (test). The column
# order is the same in both frames.
_level1_train = {
    'pred_lgb': pred_train_1_lgb,
    'pred_lgb_1': pred_train_1_lgb_1,
    'pred_lgb_2': pred_train_1_lgb_2,
    'pred_xgb': pred_train_1_xgb,
    'pred_cbr': pred_train_1_cbr,
    'pred_nn1': pred_train_1_nn1,
    'pred_nn2': pred_train_1_nn2,
    'pred_nn3': pred_train_1_nn3,
    'pred_nn4': pred_train_1_nn4,
    'pred_etr': pred_train_1_etr,
    'pred_rfr': pred_train_1_rfr,
    'pred_gbr': pred_train_1_gbr,
    'pred_rgf': pred_train_1_rgf,
}
_level1_test = {
    'pred_lgb': pred_test_1_lgb,
    'pred_lgb_1': pred_test_1_lgb_1,
    'pred_lgb_2': pred_test_1_lgb_2,
    'pred_xgb': pred_test_1_xgb,
    'pred_cbr': pred_test_1_cbr,
    'pred_nn1': pred_test_1_nn1,
    'pred_nn2': pred_test_1_nn2,
    'pred_nn3': pred_test_1_nn3,
    'pred_nn4': pred_test_1_nn4,
    'pred_etr': pred_test_1_etr,
    'pred_rfr': pred_test_1_rfr,
    'pred_gbr': pred_test_1_gbr,
    'pred_rgf': pred_test_1_rgf,
}
train_x_2 = pd.DataFrame(_level1_train)
test_x_2 = pd.DataFrame(_level1_test)

In [None]:
# Convert the level-2 feature tables to NumPy arrays for predict_cv.
train_x_2, test_x_2 = np.array(train_x_2), np.array(test_x_2)

In [None]:
# Level-2 linear regression on the level-1 predictions; prints out-of-fold RMSE.
model_lr = ModelLR()
pred_train_2_lr, pred_test_2_lr = predict_cv(
    model=model_lr, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=0,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_lr, squared=False))

In [None]:
# Level-2 ridge regression on the level-1 predictions; prints out-of-fold RMSE.
model_r = ModelR()
pred_train_2_r, pred_test_2_r = predict_cv(
    model=model_r, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=500,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_r, squared=False))

In [None]:
# Level-2 Bayesian ridge on the level-1 predictions; prints out-of-fold RMSE.
model_br = ModelBR()
pred_train_2_br, pred_test_2_br = predict_cv(
    model=model_br, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=500,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_br, squared=False))

In [None]:
# LGBMRegressor
class ModelLGB_3:
    """LightGBM regressor with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring RMSE on (va_x, va_y).

        Up to 10000 boosting rounds are allowed; the early-stopping callback
        is configured with 40 stopping rounds.
        """
        # NOTE(review): subsample_freq=0 presumably disables bagging in
        # LightGBM, which would make subsample=0.6 a no-op — confirm.
        hyper_params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'n_estimators': 10000,
            'reg_alpha': 10,
            'reg_lambda': 0,
            'num_leaves': 20,
            'colsample_bytree': 0.7,
            'subsample': 0.6,
            'subsample_freq': 0,
            'min_child_samples': 10,
            'random_state': 0,
        }
        self.model = LGBMRegressor(**hyper_params)
        self.model.fit(tr_x, tr_y,
                       eval_metric='rmse',
                       eval_set=[(va_x, va_y)],
                       callbacks=[lgb.early_stopping(stopping_rounds=40)])

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

In [None]:
# Level-2 LightGBM on the level-1 predictions; prints out-of-fold RMSE.
model_lgb = ModelLGB_3()
pred_train_2_lgb, pred_test_2_lgb = predict_cv(
    model=model_lgb, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=0,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_lgb, squared=False))

In [None]:
# XGBRegressor
class ModelXGB_2:
    """XGBoost regressor with early stopping on the validation fold."""

    def __init__(self):
        # Set by fit(); None until a model has been trained.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring (va_x, va_y) for early stopping.

        Up to 10000 boosting rounds are allowed; training is asked to stop
        after 40 rounds without improvement on the validation fold.
        """
        hyper_params = {
            'objective': 'reg:squarederror',
            'booster': 'gbtree',
            'random_state': 0,
            'n_estimators': 10000,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1,
            'learning_rate': 0.05,
            'min_child_weight': 3,
            'max_depth': 5,
            'gamma': 0,
        }
        self.model = XGBRegressor(**hyper_params)
        validation_sets = [(va_x, va_y)]
        self.model.fit(tr_x, tr_y,
                       early_stopping_rounds=40,
                       eval_set=validation_sets)

    def predict(self, x):
        """Return the fitted model's predictions for x."""
        return self.model.predict(x)

In [None]:
# Level-2 XGBoost on the level-1 predictions; prints out-of-fold RMSE.
model_xgb = ModelXGB_2()
pred_train_2_xgb, pred_test_2_xgb = predict_cv(
    model=model_xgb, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=0,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_xgb, squared=False))

In [None]:
# Level-2 extra-trees on the level-1 predictions; prints out-of-fold RMSE.
model_etr = ModelETR()
pred_train_2_etr, pred_test_2_etr = predict_cv(
    model=model_etr, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=0,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_etr, squared=False))

In [None]:
# Level-2 CatBoost on the level-1 predictions; prints out-of-fold RMSE.
model_cbr = ModelCBR()
pred_train_2_cbr, pred_test_2_cbr = predict_cv(
    model=model_cbr, train_x=train_x_2, train_y=train_y, test_x=test_x_2, seed=90,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_2_cbr, squared=False))

## 3層目

In [None]:
# Level-3 inputs: one column per level-2 model, holding its out-of-fold
# predictions (train) or its fold-averaged predictions (test). The column
# order is the same in both frames.
_level2_train = {
    'pred_lr': pred_train_2_lr,
    'pred_lgb': pred_train_2_lgb,
    'pred_etr': pred_train_2_etr,
    'pred_xgb': pred_train_2_xgb,
    'pred_cbr': pred_train_2_cbr,
    'pred_r': pred_train_2_r,
    'pred_br': pred_train_2_br,
}
_level2_test = {
    'pred_lr': pred_test_2_lr,
    'pred_lgb': pred_test_2_lgb,
    'pred_etr': pred_test_2_etr,
    'pred_xgb': pred_test_2_xgb,
    'pred_cbr': pred_test_2_cbr,
    'pred_r': pred_test_2_r,
    'pred_br': pred_test_2_br,
}
train_x_3 = pd.DataFrame(_level2_train)
test_x_3 = pd.DataFrame(_level2_test)

In [None]:
# Convert the level-3 feature tables to NumPy arrays for predict_cv.
train_x_3, test_x_3 = np.array(train_x_3), np.array(test_x_3)

In [None]:
# Level-3 (final) linear regression on the level-2 predictions; prints out-of-fold RMSE.
model_lr = ModelLR()
pred_train_3_lr, pred_test_3_lr = predict_cv(
    model=model_lr, train_x=train_x_3, train_y=train_y, test_x=test_x_3, seed=71,
)
print(mean_squared_error(y_true=train_y, y_pred=pred_train_3_lr, squared=False))

## 提出

In [None]:
# Build the submission from the level-3 predictions. The prediction column is
# named after the target, 'pm25_mid' (it was mistyped as 'pm26_mid'). The file
# is written without header or index, so its contents are unchanged.
submission = pd.DataFrame({'id': test_id, 'pm25_mid': pred_test_3_lr})
submission.to_csv('../output/final_submission.csv', header=False, index=False)

final_submission.csv: 1層目: lgb, lgb_1, lgb_2, xgb, cbr, nn_1, nn_2, nn_3, nn_4, etr, rfr, gbr, rgf 2層目: lgb, xgb, cbr, lr, r, br, etr<br>