In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Names of the categorical feature columns: cat0 .. cat9
category = [f'cat{i}' for i in range(10)]

# Names of the continuous feature columns: cont0 .. cont13
continuous = [f'cont{i}' for i in range(14)]

In [None]:
#features = ['cat1', 'cont0', 'cat2', 'cont11', 'cont13', 'cat8', 'cont8', 'cont1', 'cat9',
#            'cont9', 'cont5', 'cat3', 'cat0', 'cat6', 'cont3', 'cat5', 'cont4', 'cont2', 'cont12']

# Load the raw competition data.
train_dataset = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

# Remove obvious outliers; id 166042 is an obvious outlier.
# The filter is applied to the training rows only, so a test row can never be
# dropped from the submission by accident.
outlier = [166042]
train_kept = train_dataset.loc[~train_dataset['id'].isin(outlier), :]

# Stack train and test so both receive the same encoding and scaling.
# ignore_index=True gives every row a unique label; without it the train and
# test labels overlap and the column-wise concat further down depends on
# pandas treating identical duplicated indexes leniently.
dataset = pd.concat([train_kept, test_data], ignore_index=True)

# Set id and target aside while the features are transformed.
# (Named row_id so the builtin id() is not shadowed for the rest of the notebook.)
row_id = dataset['id']
target = dataset['target']
# They are stored separately now, so they can be dropped from the feature frame.
dataset = dataset.drop(columns=['id', 'target'])
# Use only the highly correlated features.
# I think this approach was effective, but the CatBoost score stopped improving
# beyond a certain point.

# Label encoding is useful when checking feature importance.
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
encoder = LabelEncoder()  # unused while one-hot encoding is active; kept for switching back
scaler = RobustScaler()

# One-hot encode every string-typed column in a single pass (first level dropped).
# Looking at other notebooks, everyone seems to use LabelEncoder instead.
# NOTE: the encoded source columns are replaced by dummy columns, so (assuming
# the cat* columns are read as strings) the names in `category` no longer exist
# in `dataset` after this line.
dataset = pd.get_dummies(dataset, drop_first=True)

dataset[continuous] = scaler.fit_transform(dataset.loc[:, continuous].values)

# Put id and target back onto the transformed features.
dataset = pd.concat([row_id, dataset, target], axis=1)
# Split into train and test by whether a target value is present.
has_target = dataset['target'].notnull()
train = dataset.loc[has_target, :]
test = dataset.loc[~has_target, :]

X = train.drop(columns=['id', 'target'])
y = train['target']
X_prediction = test.drop(columns=['id', 'target'])
prediction_id = test.loc[:, 'id']

# Fixed seed so the hold-out split (and every score computed on it) is
# reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## LightGBM hyper-parameter search with Optuna's LightGBM tuner

#!pip install optuna 
# The tuner and plain LightGBM get separate aliases so `lgb` always means the
# plain library, instead of being rebound half-way through the cell.
import lightgbm as lgb
import optuna.integration.lightgbm as lgb_tuner
from sklearn.metrics import mean_squared_error

trains = lgb.Dataset(X_train, y_train)
tests = lgb.Dataset(X_test, y_test)

params = {'objective': 'mean_squared_error',
          'metric': 'rmse'}

# Step-wise tuning against the hold-out split; the tuned parameters are read
# back from the returned booster.
model = lgb_tuner.train(params, trains, valid_sets=tests, early_stopping_rounds=10)
best_params = model.params

# Refit with plain LightGBM using the tuned parameters and score the hold-out split.
regressor = lgb.train(best_params, trains, valid_sets=tests)
y_pred = regressor.predict(X_test)

# Report RMSE (labelled) so the number is comparable with the tuning metric
# and with the scores printed by the other cells.
print(f'RMSE = {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb

kf = KFold(n_splits=5,random_state=48,shuffle=True)
rmse=[]  # validation RMSE of each fold
preds=0  # running sum of the per-fold predictions for X_prediction

# Note that kf.split yields positional indices, not the data itself.
# The loop variables are deliberately NOT called train/test: those names hold
# the train/test DataFrames built in the preprocessing cell.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), start=1):
    model = lgb.LGBMRegressor(**best_params)
    x_tr = X.iloc[train_idx, :].values
    x_te = X.iloc[valid_idx, :].values
    y_tr = y.iloc[train_idx].values
    y_te = y.iloc[valid_idx].values

    model.fit(x_tr,y_tr,eval_set=[(x_te,y_te)],early_stopping_rounds=100,verbose=False)
    # sqrt of MSE instead of squared=False: same value, works on every
    # scikit-learn version and matches the CatBoost objective's RMSE.
    fold_rmse = np.sqrt(mean_squared_error(y_te, model.predict(x_te)))
    rmse.append(fold_rmse)
    preds+=model.predict(X_prediction)
    print(fold,fold_rmse)

# Average the per-fold predictions for the submission rows.
y_pred = preds/kf.n_splits
import statistics as stat
print(f'RMSE mean = {stat.mean(rmse)}')
print(f'RMSE = {rmse}')

In [None]:
## CatBoost Optuna
from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.metrics import r2_score, mean_squared_error
import optuna

# Wrap the hold-out split in CatBoost Pools, flagging the categorical columns.
# NOTE(review): `category` lists the raw cat0..cat9 names, while the
# preprocessing cell replaces string columns with dummy columns - confirm these
# columns still exist in X_train/X_test before running.
cat_train = Pool(data=X_train, label=y_train, cat_features=category)
cat_test = Pool(data=X_test, label=y_test, cat_features=category)

def objective(trial):
    """Optuna objective: train one CatBoostRegressor and return its hold-out RMSE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the trained model's predictions for `cat_test` against `y_test`.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        # The Optuna name must equal the CatBoost argument name; otherwise
        # study.best_params contains a key that CatBoostRegressor(**best_params)
        # does not accept.
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'eval_metric': trial.suggest_categorical('eval_metric', ['RMSE']),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        # Fixed (not tuned) settings.
        'use_best_model': True,
        'cat_features': category
    }

    model = CatBoostRegressor(**params)
    model.fit(cat_train, eval_set=cat_test)
    y_pred = model.predict(cat_test)

    return np.sqrt(mean_squared_error(y_test, y_pred))

# Search for the parameters with the lowest objective value (hold-out RMSE):
# 20 trials, with n_jobs=-1 for parallel execution.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20, n_jobs=-1)

cat_best = study.best_params
print(cat_best)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import r2_score, mean_squared_error

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that averages the predictions of several models.

    Args:
        models: iterable of unfitted estimators; each one is cloned in `fit`,
            so the originals are left untouched.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y, X_test=None, y_test=None):
        """Fit a clone of every model and report its hold-out RMSE.

        Args:
            X: training features.
            y: training targets.
            X_test: optional hold-out features. Used as `eval_set` for models
                that cannot be fitted without one, and for the RMSE report.
            y_test: optional hold-out targets, paired with `X_test`.

        Returns:
            self, with the fitted clones stored in `models_`.

        Raises:
            Exception: whatever a model's `fit` raises when no hold-out set is
                available for the `eval_set` retry, or when the retry fails too.
        """
        self.models_ = [clone(x) for x in self.models]
        has_holdout = X_test is not None and y_test is not None

        for model in self.models_:
            print(f'************ RUNNING : {model} ************')
            try:
                model.fit(X, y)
            except Exception as plain_fit_error:
                # CatBoost requires an eval_set here, so the plain fit raises;
                # in that case fit again with the hold-out set as eval_set.
                # `Exception` (not a bare except) keeps KeyboardInterrupt and
                # SystemExit working, and the first error is shown, not hidden.
                if not has_holdout:
                    raise
                print(f'{model} plain fit failed ({plain_fit_error!r}); retrying with eval_set')
                model.fit(X, y, eval_set=(X_test, y_test))

            if has_holdout:
                y_pred = model.predict(X_test)
                # sqrt so the printed value really is the RMSE the label promises.
                print(f'{model} RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

        return self

    def predict(self, X):
        """Return the per-row mean of the fitted models' predictions for X."""
        predictions = np.column_stack(
            [model.predict(X) for model in self.models_]
            )
        return np.mean(predictions, axis=1)

    
# Build the averaging ensemble and fit it, passing the hold-out split as well.
# NOTE(review): `cat`, `lasso` and `lgb_tuned` are not defined in any active cell
# visible here (`cat` appears only inside the disabled CatBoost block) - confirm
# they exist before running this cell on a fresh kernel.
averaged_models = AveragingModels(models = (cat, lasso, lgb_tuned))
averaged_models.fit(X_train, y_train, X_test, y_test)

# Try predicting on the hold-out split.
from sklearn.metrics import mean_squared_error
y_pred = averaged_models.predict(X_test)

In [None]:
import tensorflow as tf
from tensorflow import keras

def rmse(y_true, y_pred):
    """Return the square root of TensorFlow's mean squared error of the inputs."""
    mse = tf.losses.mean_squared_error(y_true, y_pred)
    return tf.sqrt(mse)

### ***********************************************************###
# Network layout, in order:
#   4 x [Dense(relu, L2 0.001) -> LeakyReLU -> BatchNormalization -> Dropout]
#       with 64, 64, 32, 32 units and dropout 0.4, 0.4, 0.3, 0.3
#   Dense(16, relu, L2 0.001) -> LeakyReLU
#   Dense(1, linear)
# The commented-out wider variants (1024/512/256/128 units) are omitted here.
# NOTE(review): each Dense already applies ReLU, so the LeakyReLU that follows
# only ever receives non-negative inputs - confirm this double activation is intended.
def _l2_relu_dense(units, **extra):
    """Dense layer with ReLU activation and an L2(0.001) kernel regularizer."""
    return tf.keras.layers.Dense(units, kernel_regularizer=keras.regularizers.l2(0.001),
                                 activation='relu', **extra)

model = tf.keras.Sequential()

for _stage, (_units, _drop_rate) in enumerate([(64, 0.4), (64, 0.4), (32, 0.3), (32, 0.3)]):
    # Only the very first layer declares the input width.
    _extra = {'input_shape': (X.shape[1],)} if _stage == 0 else {}
    model.add(_l2_relu_dense(_units, **_extra))
    model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(_drop_rate))

model.add(_l2_relu_dense(16))
model.add(tf.keras.layers.LeakyReLU())

model.add(tf.keras.layers.Dense(units=1, activation='linear'))
### ***********************************************************###

# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005, decay=5e-4)
model.compile(optimizer = optimizer, loss = 'mae', metrics = ['mse', 'mae'])

# Keep only the weights of the epoch with the lowest validation loss on disk.
#checkpoint_name = 'Model/{epoch:03d}-{val_loss:.5f}.hdf5'
checkpoint_name = 'DNN_BestModel.hdf5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callback_list = [checkpoint]

# Validate on the explicit hold-out split only. Passing validation_split
# together with validation_data is contradictory (only one of them can be
# used), so the split fraction is dropped.
history = model.fit(X_train, y_train, epochs = 500, batch_size = 1024,
                    validation_data=(X_test, y_test), callbacks=callback_list)

# NOTE: this scores the weights from the final epoch; the best checkpoint
# saved above is not reloaded.
y_pred = model.predict(X_test).reshape(-1)
RMSE = rmse(y_test, y_pred)
print(f'RMSE = {RMSE}')

# Plot training and validation loss per epoch on one axes and save the figure.
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111)
ax.set_ylim(0.6, 0.75)

loss_history = history.history
ax.plot(loss_history['loss'])
ax.plot(loss_history['val_loss'])
ax.legend(['Train', 'Test'], loc='upper right')
fig.savefig(fname='1024 neurons enable LearningRate.png')

In [None]:
# CatBoost experiments (disabled): everything below is one bare string literal,
# so none of it executes.
# NOTE(review): the text refers to `feature_cat`, which is not defined in the
# visible cells, and it prints mean_squared_error under an "RMSE" label -
# check both before re-enabling it.
'''
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import StratifiedKFold, KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

from sklearn.model_selection import GridSearchCV
param_cat = {'depth':[6,8,10],
            'learning_rate':[0.005, 0.001],
            'l2_leaf_reg':[1,4,9],
            'iterations':[100],
            'cat_features':[feature_cat],
            'eval_metric':['RMSE']
            }

grid_result = GridSearchCV(estimator=CatBoostRegressor(),param_grid=param_cat, cv=kfold, scoring='neg_mean_squared_error', n_jobs = -1, verbose=2)
grid_result.fit(X_train, y_train)
grid_param = grid_result.best_params_
print(grid_param)


cat = CatBoostRegressor(task_type='GPU', iterations=8000, use_best_model=True, depth=10, eval_metric='RMSE', l2_leaf_reg=1, learning_rate=0.001, early_stopping_rounds=10)
cat.fit(X_train, y_train, cat_features=feature_cat, eval_set = (X_test, y_test))
y_pred = cat.predict(X_test)
print(f'RMSE: {mean_squared_error(y_test, y_pred)}')


'''

In [None]:
# Write the submission file.
# `y_pred` is reassigned by several cells and, after a top-to-bottom run, holds
# predictions for the hold-out split X_test rather than for the submission rows.
# The fold-averaged LightGBM predictions (`preds` summed over `kf.n_splits`
# folds) are the only predictions computed for X_prediction, so they are used
# explicitly here instead of whatever `y_pred` happens to contain.
#y_pred=regressor.predict(X_prediction)
submission_pred = np.asarray(preds) / kf.n_splits

# Fail with a clear message rather than writing a misaligned file.
if submission_pred.shape != (len(prediction_id),):
    raise ValueError(
        f'expected {len(prediction_id)} predictions (one per submission id), '
        f'got an array of shape {submission_pred.shape}'
    )

output = pd.DataFrame({'id': prediction_id, 'target': submission_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")