In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install optuna 

# category variable
category = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

# continuous variable
continuous = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 
              'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [11]:
train_dataset = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')
dataset = pd.concat([train_dataset, test_data])
#print(dataset.info())

# あきらかな外れ値は削除
# [166042]はあきらかな外れ値
outlier=[166042]
for x in outlier:
    dataset = dataset.loc[dataset['id'] != x, :] 

# idとtargetは避難させておく
id = dataset['id']
target = dataset['target']
# 相関が高いものだけを使用。この手法は有効だったと思うが、Catboostでのスコアは一定以上伸びなかった。
   
## ***** Encoding ***** ##
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
scaler = RobustScaler()
encoder = LabelEncoder()

cat_index = []
for col in dataset.columns:
#    # カテゴリ変数はエンコーディングを適用
    if dataset[col].dtype == object:
        cat_index.append(dataset.columns.get_loc(col))
        #dataset[col] = encoder.fit_transform(dataset[col])

#dataset[continuous] = scaler.fit_transform(dataset.loc[:,continuous].values)

# targetのあるなしでtrainとtestを分割
train = dataset.loc[dataset['target'].notnull(), :]
test  = dataset.loc[dataset['target'].isnull(), :]

X = train.drop(columns=['id', 'target'])
y = train['target']
X_prediction = test.drop(columns=['id', 'target'])
prediction_id = test.loc[:,'id']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import statistics as stat
import lightgbm as lgb

def kfold_processing(models, X, y):
    kf = KFold(n_splits=5,random_state=48,shuffle=True)
    rmse=[]  # list contains rmse for each fold
    n=0
    preds=0
    
    for model in models:
        # kf.splitはindexを返すことに注意。データ自体じゃないよ!
        for train, test in kf.split(X, y):
            x_tr = X.iloc[train, :]
            x_te = X.iloc[test,  :]
            y_tr = y.iloc[train]
            y_te = y.iloc[test]
            model.fit(x_tr,y_tr,eval_set=[(x_te,y_te)],early_stopping_rounds=100,verbose=False)
            rmse.append(mean_squared_error(y_te, model.predict(x_te), squared=False))
            preds+=model.predict(X_prediction)
            print(n+1,rmse[n])
            n+=1

        print(f'RMSE mean = {stat.mean(rmse)}')
        print(f'RMSE = {rmse}')
    return preds

In [None]:
# sklearn.ensemble.StackingRegressor — scikit-learn 0.24.1 documentation 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import StackingRegressor
best_lgb_params = {'objective': 'regression', 'num_leaves': 87, 'max_depth': 9, 'learning_rate': 0.009582240516938432, 'n_estimators': 2553, 
                   'reg_alpha': 0.037580598353005736, 'reg_lambda': 0.02686007922451687, 'colsample_bytree': 0.5534542046480458}

best_xgb_params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'tree_method': 'gpu_hist','eta': 0.6967322281466614, 
                   'gamma': 0.0036942646535044962, 'max_depth': 6, 'sub_sample': 0.553569633282626, 
                   'colsample_bytree': 0.554663669927102, 'lambda': 0.001701369247469206, 'alpha': 0.009630419498745284, 
                   'learning_rate': 0.03446986865236482, 'n_estimators': 603}


        

lgb_model = lgb.LGBMRegressor(**best_lgb_params)
xgb_model = xgb.XGBRegressor(**best_xgb_params)
kf = KFold(n_splits=5,random_state=48,shuffle=True)

models = [lgb_model, xgb_model]

y_pred = kfold_processing(models=models, X=X, y=y)

'''
estimators = [
    ('lgb', lgb_model),
    ('xgb_model', xgb_model)
]

streg = StackingRegressor(estimators=estimators, cv=kf)
streg.fit(X,y)
'''

### Optuna Tuning
[lightgbm.LGBMRegressor — LightGBM 3.1.1.99 documentation](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html)  
[XGBoostパラメータのまとめとランダムサーチ実装 - Qiita](https://qiita.com/FJyusk56/items/0649f4362587261bd57a)  
[XGBoost Parameters — xgboost 1.4.0-SNAPSHOT documentation](https://xgboost.readthedocs.io/en/latest/parameter.html)  
[CatBoostRegressor - CatBoost. Documentation](https://catboost.ai/docs/concepts/python-reference_catboostregressor.html)

In [None]:
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna

trains = lgb.Dataset(X_train, y_train)
tests = lgb.Dataset(X_test, y_test)

def objective(trial):
    params = {
        'objective': 'regression',
        'num_leaves': trial.suggest_int('num_leaves', 5, 200),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 0.1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 0.1),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1),
        'random_state': 42
    }  
    
    
    lgb_model = lgb.LGBMRegressor(**params)
    lgb_model.fit(X=X_train, y=y_train, eval_set = [(X_test, y_test)], eval_metric='rmse',  early_stopping_rounds=10)
    y_pred = lgb_model.predict(X_test)
    
    return np.sqrt(mean_squared_error(y_test, y_pred))

study = optuna.create_study()
study.optimize(objective, n_trials=50, n_jobs=-1)
lgb_best = study.best_params
print(lgb_best)

In [None]:
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna

# XGBoost Parameters — xgboost 1.4.0-SNAPSHOT documentation https://xgboost.readthedocs.io/en/latest/parameter.html

def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist',
        'eta': trial.suggest_loguniform('eta', 0.1, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 0.001, 5.),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'sub_sample': trial.suggest_loguniform('sub_sample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 0.001, 0.01),
        'alpha': trial.suggest_loguniform('alpha', 0.001, 0.01),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000)
    }  
    
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train, y_train, eval_set = [(X_test, y_test)],  early_stopping_rounds=10)
    y_pred = xgb_model.predict(X_test)
    
    return np.sqrt(mean_squared_error(y_test, y_pred))

study = optuna.create_study()
study.optimize(objective, n_trials=50, n_jobs=-1)
xgb_best = study.best_params
print(xgb_best)

In [15]:
## CatBoost Optuna
from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.metrics import r2_score, mean_squared_error
import optuna

cat_train = Pool(X_train, y_train, cat_features=category)
cat_test = Pool(X_test, y_test, cat_features=category)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf', 1,10),
        'eval_metric': 'RMSE',
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'use_best_model': True
    }  
    
    model = CatBoostRegressor(**params)
    model.fit(cat_train, eval_set=cat_test, early_stopping_rounds=10, verbose=0)
    y_pred = model.predict(cat_test)
    
    return np.sqrt(mean_squared_error(y_test, y_pred))

study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=-1)
cat_best = study.best_params
print(cat_best)
print(f'Best Value: {study.best_value}')

[32m[I 2021-02-08 23:56:08,039][0m A new study created in memory with name: no-name-0c8cdbe1-4b65-4dfd-b410-de882789d2ee[0m
[32m[I 2021-02-09 00:00:43,424][0m Trial 3 finished with value: 0.8721303443712503 and parameters: {'iterations': 467, 'depth': 5, 'learning_rate': 0.013103925677769053, 'l2_leaf': 3, 'random_strength': 93, 'bagging_temperature': 14.735305943508498, 'od_type': 'IncToDec', 'od_wait': 31}. Best is trial 3 with value: 0.8721303443712503.[0m
[32m[I 2021-02-09 00:23:23,170][0m Trial 2 finished with value: 0.8426832060325465 and parameters: {'iterations': 2585, 'depth': 7, 'learning_rate': 0.07148267295307521, 'l2_leaf': 3, 'random_strength': 80, 'bagging_temperature': 19.696303321327303, 'od_type': 'IncToDec', 'od_wait': 40}. Best is trial 2 with value: 0.8426832060325465.[0m
[32m[I 2021-02-09 00:29:04,143][0m Trial 4 finished with value: 0.8811541008757278 and parameters: {'iterations': 2750, 'depth': 5, 'learning_rate': 0.0002781743242777458, 'l2_leaf': 5,

KeyboardInterrupt: 

In [None]:
#y_pred=regressor.predict(X_prediction)
print(y_pred/10)
output = pd.DataFrame({'id': prediction_id, 'target': y_pred/10})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
## Lightgbm Optuna

import optuna.integration.lightgbm as lgb

trains = lgb.Dataset(X_train, y_train)
tests = lgb.Dataset(X_test, y_test)

params = {'objective': 'mean_squared_error',
         'metric': 'rmse'}

lgb_model = lgb.train(params, trains, valid_sets=tests, early_stopping_rounds=10)
best_params = lgb_model.params


import lightgbm as lgb

lgb_model = lgb.train(best_params, trains, valid_sets=tests)
y_pred = lgb_model.predict(X_test)

from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred))

In [None]:
import tensorflow as tf
from tensorflow import keras

def rmse(y_true, y_pred):
    return tf.sqrt(tf.losses.mean_squared_error(y_true, y_pred))

### ***********************************************************###
model = tf.keras.Sequential()

#model.add(tf.keras.layers.Dense(1024, activation='relu', input_shape=(X.shape[1],)))
#model.add(tf.keras.layers.LeakyReLU())
#model.add(tf.keras.layers.BatchNormalization())
#model.add(tf.keras.layers.Dropout(0.5))

#model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(64, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu',input_shape=(X.shape[1],)))
model.add(tf.keras.layers.LeakyReLU())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.4))

#model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(64, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu'))
model.add(tf.keras.layers.LeakyReLU())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.4))

#model.add(tf.keras.layers.Dense(units=256, activation='relu'))
model.add(tf.keras.layers.Dense(32, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu'))
model.add(tf.keras.layers.LeakyReLU())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.3))

#model.add(tf.keras.layers.Dense(units=256, activation='relu'))
model.add(tf.keras.layers.Dense(32, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu'))
model.add(tf.keras.layers.LeakyReLU())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.3))

#model.add(tf.keras.layers.Dense(units=128, activation='relu'))
model.add(tf.keras.layers.Dense(16, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu'))
model.add(tf.keras.layers.LeakyReLU())

model.add(tf.keras.layers.Dense(units=1, activation='linear'))
### ***********************************************************###

optimizer = tf.keras.optimizers.Adam(lr=0.005, decay=5e-4)
model.compile(optimizer = optimizer, loss = 'mae', metrics = ['mse', 'mae'])

#checkpoint_name = 'Model/{epoch:03d}-{val_loss:.5f}.hdf5'
checkpoint_name = 'DNN_BestModel.hdf5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callback_list = [checkpoint]

history = model.fit(X_train, y_train, validation_split=0.2, epochs = 500, batch_size = 1024,
                    validation_data=(X_test, y_test), callbacks=callback_list)
y_pred = model.predict(X_test).reshape(-1)
RMSE = rmse(y_test, y_pred)
print(f'RMSE = {RMSE}')

fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111)
ax.set_ylim(0.6, 0.75)

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['Train', 'Test'], loc='upper right')
plt.savefig(fname='1024 neurons enable LearningRate.png')

In [None]:
# CatBoosting
'''
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import StratifiedKFold, KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

from sklearn.model_selection import GridSearchCV
param_cat = {'depth':[6,8,10],
            'learning_rate':[0.005, 0.001],
            'l2_leaf_reg':[1,4,9],
            'iterations':[100],
            'cat_features':[feature_cat],
            'eval_metric':['RMSE']
            }

grid_result = GridSearchCV(estimator=CatBoostRegressor(),param_grid=param_cat, cv=kfold, scoring='neg_mean_squared_error', n_jobs = -1, verbose=2)
grid_result.fit(X_train, y_train)
grid_param = grid_result.best_params_
print(grid_param)


cat = CatBoostRegressor(task_type='GPU', iterations=8000, use_best_model=True, depth=10, eval_metric='RMSE', l2_leaf_reg=1, learning_rate=0.001, early_stopping_rounds=10)
cat.fit(X_train, y_train, cat_features=feature_cat, eval_set = (X_test, y_test))
y_pred = cat.predict(X_test)
print(f'RMSE: {mean_squared_error(y_test, y_pred)}')


'''