In [2]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [3]:
%cd /content/gdrive/My Drive/kaggle/session3episode9/

/content/gdrive/My Drive/kaggle/session3episode9


In [4]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [5]:
!pip install catboost
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloadi

In [6]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [8]:
from sklearn.cluster import KMeans

In [9]:
km = KMeans(n_clusters = 8, random_state = 42)

In [10]:
train['km_cls'] = km.fit_predict(train.iloc[:, 1:-1])
test['km_cls'] = km.predict(test.iloc[:, 1:])

In [11]:
train['feature_mean'] = train.iloc[:, 1:-2].sum(axis = 1)
test['feature_mean'] = test.iloc[:, 1:-1].sum(axis = 1)

In [12]:
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [13]:
X = train.drop(['id', 'Strength'], axis = 1)
y = train['Strength']
target = test[X.columns]

In [14]:
def RMSE(y_actual, y_pred):
    score = mean_squared_error(y_actual, y_pred, squared = False)
    return score

In [15]:
def LGBM_objective(trial : Trial, X, y) :
    
    param = {
        "objective": trial.suggest_categorical("objective", ['regression']),
        "metric": trial.suggest_categorical("metric", ['rmse']), # 'rmse'
        "verbose": trial.suggest_categorical("verbose", [0]),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_estimators": trial.suggest_int("n_estimators", 5000, 15000),
        "learning_rate": trial.suggest_discrete_uniform('learning_rate',0.005, 0.05, 0.001),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 256),
    }
    
    lgbm_rmse = 0
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        lgbm = LGBMRegressor(**param)
        lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1500, verbose = 0)

        val_pred = lgbm.predict(val_x)
        fold_rmse = mean_squared_error(val_y, val_pred) ** 0.5
        lgbm_rmse += fold_rmse / kf.n_splits
    return lgbm_rmse

In [16]:
study = optuna.create_study(direction = 'minimize', sampler = TPESampler())

[32m[I 2023-03-13 11:54:43,301][0m A new study created in memory with name: no-name-1076d388-8cb9-4db0-852f-37b20d450c37[0m


In [17]:
study.optimize(lambda trial : LGBM_objective(trial, X, y), n_trials = 30)

[32m[I 2023-03-13 11:55:49,465][0m Trial 0 finished with value: 12.310675684964748 and parameters: {'objective': 'regression', 'metric': 'rmse', 'verbose': 0, 'random_state': 42, 'n_estimators': 9775, 'learning_rate': 0.026000000000000002, 'max_depth': 8, 'min_child_samples': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'num_leaves': 136}. Best is trial 0 with value: 12.310675684964748.[0m
[32m[I 2023-03-13 11:56:15,481][0m Trial 1 finished with value: 12.106671479578688 and parameters: {'objective': 'regression', 'metric': 'rmse', 'verbose': 0, 'random_state': 42, 'n_estimators': 6209, 'learning_rate': 0.024, 'max_depth': 4, 'min_child_samples': 36, 'subsample': 0.6, 'colsample_bytree': 1.0, 'num_leaves': 248}. Best is trial 1 with value: 12.106671479578688.[0m
[32m[I 2023-03-13 11:56:52,015][0m Trial 2 finished with value: 12.148022696267653 and parameters: {'objective': 'regression', 'metric': 'rmse', 'verbose': 0, 'random_state': 42, 'n_estimators': 9712, 'learning_rate': 

In [18]:
lgbm_params = study.best_params

In [19]:
lgbm_params

{'objective': 'regression',
 'metric': 'rmse',
 'verbose': 0,
 'random_state': 42,
 'n_estimators': 12476,
 'learning_rate': 0.044,
 'max_depth': 4,
 'min_child_samples': 86,
 'subsample': 0.6,
 'colsample_bytree': 0.6,
 'num_leaves': 104}

In [20]:
def XGB_objective(trial : Trial, X, y) :
    
    param = {
        "objective": trial.suggest_categorical("objective", ['reg:squarederror']),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_estimators": trial.suggest_int("n_estimators", 5000, 15000),
        "learning_rate": trial.suggest_discrete_uniform('learning_rate',0.005, 0.05, 0.001),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'lambda' : trial.suggest_loguniform('lambda', 1e-3, 5.),
        'alpha' : trial.suggest_loguniform('alpha', 1e-3, 5)
    }
    
    XGB_rmse = 0
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBRegressor(**param)
        model.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1500, verbose = 0)

        val_pred = model.predict(val_x)
        fold_rmse = mean_squared_error(val_y, val_pred) ** 0.5
        XGB_rmse += fold_rmse / kf.n_splits
    return XGB_rmse

In [21]:
study = optuna.create_study(direction = 'minimize', sampler = TPESampler())

[32m[I 2023-03-13 12:08:26,762][0m A new study created in memory with name: no-name-49ae5396-13a9-4a7c-afc6-5b0e7f295612[0m


In [22]:
study.optimize(lambda trial : XGB_objective(trial, X, y), n_trials = 20)

[32m[I 2023-03-13 12:09:46,773][0m Trial 0 finished with value: 12.118257333224019 and parameters: {'objective': 'reg:squarederror', 'random_state': 42, 'n_estimators': 6685, 'learning_rate': 0.011, 'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 0.7, 'lambda': 0.15139542532579275, 'alpha': 1.071093684134274}. Best is trial 0 with value: 12.118257333224019.[0m
[32m[I 2023-03-13 12:11:44,043][0m Trial 1 finished with value: 12.33208556368551 and parameters: {'objective': 'reg:squarederror', 'random_state': 42, 'n_estimators': 9520, 'learning_rate': 0.041999999999999996, 'max_depth': 8, 'subsample': 0.5, 'colsample_bytree': 0.6, 'lambda': 0.40110913238254914, 'alpha': 0.03311319824431492}. Best is trial 0 with value: 12.118257333224019.[0m
[32m[I 2023-03-13 12:13:04,317][0m Trial 2 finished with value: 12.062863787354804 and parameters: {'objective': 'reg:squarederror', 'random_state': 42, 'n_estimators': 10783, 'learning_rate': 0.027, 'max_depth': 4, 'subsample': 0.5, 'col

In [23]:
xgb_params = study.best_params

In [24]:
xgb_params

{'objective': 'reg:squarederror',
 'random_state': 42,
 'n_estimators': 8631,
 'learning_rate': 0.030000000000000002,
 'max_depth': 3,
 'subsample': 0.5,
 'colsample_bytree': 1.0,
 'lambda': 2.1859900049479357,
 'alpha': 0.10998437511715273}

In [25]:
def HPO_objective(trial : Trial, X, y) :
    param = {
        "eval_metric": trial.suggest_categorical("eval_metric", ['RMSE']),
        'iterations':trial.suggest_int("iterations", 4000, 15000), 
        'od_wait':trial.suggest_int('od_wait', 500, 2300), 
        'learning_rate' : trial.suggest_uniform('learning_rate', 0.005, 0.05), 
        'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-5, 100), 
        'subsample': trial.suggest_uniform('subsample', 0.5, 1), 
        'random_strength': trial.suggest_uniform('random_strength', 10, 50), 
        'depth': trial.suggest_int('depth', 3, 10), 
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30), 
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15), 
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), 
        'colsample_bylevel':trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "use_best_model": trial.suggest_categorical("use_best_model", [True])
    }
    
    model_rmse = 0
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**param)
        model.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1500, verbose = 0)

        val_pred = model.predict(val_x)
        fold_rmse = mean_squared_error(val_y, val_pred) ** 0.5
        model_rmse += fold_rmse / kf.n_splits
    return model_rmse

In [26]:
study = optuna.create_study(direction = 'minimize', sampler = TPESampler())

[32m[I 2023-03-13 12:46:35,875][0m A new study created in memory with name: no-name-dc2c6b3a-72b0-40d1-b5e4-b51776aa37d1[0m


In [27]:
study.optimize(lambda trial : HPO_objective(trial, X, y), n_trials = 20)

[32m[I 2023-03-13 12:48:40,465][0m Trial 0 finished with value: 12.116192896672505 and parameters: {'eval_metric': 'RMSE', 'iterations': 4425, 'od_wait': 872, 'learning_rate': 0.03643834001654684, 'reg_lambda': 59.892709500499286, 'subsample': 0.6736922978396944, 'random_strength': 18.931689229746556, 'depth': 8, 'min_data_in_leaf': 21, 'leaf_estimation_iterations': 6, 'bagging_temperature': 1.7813426068014704, 'colsample_bylevel': 0.4340320345631101, 'random_state': 42, 'use_best_model': True}. Best is trial 0 with value: 12.116192896672505.[0m
[32m[I 2023-03-13 12:52:10,300][0m Trial 1 finished with value: 12.081676180201528 and parameters: {'eval_metric': 'RMSE', 'iterations': 8278, 'od_wait': 1544, 'learning_rate': 0.022494137713091278, 'reg_lambda': 82.70905834942405, 'subsample': 0.6542155829378076, 'random_strength': 31.3798497056591, 'depth': 8, 'min_data_in_leaf': 6, 'leaf_estimation_iterations': 2, 'bagging_temperature': 2.633047873815998, 'colsample_bylevel': 0.90778213

In [28]:
cb_params = study.best_params

In [30]:
cb_params

{'eval_metric': 'RMSE',
 'iterations': 11345,
 'od_wait': 2254,
 'learning_rate': 0.044151759481315334,
 'reg_lambda': 99.66350736727402,
 'subsample': 0.5045597205367082,
 'random_strength': 10.328390798637777,
 'depth': 4,
 'min_data_in_leaf': 1,
 'leaf_estimation_iterations': 11,
 'bagging_temperature': 65.76530012019344,
 'colsample_bylevel': 0.9979538381243023,
 'random_state': 42,
 'use_best_model': True}

In [31]:
rmse_ens = []

pred_1 = np.zeros(target.shape[0])
pred_2 = np.zeros(target.shape[0])
pred_3 = np.zeros(target.shape[0])

for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    # XGBoost
    model_xgb = XGBRegressor(**xgb_params)
    model_xgb.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False)
    val_pred_xgb = model_xgb.predict(val_x)
    pred_1 += model_xgb.predict(target) / kf.n_splits
    
    # CatBoost
    model_cat = CatBoostRegressor(**cb_params)
    model_cat.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False) 
    val_pred_cat = model_cat.predict(val_x)
    pred_2 += model_cat.predict(target) / kf.n_splits

    # LGBM
    model_lgbm = LGBMRegressor(**lgbm_params)
    model_lgbm.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)
    val_pred_lgbm = model_lgbm.predict(val_x)
    pred_3 += model_lgbm.predict(target) / kf.n_splits

    ens_pred = val_pred_xgb * 0.01 + val_pred_lgbm * 0.19 + val_pred_cat * 0.8
    rmse_ens.append(mean_squared_error(val_y, ens_pred, squared = False))
 
    print(f"fold : {i + 1} RMSE of Ensemble : {rmse_ens[i]}")

fold : 1 RMSE of Ensemble : 11.743342281095954
fold : 2 RMSE of Ensemble : 11.69605714694925
fold : 3 RMSE of Ensemble : 12.31560606057687
fold : 4 RMSE of Ensemble : 11.62028339584501
fold : 5 RMSE of Ensemble : 12.64158896338211
fold : 6 RMSE of Ensemble : 11.679223124183142
fold : 7 RMSE of Ensemble : 12.057749756536067
fold : 8 RMSE of Ensemble : 12.022288698483699
fold : 9 RMSE of Ensemble : 12.599534932059605
fold : 10 RMSE of Ensemble : 12.10835463472983


In [32]:
print(f'\n{np.mean(rmse_ens)}')


12.048402899384154


best = 12.043611527687109

In [33]:
submission = pd.read_csv('sample_submission.csv')

In [34]:
submission['Strength'] = pred_2 * 0.8 + pred_1 * 0.01 + pred_3 * 0.19

In [35]:
submission.to_csv("ensemble.csv", index = False)

***

In [None]:
pred_xgb=[]  
pred_cat=[]
pred_lgbm = []

for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    # XGBoost
    model_xgb = XGBRegressor(random_state = 42, max_depth = 4, learning_rate = 0.02, n_estimators = 10000, objective = 'reg:squarederror')
    model_xgb.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False)
    val_pred_xgb = model_xgb.predict(val_x)

    # CatBoost
    model_cat = CatBoostRegressor(random_state = 42, max_depth = 4, learning_rate = 0.02, n_estimators = 10000, use_best_model = True, eval_metric = 'RMSE')
    model_cat.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False) 
    val_pred_cat = model_cat.predict(val_x)
    
    # LGBM
    model_lgbm = LGBMRegressor(random_state = 42, max_depth = 5, learning_rate = 0.02, n_estimators = 10000, objective = 'rmse')
    model_lgbm.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)
    val_pred_lgbm = model_lgbm.predict(val_x)
    
    pred_xgb.append(val_pred_xgb)
    pred_cat.append(val_pred_cat)
    pred_lgbm.append(val_pred_lgbm)

In [None]:
ens1_rmse = []
ens2_rmse = []
ens3_rmse = []
ens4_rmse = []

for i in range(10) :

  ens1 = pred_xgb[i] * 0.15 + pred_lgbm[i] * 0.05 + pred_cat[i] * 0.8
  ens2 = pred_xgb[i] * 0.1 + pred_lgbm[i] * 0.1 + pred_cat[i] * 0.8
  ens3 = pred_xgb[i] * 0.05 + pred_lgbm[i] * 0.15 + pred_cat[i] * 0.8
  ens4 = pred_xgb[i] * 0.01 + pred_lgbm[i] * 0.19 + pred_cat[i] * 0.8

  rmse1 = mean_squared_error(val_ys[i], ens1, squared = False)
  rmse2 = mean_squared_error(val_ys[i], ens2, squared = False)
  rmse3 = mean_squared_error(val_ys[i], ens3, squared = False)
  rmse4 = mean_squared_error(val_ys[i], ens4, squared = False)

  ens1_rmse.append(rmse1)
  ens2_rmse.append(rmse2)
  ens3_rmse.append(rmse3)
  ens4_rmse.append(rmse4)
print(np.mean(ens1_rmse), np.mean(ens2_rmse), np.mean(ens3_rmse), np.mean(ens4_rmse))

12.046895611857169 12.045387372625246 12.044251724220057 12.043611527687109
