In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [5]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [75]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [76]:
from sklearn.cluster import KMeans

In [77]:
# K-means model (8 clusters, fixed seed); its cluster labels become the 'km_cls' feature.
km = KMeans(
    n_clusters=8,
    random_state=42,
)

In [78]:
# Add the k-means cluster label as an extra feature on both frames.
#
# The slices below are positional, and this notebook appends 'km_cls' and
# 'feature_mean' to the same frames. Without the drop, re-running this cell
# would shift the positions and feed the target column and the derived
# columns into the clustering input. Dropping them first keeps the cell
# idempotent; on a first run the drop is a no-op.
_derived_cols = ['km_cls', 'feature_mean']
_train_base = train.drop(columns = _derived_cols, errors = 'ignore')
_test_base = test.drop(columns = _derived_cols, errors = 'ignore')

# train: skip the first and the last column; test: skip the first column.
# Presumably these are 'id' and 'Strength' (the columns dropped when X is built) - TODO confirm.
train['km_cls'] = km.fit_predict(_train_base.iloc[:, 1:-1])
test['km_cls'] = km.predict(_test_base.iloc[:, 1:])

In [79]:
# Row-wise aggregate of the original feature columns.
# NOTE: despite the column name, this is a row-wise SUM (.sum(axis = 1)), not a mean.
# The name is kept because later cells select columns by name.
#
# 'feature_mean' is dropped before slicing so that re-running this cell does
# not shift the positional window (on a re-run the unguarded train slice
# 1:-2 would otherwise include the column that precedes 'km_cls').
_train_base = train.drop(columns = ['feature_mean'], errors = 'ignore')
_test_base = test.drop(columns = ['feature_mean'], errors = 'ignore')

# train: skip the first column and the last two (the last one being 'km_cls');
# test: skip the first column and the last one ('km_cls').
train['feature_mean'] = _train_base.iloc[:, 1:-2].sum(axis = 1)
test['feature_mean'] = _test_base.iloc[:, 1:-1].sum(axis = 1)

In [80]:
# Shuffled 10-fold splitter with a fixed seed; reused by every training loop in this notebook.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [81]:
# Model inputs: every training column except the row id and the target.
X = train.drop(columns=['id', 'Strength'])

# Regression target.
y = train['Strength']

# Test features, restricted to (and ordered like) the training feature columns.
target = test[X.columns]

In [82]:
# Cross-validated blend of XGBoost, CatBoost and LightGBM.
#
# For each fold: fit the three regressors with early stopping monitored on
# the held-out fold, add each model's test prediction (divided by the number
# of folds) to its accumulator, and record the RMSE of the fixed-weight blend
# on the held-out fold.
#
# NOTE(review): the held-out fold is used both for early stopping and for
# the reported RMSE, so the printed scores are likely optimistic.
rmse_ens = []

# Fold-averaged test predictions, one accumulator per model.
n_test_rows = target.shape[0]
pred_1 = np.zeros(n_test_rows)  # XGBoost
pred_2 = np.zeros(n_test_rows)  # CatBoost
pred_3 = np.zeros(n_test_rows)  # LightGBM

for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    # Settings common to all three boosters.
    shared_params = dict(random_state=42, learning_rate=0.02, n_estimators=10000)
    stop_params = dict(eval_set=[(val_x, val_y)], early_stopping_rounds=1000)

    # XGBoost
    model_xgb = XGBRegressor(max_depth=4, objective='reg:squarederror', **shared_params)
    model_xgb.fit(tr_x, tr_y, verbose=False, **stop_params)
    val_pred_xgb = model_xgb.predict(val_x)
    pred_1 += model_xgb.predict(target) / kf.n_splits

    # CatBoost
    model_cat = CatBoostRegressor(max_depth=4, use_best_model=True, eval_metric='RMSE', **shared_params)
    model_cat.fit(tr_x, tr_y, verbose=False, **stop_params)
    val_pred_cat = model_cat.predict(val_x)
    pred_2 += model_cat.predict(target) / kf.n_splits

    # LGBM
    model_lgbm = LGBMRegressor(max_depth=5, objective='rmse', **shared_params)
    model_lgbm.fit(tr_x, tr_y, verbose=0, **stop_params)
    val_pred_lgbm = model_lgbm.predict(val_x)
    pred_3 += model_lgbm.predict(target) / kf.n_splits

    # Fixed blend weights: 0.01 XGBoost, 0.19 LightGBM, 0.8 CatBoost.
    ens_pred = 0.01 * val_pred_xgb + 0.19 * val_pred_lgbm + 0.8 * val_pred_cat
    rmse_ens.append(mean_squared_error(val_y, ens_pred, squared=False))

    print(f"fold : {i + 1} RMSE of Ensemble : {rmse_ens[i]}")

fold : 1 RMSE of Ensemble : 11.665837062917277
fold : 2 RMSE of Ensemble : 11.662083426362718
fold : 3 RMSE of Ensemble : 12.332666927656792
fold : 4 RMSE of Ensemble : 11.612519173486321
fold : 5 RMSE of Ensemble : 12.657017932589646
fold : 6 RMSE of Ensemble : 11.689934959475787
fold : 7 RMSE of Ensemble : 12.102846280571494
fold : 8 RMSE of Ensemble : 11.987116203220538
fold : 9 RMSE of Ensemble : 12.63423025761019
fold : 10 RMSE of Ensemble : 12.091863052980317


In [83]:
# Print the mean of the per-fold ensemble RMSE values in rmse_ens, preceded by a blank line.
print(f'\n{np.mean(rmse_ens)}')


12.043611527687109


best = 12.043611527687109

In [89]:
# Read the submission template; its 'Strength' column is overwritten with the blended prediction.
submission = pd.read_csv('sample_submission.csv')

In [90]:
# Blend the fold-averaged test predictions with the same weights used in the CV loop:
# pred_2 (CatBoost) 0.8, pred_1 (XGBoost) 0.01, pred_3 (LightGBM) 0.19.
submission['Strength'] = 0.8 * pred_2 + 0.01 * pred_1 + 0.19 * pred_3

In [91]:
# Write the submission to 'ensemble.csv' in the working directory, without the index column.
submission.to_csv("ensemble.csv", index = False)

***

In [71]:
# Collect per-fold validation predictions of each model so that different
# blend weights can be compared afterwards without refitting.
# Each list gets one entry per fold, in kf.split order.
pred_xgb = []
pred_cat = []
pred_lgbm = []
# Validation targets per fold. The weight-comparison cell indexes val_ys[i];
# it was previously never defined, so that cell raised NameError on a fresh kernel.
val_ys = []

for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # XGBoost
    model_xgb = XGBRegressor(random_state = 42, max_depth = 4, learning_rate = 0.02, n_estimators = 10000, objective = 'reg:squarederror')
    model_xgb.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False)
    val_pred_xgb = model_xgb.predict(val_x)

    # CatBoost
    model_cat = CatBoostRegressor(random_state = 42, max_depth = 4, learning_rate = 0.02, n_estimators = 10000, use_best_model = True, eval_metric = 'RMSE')
    model_cat.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = False)
    val_pred_cat = model_cat.predict(val_x)

    # LGBM
    model_lgbm = LGBMRegressor(random_state = 42, max_depth = 5, learning_rate = 0.02, n_estimators = 10000, objective = 'rmse')
    model_lgbm.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)
    val_pred_lgbm = model_lgbm.predict(val_x)

    # Keep predictions and ground truth aligned by fold index.
    pred_xgb.append(val_pred_xgb)
    pred_cat.append(val_pred_cat)
    pred_lgbm.append(val_pred_lgbm)
    val_ys.append(val_y)

In [73]:
# Compare four blend-weight settings on the stored per-fold validation predictions.
# CatBoost's weight is fixed at 0.8; the remaining 0.2 is split between XGBoost and LightGBM.
ens1_rmse = []
ens2_rmse = []
ens3_rmse = []
ens4_rmse = []

# (xgb weight, lgbm weight) paired with the list that receives that setting's per-fold RMSE.
weight_grid = [
    ((0.15, 0.05), ens1_rmse),
    ((0.1, 0.1), ens2_rmse),
    ((0.05, 0.15), ens3_rmse),
    ((0.01, 0.19), ens4_rmse),
]

for i in range(10):
    for (w_xgb, w_lgbm), fold_scores in weight_grid:
        blended = pred_xgb[i] * w_xgb + pred_lgbm[i] * w_lgbm + pred_cat[i] * 0.8
        fold_scores.append(mean_squared_error(val_ys[i], blended, squared=False))

# Mean RMSE across folds for each weight setting, in grid order.
print(np.mean(ens1_rmse), np.mean(ens2_rmse), np.mean(ens3_rmse), np.mean(ens4_rmse))

12.046895611857169 12.045387372625246 12.044251724220057 12.043611527687109
