In [1]:
# Attach Google Drive to the Colab filesystem; force_remount=True asks for a
# fresh mount even when the mount point is already in use.
from google.colab import drive

drive.mount("/content/gdrive/", force_remount=True)

Mounted at /content/gdrive/


In [2]:
# Switch the working directory to the competition folder on the mounted Drive
# so that the relative CSV paths used by later cells resolve against it.
%cd /content/gdrive/My Drive/kaggle/session3episode9/

/content/gdrive/My Drive/kaggle/session3episode9


In [4]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
!pip install catboost
!pip install sdv

In [34]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sdv.tabular import CTGAN
from xgboost import XGBRegressor

In [58]:
# Read the competition training and test tables from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [59]:
from sklearn.cluster import KMeans

In [60]:
# K-means with 8 clusters and a fixed seed.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset yields different clusters
# depending on the installed version. 10 is the value older releases used.
km = KMeans(n_clusters=8, n_init=10, random_state=42)

In [61]:
# Add a K-means cluster id as a categorical-like feature.
#
# The input columns are selected by NAME rather than by position. With
# positional slices (iloc[:, 1:-1]) the cell is only correct on its first
# execution: once 'km_cls' has been appended, a re-run would slide the window
# onto the 'Strength' target (leakage) and the train/test inputs would no
# longer match. Excluding the id, the target and the engineered columns by
# name makes the cell safe to re-run.
# NOTE(review): assumes every remaining train column also exists in test
# (a KeyError is raised otherwise) -- confirm against the CSV schema.
cluster_cols = [
    col for col in train.columns
    if col not in ('id', 'Strength', 'km_cls', 'feature_mean')
]

# Fit the clusters on the training rows, then assign test rows to them.
train['km_cls'] = km.fit_predict(train[cluster_cols])
test['km_cls'] = km.predict(test[cluster_cols])

In [64]:
# Row-wise mean of the raw input features.
#
# Two fixes compared with the positional version:
# * Columns are chosen by name, so the id, the 'Strength' target and the
#   engineered columns can never slip into the aggregate, even when this cell
#   is executed more than once (iloc[:, 1:-2] would pick up 'Strength' on a
#   second run because 'feature_mean' itself shifts the window).
# * The aggregate is now .mean(); it was .sum(), which contradicted the column
#   name. The mean is the sum divided by a constant column count, so the
#   ordering of rows by this feature is unchanged.
# NOTE(review): assumes every remaining train column also exists in test
# (a KeyError is raised otherwise) -- confirm against the CSV schema.
raw_cols = [
    col for col in train.columns
    if col not in ('id', 'Strength', 'km_cls', 'feature_mean')
]

train['feature_mean'] = train[raw_cols].mean(axis=1)
test['feature_mean'] = test[raw_cols].mean(axis=1)

In [68]:
# Shuffled 10-fold splitter; the fixed seed keeps fold membership repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [69]:
# Regression target.
y = train['Strength']

# Model inputs: every training column except the row id and the target.
X = train.drop(columns=['id', 'Strength'])

# Test inputs restricted to (and ordered like) the training feature columns.
target = test[X.columns]

In [70]:
# 10-fold cross-validation of XGBoost, CatBoost and LightGBM regressors.
#
# Per fold, each model is
#   1. trained on the training split with early stopping monitored on the
#      validation split,
#   2. scored (RMSE) on that same validation split,
#   3. used to predict `target`; the test predictions are averaged over folds.
#
# NOTE: the split that drives early stopping is also the split that is scored,
# so the reported RMSEs are optimistic estimates.
#
# API compatibility fixes relative to the previous version of this cell:
# * XGBoost: `early_stopping_rounds` is given to the estimator constructor
#   (the fit() keyword was removed in xgboost 2.0).
# * LightGBM: early stopping and log silencing use callbacks (the fit()
#   keywords `early_stopping_rounds` / `verbose` were removed in lightgbm 4.0).
# * RMSE is computed as sqrt(MSE) instead of `squared=False`, which
#   scikit-learn 1.6 no longer accepts.

EARLY_STOPPING_ROUNDS = 1000  # patience (boosting rounds) shared by all three models


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Per-fold validation RMSE of each model.
rmse_xgb = []
rmse_cat = []
rmse_lgbm = []

# Fold-averaged test predictions of each model.
pred_xgb = np.zeros(target.shape[0])
pred_cat = np.zeros(target.shape[0])
pred_lgbm = np.zeros(target.shape[0])

# Per-fold mean of the three models' RMSEs. This is an average of scores,
# not the RMSE of a blended prediction.
rmse = []

for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # XGBoost
    model_xgb = XGBRegressor(
        random_state=42,
        max_depth=4,
        learning_rate=0.02,
        n_estimators=10000,
        objective='reg:squarederror',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )
    model_xgb.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], verbose=False)
    pred_xgb += model_xgb.predict(target) / kf.n_splits
    rmse_xgb.append(_rmse(val_y, model_xgb.predict(val_x)))

    # CatBoost
    model_cat = CatBoostRegressor(
        random_state=42,
        max_depth=4,
        learning_rate=0.02,
        n_estimators=10000,
        use_best_model=True,
        eval_metric='RMSE',
    )
    model_cat.fit(
        tr_x,
        tr_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=False,
    )
    pred_cat += model_cat.predict(target) / kf.n_splits
    rmse_cat.append(_rmse(val_y, model_cat.predict(val_x)))

    # LGBM
    model_lgbm = LGBMRegressor(
        random_state=42,
        max_depth=5,
        learning_rate=0.02,
        n_estimators=10000,
        objective='rmse',
    )
    model_lgbm.fit(
        tr_x,
        tr_y,
        eval_set=[(val_x, val_y)],
        callbacks=[
            early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False),
            log_evaluation(period=0),  # period=0 -> no per-iteration eval logging
        ],
    )
    pred_lgbm += model_lgbm.predict(target) / kf.n_splits
    rmse_lgbm.append(_rmse(val_y, model_lgbm.predict(val_x)))

    rmse_val = (rmse_xgb[i] + rmse_cat[i] + rmse_lgbm[i]) / 3
    rmse.append(rmse_val)
    print(f"fold : {i + 1} rmse xgb : {rmse_xgb[i]} | rmse cat : {rmse_cat[i]} | rmse lgbm : {rmse_lgbm[i]} | final rmse is : {rmse[i]}")


print(np.mean(rmse))

fold : 1 rmse xgb : 11.728594719729251 | rmse cat : 11.667795224915245 | rmse lgbm : 11.738212070698955 | final rmse is : 11.711534005114485
fold : 2 rmse xgb : 11.724154554163876 | rmse cat : 11.676393331283899 | rmse lgbm : 11.687755150347417 | final rmse is : 11.69610101193173
fold : 3 rmse xgb : 12.362133640353008 | rmse cat : 12.361766287814959 | rmse lgbm : 12.318633717807998 | final rmse is : 12.347511215325321
fold : 4 rmse xgb : 11.750128905481741 | rmse cat : 11.6105179293951 | rmse lgbm : 11.729101289276915 | final rmse is : 11.696582708051253
fold : 5 rmse xgb : 12.806146798756684 | rmse cat : 12.645237176618874 | rmse lgbm : 12.791490329177588 | final rmse is : 12.74762476818438
fold : 6 rmse xgb : 11.775474144547633 | rmse cat : 11.704881279285543 | rmse lgbm : 11.767591168843056 | final rmse is : 11.749315530892076
fold : 7 rmse xgb : 12.183441867812155 | rmse cat : 12.098372775071168 | rmse lgbm : 12.19412234452446 | final rmse is : 12.158645662469262
fold : 8 rmse xgb 

In [71]:
# Cross-validated mean RMSE of each model, one line for quick comparison.
cv_summary = f"LGBM : {np.mean(rmse_lgbm)} | XGB : {np.mean(rmse_xgb)} | CatBoost : {np.mean(rmse_cat)}"
print(cv_summary)

LGBM : 12.130737901854484 | XGB : 12.133171276354414 | CatBoost : 12.046075685422933


In [72]:
# Start from the sample submission file so the output keeps its layout.
submission = pd.read_csv("sample_submission.csv")

In [73]:
# Weighted blend of the fold-averaged test predictions:
# 0.6 CatBoost + 0.2 XGBoost + 0.2 LightGBM.
submission['Strength'] = 0.6 * pred_cat + 0.2 * pred_xgb + 0.2 * pred_lgbm

In [74]:
# Write the blended predictions to disk without the DataFrame index column.
submission.to_csv('ensemble.csv', index=False)