In [96]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from ngboost import NGBRegressor

In [73]:
# Read the train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [75]:
# Row-wise mean of the raw feature columns, added as an extra feature.
# Columns are picked by name instead of by position so that re-running this
# cell (when 'feature_mean' already exists) cannot pull the target 'Strength'
# or a stale 'feature_mean' into the average.
non_feature_cols = ['id', 'Strength', 'feature_mean']
train['feature_mean'] = train.drop(columns = non_feature_cols, errors = 'ignore').mean(axis = 1)
test['feature_mean'] = test.drop(columns = non_feature_cols, errors = 'ignore').mean(axis = 1)

In [85]:
# Shuffled 8-way splitter with a fixed seed, so every call to kf.split on data
# of the same length yields the same folds.
kf = KFold(n_splits=8, shuffle=True, random_state=42)

In [86]:
# StandardScaler is otherwise only imported further down (in the NN section),
# so import it here to keep this cell runnable on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Scaler used for the feature matrix of the tree-based models.
ss = StandardScaler()

In [87]:
# Model inputs: every training column except the row id and the target.
X = train.drop(columns=['id', 'Strength'])
# Take the same columns, in the same order, from the test table.
target = test.loc[:, X.columns]

In [88]:
# Regression target.
y = train['Strength']
# Fit the scaler on the training features and keep the original column labels.
X = pd.DataFrame(data=ss.fit_transform(X), columns=X.columns)

In [89]:
# Transform the test features with the scaler (transform only, no re-fit).
target = pd.DataFrame(data=ss.transform(target), columns=X.columns)

In [90]:
%%time

# K-fold CV for CatBoost (splits come from `kf`): prints each fold's validation
# RMSE and accumulates the fold-averaged test prediction in `cb_pred`.
cb_rmse = 0
cb_pred = np.zeros(len(target))

for i, (fit_rows, holdout_rows) in enumerate(kf.split(X, y)):
    # Slice this fold's fitting and hold-out partitions.
    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

    cb = CatBoostRegressor(
        random_state=42,
        learning_rate=0.02,
        n_estimators=10000,
        max_depth=5,
    )
    # Both partitions are passed as eval sets; early stopping patience is 1500 rounds.
    cb.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_holdout, y_holdout)],
        early_stopping_rounds=1500,
        verbose=0,
    )

    # Hold-out RMSE of this fold, added to the running mean over folds.
    fold_rmse = mean_squared_error(y_holdout, cb.predict(X_holdout)) ** 0.5
    cb_rmse += fold_rmse / kf.n_splits
    print(f'{i + 1} Fold RMSE : {fold_rmse} Iterations : {cb.best_iteration_}')

    # Each fold contributes 1 / n_splits of the final test prediction.
    cb_pred += cb.predict(target) / kf.n_splits

print(f'\n{cb.__class__.__name__} Mean of RMSE : {cb_rmse}')

1 Fold RMSE : 11.678100301248495 Iterations : 816
2 Fold RMSE : 11.669365898827962 Iterations : 539
3 Fold RMSE : 12.344550467056505 Iterations : 375
4 Fold RMSE : 12.25873345334013 Iterations : 263
5 Fold RMSE : 11.978393727501528 Iterations : 1068
6 Fold RMSE : 11.833102523129963 Iterations : 243
7 Fold RMSE : 12.13728424624623 Iterations : 488
8 Fold RMSE : 12.474234326004929 Iterations : 590

CatBoostRegressor Mean of RMSE : 12.046720617919467
CPU times: user 31.8 s, sys: 19.4 s, total: 51.2 s
Wall time: 18.9 s


In [91]:
%%time

# K-fold CV for LightGBM (splits come from `kf`): prints each fold's validation
# RMSE and accumulates the fold-averaged test prediction in `lgbm_pred`.
lgbm_rmse = 0
lgbm_pred = np.zeros(len(target))

for i, (fit_rows, holdout_rows) in enumerate(kf.split(X, y)):
    # Slice this fold's fitting and hold-out partitions.
    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

    lgbm = LGBMRegressor(
        random_state=42,
        learning_rate=0.02,
        n_estimators=10000,
        max_depth=5,
        objective='l2',
    )
    # Both partitions are passed as eval sets; early stopping patience is 1500 rounds.
    lgbm.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_holdout, y_holdout)],
        early_stopping_rounds=1500,
        verbose=0,
    )

    # Hold-out RMSE of this fold, added to the running mean over folds.
    fold_rmse = mean_squared_error(y_holdout, lgbm.predict(X_holdout)) ** 0.5
    lgbm_rmse += fold_rmse / kf.n_splits
    print(f'{i + 1} Fold RMSE : {fold_rmse} Iterations : {lgbm.best_iteration_}')

    # Each fold contributes 1 / n_splits of the final test prediction.
    lgbm_pred += lgbm.predict(target) / kf.n_splits

print(f'\n{lgbm.__class__.__name__} Mean of RMSE : {lgbm_rmse}')

1 Fold RMSE : 11.750850226189874 Iterations : 406
2 Fold RMSE : 11.738872531799089 Iterations : 176
3 Fold RMSE : 12.406382110401243 Iterations : 184
4 Fold RMSE : 12.39185509272988 Iterations : 151
5 Fold RMSE : 12.04011432795785 Iterations : 268
6 Fold RMSE : 11.958778739520701 Iterations : 141
7 Fold RMSE : 12.22266760577524 Iterations : 225
8 Fold RMSE : 12.58022254523948 Iterations : 191

LGBMRegressor Mean of RMSE : 12.13621789745167
CPU times: user 48.2 s, sys: 4.02 s, total: 52.3 s
Wall time: 6.87 s


In [92]:
%%time

# K-fold CV for XGBoost (splits come from `kf`): prints each fold's validation
# RMSE and accumulates the fold-averaged test prediction in `xgb_pred`.
xgb_rmse = 0
xgb_pred = np.zeros(len(target))

for i, (fit_rows, holdout_rows) in enumerate(kf.split(X, y)):
    # Slice this fold's fitting and hold-out partitions.
    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

    xgb = XGBRegressor(
        random_state=42,
        learning_rate=0.02,
        n_estimators=10000,
        max_depth=5,
        objective='reg:squarederror',
    )
    # Both partitions are passed as eval sets; early stopping patience is 1500 rounds.
    xgb.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_holdout, y_holdout)],
        early_stopping_rounds=1500,
        verbose=0,
    )

    # Hold-out RMSE of this fold, added to the running mean over folds.
    fold_rmse = mean_squared_error(y_holdout, xgb.predict(X_holdout)) ** 0.5
    xgb_rmse += fold_rmse / kf.n_splits
    print(f'{i + 1} Fold RMSE : {fold_rmse} Iterations : {xgb.best_iteration}')

    # Each fold contributes 1 / n_splits of the final test prediction.
    xgb_pred += xgb.predict(target) / kf.n_splits

print(f'\n{xgb.__class__.__name__} Mean of RMSE : {xgb_rmse}')

1 Fold RMSE : 11.775904315960506 Iterations : 197
2 Fold RMSE : 11.75518671346508 Iterations : 210
3 Fold RMSE : 12.40462780961416 Iterations : 363
4 Fold RMSE : 12.401486497449048 Iterations : 259
5 Fold RMSE : 12.079413652820365 Iterations : 315
6 Fold RMSE : 11.921972003558842 Iterations : 176
7 Fold RMSE : 12.228999149679918 Iterations : 246
8 Fold RMSE : 12.598531128606192 Iterations : 562

XGBRegressor Mean of RMSE : 12.145765158894266
CPU times: user 2min 37s, sys: 16.7 s, total: 2min 54s
Wall time: 25.1 s


In [104]:
%%time

# K-fold CV for NGBoost (splits come from `kf`): prints each fold's validation
# RMSE and accumulates the fold-averaged test prediction in `ngb_pred`.
ngb_rmse = 0
ngb_pred = np.zeros(len(target))

for i, (fit_rows, holdout_rows) in enumerate(kf.split(X, y)):
    # Slice this fold's fitting and hold-out partitions.
    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

    ngb = NGBRegressor(
        random_state=42,
        learning_rate=0.03,
        n_estimators=2000,
        verbose=0,
    )
    # The hold-out partition is passed positionally as the validation set;
    # early stopping patience is 200 rounds.
    ngb.fit(X_fit, y_fit, X_holdout, y_holdout, early_stopping_rounds=200)

    # Hold-out RMSE of this fold, added to the running mean over folds.
    fold_rmse = mean_squared_error(y_holdout, ngb.predict(X_holdout)) ** 0.5
    ngb_rmse += fold_rmse / kf.n_splits
    print(f'{i + 1} Fold RMSE : {fold_rmse}')

    # Each fold contributes 1 / n_splits of the final test prediction.
    ngb_pred += ngb.predict(target) / kf.n_splits

print(f'\n{ngb.__class__.__name__} Mean of RMSE : {ngb_rmse}')

1 Fold RMSE : 11.717147680813813
2 Fold RMSE : 11.711753657330577
3 Fold RMSE : 12.410394805098608
4 Fold RMSE : 12.379907770711252
5 Fold RMSE : 12.077621201077793
6 Fold RMSE : 11.912707878556965
7 Fold RMSE : 12.201489906653638
8 Fold RMSE : 12.598401876223646

NGBRegressor Mean of RMSE : 12.126178097058288
CPU times: user 32.4 s, sys: 361 ms, total: 32.8 s
Wall time: 32.8 s


In [64]:
# Load the submission template from the working directory.
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [105]:
# Weighted blend of the four models' fold-averaged test predictions
# (weights 0.4 / 0.25 / 0.2 / 0.15 sum to 1.0).
submission['Strength'] = (
    0.4 * cb_pred
    + 0.25 * ngb_pred
    + 0.2 * lgbm_pred
    + 0.15 * xgb_pred
)

In [106]:
# Write the blended predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='lgbm_catboost_xgb_ngb.csv', index=False)

***
## Neural network (PyTorch MLP)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler

In [10]:
# Fresh scaler for the NN section. This rebinds `ss`, replacing the scaler
# object created earlier in the notebook.
ss = StandardScaler()

In [11]:
# Convert the features, target and test features to tensors for the NN.
X = torch.tensor(ss.fit_transform(X), dtype = torch.float)
# The target is a regression value, so it must stay floating point: casting to
# torch.long would truncate the fractional part of every label before training.
# The shape stays 1-D (one value per row).
y = torch.tensor(np.asarray(y), dtype = torch.float)
target = torch.tensor(ss.transform(target), dtype = torch.float)
# Placeholder labels (all zeros), one per test row, so the test features can be
# paired with something when a dataset object requires a second tensor.
_ = torch.zeros(target.shape[0])

In [23]:
class NN(nn.Module):
    """Small fully connected regressor: inputs -> 32 -> 16 -> 4 -> 1.

    A LeakyReLU follows each of the first three linear layers; the last
    linear layer is the output and has no activation.

    Parameters
    ----------
    feature : object with a ``shape`` attribute
        Only ``feature.shape[1]`` is read; it sets the input width of the
        first linear layer.
    """

    def __init__(self, feature):
        super().__init__()
        n_inputs = feature.shape[1]

        # Attribute names and creation order are kept as they were so that
        # state_dict keys and seeded weight initialisation do not change.
        self.layer_1 = nn.Linear(n_inputs, 32)
        self.relu_1 = nn.LeakyReLU()
        self.layer_2 = nn.Linear(32, 16)
        self.relu_2 = nn.LeakyReLU()
        self.layer_3 = nn.Linear(16, 4)
        self.relu_3 = nn.LeakyReLU()
        self.layer_4 = nn.Linear(4, 1)

    def forward(self, x):
        """Run the four linear layers in order; the last dimension of the result is 1."""
        hidden = self.relu_1(self.layer_1(x))
        hidden = self.relu_2(self.layer_2(hidden))
        hidden = self.relu_3(self.layer_3(hidden))
        return self.layer_4(hidden)

In [24]:
def _align_to_target(pred, yy) :
    """Give `pred` the shape of `yy` when both hold the same number of elements."""
    if pred.shape != yy.shape and pred.numel() == yy.numel() :
        return pred.reshape(yy.shape)
    return pred


def train(model, optimizer, num_epochs, tr_loader, val_loader, criterion) :
    """Train `model` for `num_epochs` epochs, printing one metrics line per epoch.

    NOTE: defining this function rebinds the notebook-level name `train`, which
    earlier cells use for the training DataFrame; re-run the data-loading cell
    before re-running any cell that needs that DataFrame.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; batches are moved to the device of its parameters.
    optimizer : torch optimizer
        Must already be bound to the model's parameters.
    num_epochs : int
        Number of full passes over `tr_loader`.
    tr_loader, val_loader : sized iterables of (features, targets) batches
    criterion : callable(pred, target) -> scalar tensor
        Loss to minimise; also reported on the validation set.

    Returns
    -------
    The same `model` object with the weights of the final epoch (no best-epoch
    selection is performed).
    """
    # Follow the model's device instead of hard-coding 'cpu'.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs) :

        model.train()
        tr_losses = 0

        for xx, yy in tr_loader :

            optimizer.zero_grad()

            xx = xx.to(device)
            yy = yy.to(device)
            # A (batch, 1) prediction against a (batch,) target would broadcast
            # to (batch, batch) inside the criterion; align the shapes first.
            pred = _align_to_target(model(xx), yy)

            loss = criterion(pred, yy)
            loss.backward()
            # Running mean of the per-batch training loss.
            tr_losses += loss.item() / len(tr_loader)

            optimizer.step()

        model.eval()
        val_losses = 0
        sq_err_sum = 0.0
        n_val = 0

        with torch.no_grad() :

            for xx, yy in val_loader :

                xx = xx.to(device)
                yy = yy.to(device)
                pred = _align_to_target(model(xx), yy)

                loss = criterion(pred, yy)
                val_losses += loss.item() / len(val_loader)
                # Accumulate squared error over the whole validation set: the mean
                # of per-batch RMSEs is not the RMSE of the set.
                sq_err_sum += torch.sum((pred.float() - yy.float()) ** 2).item()
                n_val += yy.numel()

        epoch_rmse = (sq_err_sum / n_val) ** 0.5 if n_val else 0

        print(f"{epoch + 1} Epoch Train Loss : {round(tr_losses, 4)} Validation Loss : {round(val_losses, 4)} Validation RMSE : {round(epoch_rmse, 4)}")
    return model

In [25]:
def RMSELoss(y_actual, y_pred) :
    """Root-mean-squared error between two tensors, returned as a 0-dim tensor.

    The result is symmetric in the two arguments. When the tensors hold the same
    number of elements but differ in shape (for example a (batch, 1) network
    output against (batch,) targets), `y_actual` is reshaped to the shape of
    `y_pred` first; otherwise the subtraction would silently broadcast to
    (batch, batch) and the value returned would not be the RMSE. Any other shape
    combination keeps the normal broadcasting rules.
    """
    if y_actual.shape != y_pred.shape and y_actual.numel() == y_pred.numel() :
        y_actual = y_actual.reshape(y_pred.shape)

    return torch.sqrt(torch.mean((y_pred - y_actual) ** 2))

In [None]:
%%time

nn_pred = np.zeros(target.shape[0])
nn_rmse = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    tr_x, tr_y = X[tr_idx], y[tr_idx]
    val_x, val_y = X[val_idx], y[val_idx]
    
    tr_ds = TensorDataset(tr_x, tr_y)
    val_ds = TensorDataset(val_x, val_y)
    te_ds = TensorDataset(target, _)

    tr_loader = DataLoader(tr_ds, batch_size = 32, shuffle = True, drop_last = True)
    val_loader = DataLoader(val_ds, batch_size = 32, shuffle = False, drop_last = False)
    te_loader = DataLoader(te_ds, batch_size = 32, shuffle = False, drop_last = False)

    model = NN(X).to('cpu')
    optimizer = optim.Adam(model.parameters(), lr = 0.02)
    criterion = RMSELoss

    best_model = train(model, optimizer, 15, tr_loader, val_loader, criterion)