In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning category for the rest of the session; no library
# warning (deprecation, runtime, ...) will show up in any cell output.
warnings.filterwarnings('ignore')

In [2]:
# Read the training table (it carries the `Class` label) and the test table
# that the fitted models are asked to score.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

In [4]:
# Separate the label from the features; `id` is dropped together with the
# label so it is not used as a model input.
y = train['Class']
X = train.drop(columns = ['id', 'Class'])

In [5]:
# Keep only the training feature columns from the test table, in the same order.
target = test.loc[:, X.columns]

In [6]:
# 10 stratified folds, shuffled with a fixed seed so the split is repeatable.
skf = StratifiedKFold(
    n_splits = 10,
    shuffle = True,
    random_state = 42,
)

In [7]:
### K-Fold Ensemble
# One CatBoost model is fitted per stratified fold. Every fold adds
# 1 / n_splits of its validation log loss to `cb_score` and 1 / n_splits of
# its predicted probabilities on `target` to `cb_pred`, so both accumulate
# into averages over the folds.
cb_pred = np.zeros(target.shape[0])
cb_score = 0

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    # Positional row selection for this fold's training and validation parts.
    tr_x, val_x = X.iloc[tr_idx], X.iloc[val_idx]
    tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

    # The fold index is used as the seed, so each fold's model is seeded differently.
    cb = CatBoostClassifier(
        iterations = 10000,
        learning_rate = 0.02,
        max_depth = 5,
        eval_metric = 'Logloss',
        use_best_model = True,
        random_state = i,
    )
    # NOTE(review): the training fold is passed as an eval set alongside the
    # validation fold — confirm which eval set CatBoost uses for early stopping
    # and best-iteration selection when several are supplied.
    cb.fit(
        tr_x,
        tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    # Column 1 of predict_proba — presumably P(Class == 1); assumes a binary target.
    val_pred = cb.predict_proba(val_x)[:, 1]
    val_log_loss = log_loss(val_y, val_pred)
    cb_score += val_log_loss / skf.n_splits
    print(f'{i + 1} Fold Logloss : {val_log_loss}')

    # Pre-divide by the fold count so the running sum ends up as the fold mean.
    fold_pred = cb.predict_proba(target)[:, 1] / skf.n_splits
    cb_pred += fold_pred
# `cb` is the model left over from the last fold; only its class name is used here.
print(f'\n{cb.__class__.__name__} AVG of Logloss is {cb_score}')

1 Fold Logloss : 0.025191101014705012
2 Fold Logloss : 0.033656964061488694
3 Fold Logloss : 0.03005187043146397
4 Fold Logloss : 0.032623589496954784
5 Fold Logloss : 0.031446309946702763
6 Fold Logloss : 0.03065882032185964
7 Fold Logloss : 0.03323976227145605
8 Fold Logloss : 0.02951882774562623
9 Fold Logloss : 0.030998358570111902
10 Fold Logloss : 0.03376151699162498

CatBoostClassifier AVG of Logloss is 0.031114712085199402


In [8]:
# Load the sample submission file; it serves as the frame that is filled in and saved.
submission = pd.read_csv('sample_submission.csv')

In [9]:
# Store the fold-averaged probabilities in the `Class` column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv — confirm, since nothing here aligns them by id.
submission['Class'] = cb_pred

In [10]:
# Save the submission; index = False keeps the pandas row index out of the file.
submission.to_csv('catboost.csv', index = False)