In [132]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [208]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [209]:
train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [210]:
test.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.02,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.0
3,417,1.02,5.33,668,25.3,252,3.46
4,418,1.011,5.87,567,29.0,457,2.36


In [211]:
use_cols = train.columns[1:-1].tolist()

In [212]:
train['feature_cv'] = train[use_cols].std(axis = 1) / train[use_cols].mean(axis = 1)
test['feature_cv'] = test[use_cols].std(axis = 1) / test[use_cols].mean(axis = 1)

In [213]:
from sklearn.cluster import KMeans

In [222]:
km = KMeans(n_clusters = 6, random_state = 42)

In [223]:
X = train.drop(['id', 'target'], axis = 1)
y = train['target']

In [224]:
target = test[X.columns]

In [225]:
X['km_cls'] = km.fit_predict(X)

In [226]:
target['km_cls'] = km.predict(target)

In [227]:
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import auc, roc_curve

In [228]:
kf = KFold(n_splits = 5, random_state = 42, shuffle = True)

In [229]:
cb_prob = np.zeros(target.shape[0])
cb_auc = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    cb = CatBoostClassifier(random_state = 42, max_depth = 5, learning_rate = 0.01, n_estimators = 1500, eval_metric = 'AUC', use_best_model = True)
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 300, verbose = 0)
    
    val_pred = cb.predict_proba(val_x)[:, 1]
    fpr, tpr, threshold = roc_curve(val_y, val_pred)
    val_auc = auc(fpr, tpr)
    cb_auc += val_auc / kf.n_splits
    print(f'{i + 1} Fold AUC : {val_auc}')
    
    fold_pred = cb.predict_proba(target)[:, 1] / kf.n_splits
    cb_prob += fold_pred
print(f'\n{cb.__class__.__name__} AVG of AUC : {cb_auc}')

1 Fold AUC : 0.873391812865497
2 Fold AUC : 0.8297872340425533
3 Fold AUC : 0.7321212121212122
4 Fold AUC : 0.8805361305361306
5 Fold AUC : 0.7505980861244019

CatBoostClassifier AVG of AUC : 0.813286895137959


In [230]:
submission = pd.read_csv('sample_submission.csv')

In [231]:
submission['target'] = cb_prob

In [232]:
submission.to_csv('cb.csv', index = False)