In [4]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split, cross_val_score, KFold
# No trailing comma here: an unparenthesised `from ... import a, b,` is a SyntaxError.
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

import shap
import optuna
import category_encoders as ce
import optuna.visualization as vis

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from tqdm.notebook import tqdm

import warnings
# NOTE(review): this silences every warning category for the whole kernel session.
warnings.filterwarnings('ignore')

In [51]:
# Read the two preprocessed feature tables plus the raw test file
# (the raw file supplies the `id` column for the submission further down).
train, test, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_train1.csv',
        '../data/preprocessed_test1.csv',
        '../data/test.csv',
    )
)

In [3]:
# Split the training table into the target column and the remaining features.
y = train['Response']
X = train.drop(columns=['Response'])

In [6]:
# Show column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  int64  
 1   Age                   381109 non-null  int64  
 2   Region_Code           381109 non-null  int64  
 3   Previously_Insured    381109 non-null  int64  
 4   Vehicle_Age           381109 non-null  int64  
 5   Vehicle_Damage        381109 non-null  int64  
 6   Annual_Premium        381109 non-null  float64
 7   Policy_Sales_Channel  381109 non-null  int64  
 8   Vintage               381109 non-null  int64  
 9   age < 46              381109 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 29.1 MB


In [27]:
def cross_validate(model, X, y, n_folds=5):
    """Estimate the ROC AUC of ``model`` with K-fold cross-validation.

    The model is refit on every fold, with the held-out fold passed as
    ``eval_set`` and ``use_best_model=True``. The mean of the per-fold
    ROC AUC scores is printed.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., use_best_model=...)`` and
        provide ``predict_proba`` (the notebook passes a CatBoost classifier).
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target aligned row-for-row with ``X``.
    n_folds : int, default 5
        Number of folds handed to ``KFold`` (no shuffling is requested).

    Notes
    -----
    The validation fold is both the early-stopping ``eval_set`` and the
    scoring set, so the reported AUC is likely somewhat optimistic.
    """
    scores = []
    kf = KFold(n_splits=n_folds)

    # KFold yields *positional* indices. Index a Series through .iloc so the
    # selection stays correct when its index is not 0..n-1; plain arrays are
    # already positional.
    y_by_position = y.iloc if hasattr(y, 'iloc') else y

    for train_index, valid_index in tqdm(kf.split(X), total=n_folds):

        train_x, valid_x = X.iloc[train_index], X.iloc[valid_index]
        train_y, valid_y = y_by_position[train_index], y_by_position[valid_index]

        model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], use_best_model=True)

        # ROC AUC is a ranking metric: score it on positive-class probabilities,
        # not on the hard 0/1 labels returned by predict().
        preds = model.predict_proba(valid_x)[:, 1]
        scores.append(roc_auc_score(valid_y, preds))

    print(f'Mean Score: {np.mean(scores)}')

In [42]:
def get_kfold_prediction(model, X, y, test_data, n_folds=5):
    """Fit ``model`` on each K-fold split and average its test-set probabilities.

    For every fold the model is refit with the held-out fold as ``eval_set``
    (``use_best_model=True``), scored with ROC AUC on that fold, and then used
    to predict ``test_data``. The mean fold score is printed.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., use_best_model=...)`` and
        provide ``predict_proba``.
    X : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target aligned row-for-row with ``X``.
    test_data : pandas.DataFrame or array-like
        Rows to predict; must expose ``.shape``.
    n_folds : int, default 5
        Number of folds handed to ``KFold`` (no shuffling is requested).

    Returns
    -------
    numpy.ndarray
        Positive-class probability for each row of ``test_data``, averaged
        over the ``n_folds`` fitted models. These are test predictions, not
        out-of-fold predictions for ``X``.
    """
    scores = []
    test_preds = np.zeros(test_data.shape[0])
    kf = KFold(n_splits=n_folds)

    # KFold yields *positional* indices. Index a Series through .iloc so the
    # selection stays correct when its index is not 0..n-1; plain arrays are
    # already positional.
    y_by_position = y.iloc if hasattr(y, 'iloc') else y

    for train_index, valid_index in tqdm(kf.split(X), total=n_folds):

        train_x, valid_x = X.iloc[train_index], X.iloc[valid_index]
        train_y, valid_y = y_by_position[train_index], y_by_position[valid_index]

        model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], use_best_model=True)

        preds = model.predict_proba(valid_x)[:, 1]

        # change metric according to requirement
        scores.append(roc_auc_score(valid_y, preds))

        # change this according to requirement
        # Each fold contributes 1/n_folds, so the sum is the mean over folds.
        test_preds += model.predict_proba(test_data)[:, 1] / n_folds

    print(f'Mean Score: {np.mean(scores)}')

    return test_preds

In [43]:
# Keyword arguments for the CatBoost classifier built in the next cell.
# subsample, colsample_bylevel, bagging_temperature and random_strength are
# deliberately not set here.
cat_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    reg_lambda=10,
    # Positional column indices; per the X.info() listing these are
    # Region_Code, Vehicle_Age and Policy_Sales_Channel.
    cat_features=[2, 4, 7],
    early_stopping_rounds=100,
    eval_metric='AUC',
    one_hot_max_size=4,
    verbose=50,
    random_state=1234,
    thread_count=-1,
)

In [44]:
# Build the CatBoost classifier from the cat_params dictionary.
cat_model = cat.CatBoostClassifier(**cat_params)

In [30]:
# Cross-validate the CatBoost model with the function's default fold count.
cross_validate(model=cat_model, X=X, y=y)

  0%|          | 0/5 [00:00<?, ?it/s]

0:	test: 0.8360164	best: 0.8360164 (0)	total: 216ms	remaining: 3m 35s
50:	test: 0.8597219	best: 0.8597687 (47)	total: 9.5s	remaining: 2m 56s
100:	test: 0.8593999	best: 0.8597687 (47)	total: 19.5s	remaining: 2m 53s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8597686884
bestIteration = 47

Shrink model to first 48 iterations.
0:	test: 0.8351825	best: 0.8351825 (0)	total: 215ms	remaining: 3m 34s
50:	test: 0.8592732	best: 0.8592732 (50)	total: 9.26s	remaining: 2m 52s
100:	test: 0.8593790	best: 0.8594391 (70)	total: 18.6s	remaining: 2m 45s
150:	test: 0.8592029	best: 0.8594391 (70)	total: 27.7s	remaining: 2m 35s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8594390795
bestIteration = 70

Shrink model to first 71 iterations.
0:	test: 0.8342105	best: 0.8342105 (0)	total: 217ms	remaining: 3m 37s
50:	test: 0.8571997	best: 0.8571997 (50)	total: 9.13s	remaining: 2m 49s
100:	test: 0.8570680	best: 0.8573580 (58)	total: 18.7s	remaining: 2m 46s
150:	test

In [45]:
# Average the test-set probabilities over 10 folds.
oof_test_probs = get_kfold_prediction(cat_model, X, y, test_data=test, n_folds=10)

  0%|          | 0/10 [00:00<?, ?it/s]

0:	test: 0.8353680	best: 0.8353680 (0)	total: 238ms	remaining: 3m 57s
50:	test: 0.8596508	best: 0.8596508 (50)	total: 12s	remaining: 3m 43s
100:	test: 0.8593304	best: 0.8596645 (51)	total: 22.9s	remaining: 3m 23s
150:	test: 0.8591352	best: 0.8596645 (51)	total: 33.7s	remaining: 3m 9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8596645096
bestIteration = 51

Shrink model to first 52 iterations.
0:	test: 0.8364066	best: 0.8364066 (0)	total: 253ms	remaining: 4m 12s
50:	test: 0.8596881	best: 0.8597259 (44)	total: 11.2s	remaining: 3m 28s
100:	test: 0.8594602	best: 0.8597259 (44)	total: 21.8s	remaining: 3m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8597259497
bestIteration = 44

Shrink model to first 45 iterations.
0:	test: 0.8327286	best: 0.8327286 (0)	total: 285ms	remaining: 4m 45s
50:	test: 0.8585558	best: 0.8586169 (48)	total: 10s	remaining: 3m 6s
100:	test: 0.8585924	best: 0.8587438 (61)	total: 19.7s	remaining: 2m 55s
150:	test: 0.8

In [49]:
# Display the probabilities returned by get_kfold_prediction.
oof_test_probs

array([3.57637643e-04, 3.14859615e-01, 2.82039915e-01, 8.26877367e-03,
       3.27540810e-04, 2.91816727e-04, 2.88986709e-04, 3.06304288e-04,
       3.65090502e-01, 1.64974222e-03, 7.50540820e-04, 1.91497249e-01,
       5.48988784e-04, 3.25187425e-01, 1.64958822e-01, 4.39003562e-04,
       1.51882638e-01, 2.88142698e-01, 3.07606919e-04, 4.00343248e-04])

In [50]:
# Largest predicted probability across the test rows.
oof_test_probs.max()

0.7107755777376663

In [53]:
# Pair each raw test id with its predicted probability.
sub = raw_test[['id']].assign(Response=oof_test_probs)

print(sub.shape)
sub.head()

(127037, 2)


Unnamed: 0,id,Response
0,381110,0.000358
1,381111,0.31486
2,381112,0.28204
3,381113,0.008269
4,381114,0.000328


In [54]:
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs('../submissions', exist_ok=True)
sub.to_csv('../submissions/catboost1.csv', index=False)