In [1]:
# Show the installed CatBoost version so the results below can be tied to it.
import catboost
catboost.__version__

'0.25'

In [2]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and library
# warnings that may matter for the benchmark — consider narrowing it.
import warnings
warnings.simplefilter("ignore")

In [3]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules (such as `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys, json, dill, re, random
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

from utils import fit_models

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Non-feature columns. Presumably identifiers, split flag and label that must
# stay out of the model inputs — not referenced in the visible cells; confirm.
id_cols = ['phone_num', 'date', 'month', 'is_train', 'target']


def _read_json(path):
    """Return the parsed content of the JSON file located at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Full candidate feature list and the subset treated as categorical.
try_feats = _read_json('../objects/final_feats_all.json')
cat_feats = _read_json('../objects/final_feats_cat.json')

print(len(try_feats), len(cat_feats))

303 11


In [8]:
# Load the full modelling sample; `sex` is read as a string column.
df_data = pd.read_csv('../objects/ml_prod_meetup_risk_sample__final.csv', index_col=None, dtype={'sex':str})

# `is_train` == 1 marks the training rows; `is_train` == 0 is the hold-out
# (reported as the OOT sample in the printout below).
df_train = df_data[df_data['is_train']==1]
df_test = df_data[df_data['is_train']==0]

# Default rate = share of rows with target == 1 (assumes the label is coded
# 0/1 with 1 = default). The previous `value_counts(normalize=True).min()`
# returned the share of the *rarest* class instead, which is wrong when a
# split contains a single class (it yields 1.0) or when positives are the
# majority.
train_default_rate = np.round((df_train['target'] == 1).mean(), 3)
test_default_rate = np.round((df_test['target'] == 1).mean(), 3)

print('Train size', df_train.shape[0], ', OOT sample size', df_test.shape[0])
print('Train default rate', train_default_rate, ', Test default rate', test_default_rate)

Train size 504549 , OOT sample size 216484
Train default rate 0.032 , Test default rate 0.032


In [9]:
# Model inputs are the candidate feature columns; labels are taken from
# `target` as plain NumPy arrays.
X_train, y_train = df_train[try_feats], df_train['target'].values
X_test, y_test = df_test[try_feats], df_test['target'].values

### fit on CPU

In [10]:
# Seeds used for the repeated fits.
# They were originally drawn with random.randint(0, 200) from an unseeded
# generator, so every kernel restart produced a different list and the metrics
# could not be reproduced. The list is now pinned to the values of the
# recorded run so that Restart & Run All yields the same experiments.
random_states = [17, 15, 6, 160, 29, 56, 44, 64, 2, 75]
n = len(random_states)  # number of seeds (10), kept for backward compatibility
print(random_states)

[17, 15, 6, 160, 29, 56, 44, 64, 2, 75]


In [11]:
def _cpu_params(n_estimators, max_depth):
    """Build one CatBoost CPU parameter set.

    Only the number of trees and the tree depth vary between the sets; the
    categorical features, subsample fraction, eval metric, verbosity and task
    type are shared.
    """
    return {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "cat_features": cat_feats,
        "subsample": 0.8,
        "eval_metric": "AUC",
        "verbose": False,
        "task_type": "CPU",
    }


params_set_1 = _cpu_params(1000, 6)
params_set_2 = _cpu_params(700, 5)
params_set_3 = _cpu_params(800, 4)

# Mapping of set name -> parameters, passed to fit_models as `p_dict`.
params_dict_cpu = dict(
    params_set_1=params_set_1,
    params_set_2=params_set_2,
    params_set_3=params_set_3,
)

In [12]:
# Run the CPU benchmark through the project helper `fit_models` (from utils).
# Presumably one fit per (parameter set, seed) pair — the saved metrics file
# holds 30 rows for 3 sets x 10 seeds; confirm against utils.fit_models.
fit_models(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    cat_feats=cat_feats,
    p_dict=params_dict_cpu,
    r_states=random_states,
    model_lib="catboost",
    fit_type="cpu",
)

Metrics df saved to: ../metrics/catboost_cpu_500K_test_metrics_final.csv


In [16]:
# Summarise the CPU run: table shape, mean ROC AUC and mean fitting time.
cpu_metrics_path = '../metrics/catboost_cpu_500K_test_metrics_final.csv'
df_res = pd.read_csv(cpu_metrics_path, sep=';')

cpu_mean_auc = df_res['roc_auc'].mean()
cpu_mean_fit_time = df_res['fitting_time'].mean()
print(df_res.shape, cpu_mean_auc, cpu_mean_fit_time)

(30, 5) 0.7671935071213586 85.63076329231262


### fit on GPU

In [17]:
# `subsample` is left out of the GPU sets: the default bootstrap type on GPU
# (Bayesian) does not support the taken-fraction (subsample) option.
# NOTE(review): the original note states this does not affect ROC AUC, yet the
# recorded GPU run scores lower than the CPU run — confirm.


def _gpu_params(n_estimators, max_depth):
    """Build one CatBoost GPU parameter set (device "0").

    Only the number of trees and the tree depth vary between the sets.
    """
    return {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "cat_features": cat_feats,
        "eval_metric": "AUC",
        "verbose": False,
        "task_type": "GPU",
        "devices": "0",
    }


params_set_1 = _gpu_params(1000, 6)
params_set_2 = _gpu_params(700, 5)
params_set_3 = _gpu_params(800, 4)

# Mapping of set name -> parameters, passed to fit_models as `p_dict`.
params_dict_gpu = dict(
    params_set_1=params_set_1,
    params_set_2=params_set_2,
    params_set_3=params_set_3,
)

In [18]:
# Run the GPU benchmark through the project helper `fit_models` (from utils),
# using the same data and seeds as the CPU run.
fit_models(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    cat_feats=cat_feats,
    p_dict=params_dict_gpu,
    r_states=random_states,
    model_lib="catboost",
    fit_type="gpu",
)

Metrics df saved to: ../metrics/catboost_gpu_500K_test_metrics_final.csv


In [19]:
# Summarise the GPU run: table shape, mean ROC AUC and mean fitting time.
gpu_metrics_path = '../metrics/catboost_gpu_500K_test_metrics_final.csv'
df_res = pd.read_csv(gpu_metrics_path, sep=';')

gpu_mean_auc = df_res['roc_auc'].mean()
gpu_mean_fit_time = df_res['fitting_time'].mean()
print(df_res.shape, gpu_mean_auc, gpu_mean_fit_time)

(30, 5) 0.7307266410800715 31.640563941001894


In [20]:
# Mean ROC AUC gap between the CPU and GPU runs, computed from the saved
# metrics files rather than from literals copy-pasted out of earlier cell
# outputs (those go stale as soon as the experiments are re-run).
auc_cpu_mean = pd.read_csv('../metrics/catboost_cpu_500K_test_metrics_final.csv', sep=';')['roc_auc'].mean()
auc_gpu_mean = pd.read_csv('../metrics/catboost_gpu_500K_test_metrics_final.csv', sep=';')['roc_auc'].mean()
auc_cpu_mean - auc_gpu_mean  # mean difference: about 3.6 AUC points in the recorded run

0.03646686604128713