In [1]:
import xgboost
xgboost.__version__

'1.2.0'

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed from
# here on (unless the filters are changed later), which also hides deprecation
# notices emitted by the libraries used below.
warnings.simplefilter("ignore")

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys, json, dill, re, random, time
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from utils import fit_models

In [5]:
# Non-feature columns. 'is_train' is used below as the split flag and 'target'
# as the label; the remaining names are presumably identifiers / dates (TODO confirm).
id_cols = ['phone_num', 'date', 'month', 'is_train', 'target']


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as f_in:
        return json.load(f_in)


# Feature names used to build the model matrices, and the names passed to
# fit_models as `cat_feats`.
try_feats = _load_json('../objects/final_feats_all.json')
cat_feats = _load_json('../objects/final_feats_cat.json')

# Quick sanity check on how many names each file provided.
print(len(try_feats), len(cat_feats))

303 11


In [6]:
# Read the full sample; 'sex' is forced to str so pandas does not infer a
# different dtype for it.
df_data = pd.read_csv(
    '../objects/ml_prod_meetup_risk_sample__final.csv',
    index_col=None,
    dtype={'sex': str},
)

# Split on the 'is_train' flag: 1 -> training rows, 0 -> hold-out rows
# (the print below labels the hold-out part as the OOT sample).
train_rows = df_data['is_train'] == 1
test_rows = df_data['is_train'] == 0
df_train = df_data[train_rows]
df_test = df_data[test_rows]

# The "default rate" is taken as the share of the rarest 'target' value,
# rounded to three decimals.
train_class_shares = df_train['target'].value_counts(normalize=True)
test_class_shares = df_test['target'].value_counts(normalize=True)
train_default_rate = np.round(train_class_shares.min(), 3)
test_default_rate = np.round(test_class_shares.min(), 3)

print('Train size', df_train.shape[0], ', OOT sample size', df_test.shape[0])
print('Train default rate', train_default_rate, ', Test default rate', test_default_rate)

Train size 504549 , OOT sample size 216484
Train default rate 0.032 , Test default rate 0.032


In [7]:
# Feature frames keep only the columns listed in `try_feats`; labels are the
# raw values of the 'target' column as NumPy arrays.
X_train, y_train = df_train[try_feats], df_train['target'].values
X_test, y_test = df_test[try_feats], df_test['target'].values

### fit on CPU

In [8]:
# Seeds handed to fit_models as `r_states` for both the CPU and the GPU run.
#
# The values are pinned explicitly so the notebook survives Restart & Run All
# and always benchmarks the same seeds. They were presumably drawn once with
#     [random.randint(0, 200) for _ in range(n)]
# but that expression was never assigned to `random_states` (NameError on a
# fresh kernel) and, being unseeded, would give different values on every run.
n = 10
random_states = [77, 176, 64, 126, 181, 195, 154, 169, 80, 116]

# Guard against the list and the declared count drifting apart.
assert len(random_states) == n, "expected exactly n random states"

print(random_states)

[77, 176, 64, 126, 181, 195, 154, 169, 80, 116]


In [9]:
# Three hyper-parameter configurations for the CPU run. They differ in tree
# depth, number of trees and learning rate; eval metric, verbosity and row
# subsampling are shared. No tree_method is set here.
params_set_1 = dict(
    max_depth=6,
    n_estimators=1000,
    learning_rate=0.02,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
)

params_set_2 = dict(
    max_depth=5,
    n_estimators=700,
    learning_rate=0.04,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
)

params_set_3 = dict(
    max_depth=4,
    n_estimators=800,
    learning_rate=0.05,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
)

# Name -> configuration mapping passed to fit_models as `p_dict`.
params_dict_cpu = dict(
    params_set_1=params_set_1,
    params_set_2=params_set_2,
    params_set_3=params_set_3,
)

In [10]:
# Run the CPU benchmark through the project helper utils.fit_models.
# Per the recorded output it saves a metrics dataframe to
# ../metrics/xgboost_cpu_500K_test_metrics_final.csv; presumably one fit per
# (parameter set, random state) pair — TODO confirm against utils.fit_models.
fit_models(
    # data
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    cat_feats=cat_feats,
    # experiment grid
    p_dict=params_dict_cpu,
    r_states=random_states,
    # backend selection
    model_lib="xgboost",
    fit_type="cpu",
)

Metrics df saved to: ../metrics/xgboost_cpu_500K_test_metrics_final.csv


In [11]:
# Load the CPU metrics file (semicolon-separated) and report its shape together
# with the mean ROC AUC and mean fitting time over all rows.
df_res = pd.read_csv(
    '../metrics/xgboost_cpu_500K_test_metrics_final.csv',
    sep=';',
)
mean_roc_auc = df_res['roc_auc'].mean()
mean_fitting_time = df_res['fitting_time'].mean()  # presumably seconds — TODO confirm
print(df_res.shape, mean_roc_auc, mean_fitting_time)

(30, 5) 0.7697310479104854 454.5530255715052


### fit on GPU

In [18]:
# Three hyper-parameter configurations for the GPU run. Each one additionally
# sets tree_method="gpu_hist" and gpu_id=1.
# NOTE(review): gpu_id=1 addresses the device with ordinal 1 — adjust on hosts
# with a single GPU.
# NOTE: this rebinds the names params_set_1..3; any earlier dicts bound to
# those names are not modified, only the names now point to new objects.
params_set_1 = dict(
    max_depth=6,
    n_estimators=1000,
    learning_rate=0.02,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
    tree_method="gpu_hist",
    gpu_id=1,
)

params_set_2 = dict(
    max_depth=5,
    n_estimators=700,
    learning_rate=0.04,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
    tree_method="gpu_hist",
    gpu_id=1,
)

params_set_3 = dict(
    max_depth=4,
    n_estimators=800,
    learning_rate=0.05,
    eval_metric="auc",
    verbosity=1,
    subsample=0.8,
    tree_method="gpu_hist",
    gpu_id=1,
)

# Name -> configuration mapping passed to fit_models as `p_dict`.
params_dict_gpu = dict(
    params_set_1=params_set_1,
    params_set_2=params_set_2,
    params_set_3=params_set_3,
)

In [19]:
# Run the GPU benchmark through the project helper utils.fit_models.
# Per the recorded output it saves a metrics dataframe to
# ../metrics/xgboost_gpu_500K_test_metrics_final.csv; presumably one fit per
# (parameter set, random state) pair — TODO confirm against utils.fit_models.
fit_models(
    # data
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    cat_feats=cat_feats,
    # experiment grid
    p_dict=params_dict_gpu,
    r_states=random_states,
    # backend selection
    model_lib="xgboost",
    fit_type="gpu",
)

Metrics df saved to: ../metrics/xgboost_gpu_500K_test_metrics_final.csv


In [20]:
# Load the GPU metrics file (semicolon-separated) and report its shape together
# with the mean ROC AUC and mean fitting time over all rows.
# NOTE: `df_res` is rebound here; any frame previously held under that name is
# no longer reachable through it.
df_res = pd.read_csv(
    '../metrics/xgboost_gpu_500K_test_metrics_final.csv',
    sep=';',
)
mean_roc_auc = df_res['roc_auc'].mean()
mean_fitting_time = df_res['fitting_time'].mean()  # presumably seconds — TODO confirm
print(df_res.shape, mean_roc_auc, mean_fitting_time)

(30, 5) 0.7696951708036192 50.11624643802643
