# Imports

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb

import sklearn.metrics
import pickle
from sklearn.model_selection import train_test_split

from IPython.core.display import HTML
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

# Helper functions

In [26]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


In [28]:
from functions import performace_cross_val, performace

# Load data

In [29]:
# Load the training split and the hold-out split (stored as test.csv) into DataFrames.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [30]:
# Preview the first five rows of the hold-out split.
val.head(n=5)

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response
0,196319,0,0.461538,33.0,124.0,0,0.59727,0.49827,3,1,0
1,377373,0,0.307692,8.0,124.0,1,-1.622718,0.467128,2,0,0
2,96687,1,0.692308,41.0,109.0,0,-0.245443,0.816609,2,1,0
3,303533,0,0.615385,28.0,124.0,0,1.287244,0.446367,2,1,0
4,256233,0,0.246154,36.0,26.0,0,0.500537,0.217993,2,1,1


In [31]:
# Put the hold-out columns in the same order as the training frame.
val = val[train.columns].copy()

# Full dataset (train + hold-out), passed to the cross-validation helper.
# ignore_index=True gives the stacked frame a unique 0..n-1 index; both inputs
# carry their own default index starting at 0, so without it row labels would
# be duplicated and any label-based selection would return rows from both splits.
df7 = pd.concat([train, val], axis=0, ignore_index=True)

# Fine tuning - Bayesian search

Após escolher o algoritmo que melhor se comportou nos testes, vamos melhorar o desempenho do modelo alterando seus hiperparâmetros. Para isso utilizaremos a otimização bayesiana, que utiliza um modelo probabilístico para encontrar o valor mínimo da função de erro.

Essa técnica requer uma quantidade menor de iterações para achar o melhor conjunto de parâmetros, ignorando os valores de parâmetros desnecessários e economizando tempo e poder computacional.

In [34]:
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries used below are hidden as well — consider narrowing it by category.
warnings.filterwarnings('ignore')

In [35]:
# Hyperopt search space for the XGBoost hyper-parameters.
# - n_estimators: one of the values 300, 550, ... taken from the arange grid
# - max_depth / min_child_weight: quantised (step 1) draws; they are cast with
#   int() before being handed to the classifier
# - colsample_bytree / eta: continuous uniform draws
# - seed: fixed value, not searched
space = dict(
    n_estimators=hp.choice('n_estimators', np.arange(300, 3000 + 1, 250)),
    max_depth=hp.quniform("max_depth", 3, 12, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    eta=hp.uniform('eta', 0.01, 0.2),
    seed=42,
)

Definindo os limites dos valores dos parâmetros e quais serão utilizados.

In [36]:
# Objective function for hyperopt's fmin (currently disabled).
# It builds an XGBClassifier from one sampled point of `space`, scores it with
# performace_cross_val on df7 and returns the negated recall-at-k as the loss,
# so that minimising the loss maximises recall at k.
# NOTE(review): int() on 'colsample_bytree' and 'eta' truncates both to 0
# (they are drawn from [0.5, 1] and [0.01, 0.2]); use float() for these two
# before re-enabling this cell.
# def objective_function(space):
#     xgb_model = xgb.XGBClassifier(n_estimators =int(space['n_estimators']),
#                                   max_depth = int(space['max_depth']), 
#                                   min_child_weight=int(space['min_child_weight']),
#                                   colsample_bytree=int(space['colsample_bytree']),
#                                   eta=int(space['eta']),
#                                   seed = space['seed'],
#                                   tree_method = 'gpu_hist')      
#     score = performace_cross_val(df7,'response',xgb_model,'XGB_MODEL',round_n=5,splits=5,k=20000)
#     print(space)
#     print(score['RECALL_AT_K_CROSS_VAL'][0])
#     return {'loss': -score['RECALL_AT_K_CROSS_VAL'][0],'status':STATUS_OK}

Função que retorna o valor que queremos otimizar.

In [37]:
# Search configuration (currently disabled): TPE as the suggestion algorithm,
# a Trials object to record each evaluation, and a budget of 100 evaluations.
# tpe_algorithm = tpe.suggest
# trials = Trials()
# num_eval = 100

Algoritmo de otimização

In [None]:
# Runs the search (currently disabled): minimises objective_function over `space`.
# best_paramns = fmin(fn=objective_function,space=space,algo=tpe_algorithm,max_evals=num_eval,trials=trials)

In [18]:
# Displays the dict returned by fmin (currently disabled).
# best_paramns

In [22]:
# Recorded result of a search that optimised recall at k.
# Kept commented out; no cell shown in this notebook reads this dict.
# best_paramns_select_recall_K = {'colsample_bytree': 0.8665878779090729,
#                                 'eta': 0.05078467297264412,
#                                 'max_depth': 3.0,
#                                 'min_child_weight': 0.0,
#                                 'n_estimators': 1300}

Melhores parâmetros levando em consideração o valor de recall at k

In [None]:
# Recorded result of the search that optimised the balanced accuracy score.
# This must be live code: the final-model cell reads this dict, and with the
# definition commented out a fresh kernel fails there with a NameError.
best_paramns_select_balance_acc = {
    'colsample_bytree': 0.7328187504855774,
    'eta': 0.02593212072382746,
    'max_depth': 3.0,
    'min_child_weight': 1.0,
    'n_estimators': 1800,
    'seed': 42,
}

Melhores parâmetros levando em consideração o valor de Balanced accuracy score

# Final Model

In [None]:
# Split each frame into a feature matrix and a target vector.
# 'id' is dropped together with the target so it is not used as a feature.
X_train = train.drop(columns=['id', 'response'])
y_train = train['response'].copy()

X_test = val.drop(columns=['id', 'response'])
y_test = val['response'].copy()

In [None]:
# Fit the final classifier on the training split with the tuned parameters.
# The integer-valued parameters are stored as floats in the dict, hence int().
# colsample_bytree and eta are fractions and must be passed with float():
# int() would floor 0.73 and 0.026 to 0, discarding the tuned values.
xgb_model_final = xgb.XGBClassifier(
    n_estimators=int(best_paramns_select_balance_acc['n_estimators']),
    max_depth=int(best_paramns_select_balance_acc['max_depth']),
    min_child_weight=int(best_paramns_select_balance_acc['min_child_weight']),
    colsample_bytree=float(best_paramns_select_balance_acc['colsample_bytree']),
    eta=float(best_paramns_select_balance_acc['eta']),
    seed=42,
).fit(X_train, y_train.values.ravel())

In [None]:
# Cross-validated metrics for the final model on the full dataset (df7).
# The original call passed `xgb_model`, a name that is never defined at
# notebook scope, so it raised NameError on a fresh kernel.
# NOTE(review): assumes performace_cross_val refits the estimator on each fold — confirm in functions.py.
performace_cross_val(df7,'response',xgb_model_final,'XGB_MODEL',round_n=5,splits=5,k=20000)

In [None]:
#model prediction proba - poder de generalização
yhat_proba = xgb_model_final.predict_proba( X_test )

#model prediction - poder de generalização
yhat_class = xgb_model_final.predict( X_test )

In [None]:
# Attach the probability of the second class (column 1 of predict_proba) to a
# copy of the hold-out frame and rank the rows from highest to lowest score.
model_df = (
    val
    .assign(model_score=yhat_proba[:, 1].tolist())
    .sort_values('model_score', ascending=False)
)

In [None]:
# Build the metrics table for the final model, evaluated with k = 20000.
xgb_performace = performace(
    "XGBoost",
    model_df,
    y_test,
    yhat_class,
    20000,
)

In [None]:
# Display the metrics table returned by performace().
xgb_performace

Unnamed: 0,Model Name,Precison,Recall,Balanced_acc,k,Precison at k,Recall at k
0,XGBoost,0.430962,0.007357,0.502999,20000,0.360632,0.515177


In [None]:
# Display the hold-out rows ranked by model_score (highest first).
model_df

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response,model_score
51525,299944,0,0.292308,18.0,124.0,0,6.014980,0.640138,2,1,0,0.731972
112906,247986,0,0.215385,28.0,124.0,0,3.507987,0.532872,2,1,0,0.702438
51094,139716,1,0.507692,28.0,124.0,0,5.967805,0.667820,3,1,0,0.687480
92549,232888,0,0.523077,28.0,26.0,0,5.915923,0.795848,2,1,0,0.685657
20598,334982,0,0.600000,51.0,157.0,0,5.916736,0.432526,2,1,1,0.642638
...,...,...,...,...,...,...,...,...,...,...,...,...
109035,38211,1,0.000000,25.0,160.0,1,0.051090,0.515571,1,0,0,0.000057
97963,103815,1,0.000000,9.0,160.0,1,-1.622718,0.525952,1,0,0,0.000055
60522,318790,1,0.000000,25.0,160.0,1,-1.622718,0.318339,1,0,0,0.000055
89617,222233,1,0.000000,47.0,160.0,1,-0.325038,0.166090,1,0,0,0.000049


In [None]:
# Persist the fitted model to ../models (currently disabled).
# NOTE(review): if re-enabled, open the file inside a `with` block so the
# handle is closed after the dump instead of being left open.
#pickle.dump(xgb_model_final,open('../models/xgb_model_final.pkl','wb'))