# Imports

In [29]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb

import sklearn.metrics
import pickle
from sklearn.model_selection import train_test_split

from IPython.core.display import HTML
pd.options.mode.chained_assignment = None  # default='warn'

In [31]:
# from google.colab import drive
# drive.mount('/content/drive')

# Helper functions

In [32]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


In [33]:
# import sys 
# path_to_module = '/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell'
# sys.path.append(path_to_module)

In [34]:
from functions import performace_cross_val, performace

# Load data

In [35]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/test.csv')

In [36]:
val.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response
0,196319,0,0.461538,33.0,124.0,0,0.59727,0.49827,3,1,0
1,377373,0,0.307692,8.0,124.0,1,-1.622718,0.467128,2,0,0
2,96687,1,0.692308,41.0,109.0,0,-0.245443,0.816609,2,1,0
3,303533,0,0.615385,28.0,124.0,0,1.287244,0.446367,2,1,0
4,256233,0,0.246154,36.0,26.0,0,0.500537,0.217993,2,1,1


In [37]:
val = val[train.columns].copy()
#df7 = pd.concat([train,val],axis=0)

In [38]:
#df7.head()

In [39]:
#df7.dtypes

# Fine tuning - Baysian search

Após escolher o algoritimo que melhor se comportou nos testes, vamos melhorar o desempenho do modelo alterando seus hyperparametros. Para isso utilizaremos a optimização baysiana, que é um modelo probabilistico utilizado para encontar o minimo erro da função

Essa técnica requer uma quantidade menor de iterações para achar o melhor conjunto de parametros, ignorando os valores de parametros desnecessários e economizando tempo e poder computacional.

In [40]:
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK

import warnings
warnings.filterwarnings('ignore')

In [41]:
space={'n_estimators': hp.choice('n_estimators',np.arange(300,3000+1,250)),
      'max_depth': hp.quniform("max_depth", 3, 12, 1),
      'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
      'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
      'eta': hp.uniform('eta',0.01,0.2),
      'seed': 42
    }

Definindo os limites do valores dos parametros e quais serão utilizados

In [42]:
# def objective_function(space):
#     xgb_model = xgb.XGBClassifier(n_estimators =int(space['n_estimators']),
#                                   max_depth = int(space['max_depth']), 
#                                   min_child_weight=int(space['min_child_weight']),
#                                   colsample_bytree=int(space['colsample_bytree']),
#                                   eta=int(space['eta']),
#                                   seed = space['seed'],
#                                   tree_method = 'gpu_hist')      
#     score = performace_cross_val(df7,'response',xgb_model,'XGB_MODEL',round_n=5,splits=6,k=20000)
#     print(space)
#     print(score['RECALL_AT_K_CROSS_VAL'][0])
#     return {'loss': -score['RECALL_AT_K_CROSS_VAL'][0],'status':STATUS_OK}

Função que retornar o valor que queremos otimizar.

In [43]:
tpe_algorithm = tpe.suggest
trials = Trials()
num_eval = 100

Algoritmo de otimização

In [44]:
#best_paramns = fmin(fn=objective_function,space=space,algo=tpe_algorithm,max_evals=num_eval,trials=trials)

In [45]:
#best_paramns

In [46]:
# best_paramns_select_recall@K = {'colsample_bytree': 0.8665878779090729,
#                                 'eta': 0.05078467297264412,
#                                 'max_depth': 3.0,
#                                 'min_child_weight': 0.0,
#                                 'n_estimators': 1300}

Melhores parâmetros levando em consideração o valor de recall at k

In [47]:
# best_paramns_select_balance_acc = {'colsample_bytree': 0.5510880215728314,
#                                     'eta': 0.09226011272550536,
#                                     'max_depth': 12.0,
#                                     'min_child_weight': 4.0,
#                                     'n_estimators': 1050}

In [48]:
best_paramns_select_balance_acc = {'colsample_bytree': 0.7328187504855774, 'eta': 0.02593212072382746, 'max_depth': 3.0, 'min_child_weight': 1.0, 'n_estimators': 1800, 'seed': 42}

Melhores parâmetros levando em consideração o valor de Balanced accuracy score

# Final Model

In [49]:
X_train = train.copy()
y_train = X_train['response']
X_train.drop(columns=['id','response'],inplace = True)

X_test = val.copy()
y_test = X_test['response']
X_test.drop(columns=['id','response'],inplace = True)

In [50]:
xgb_model_final = xgb.XGBClassifier(n_estimators =int(best_paramns_select_balance_acc['n_estimators']),
                                  max_depth = int(best_paramns_select_balance_acc['max_depth']), 
                                  min_child_weight=int(best_paramns_select_balance_acc['min_child_weight']),
                                  colsample_bytree=int(best_paramns_select_balance_acc['colsample_bytree']),
                                  eta=int(best_paramns_select_balance_acc['eta']),
                                  seed = 42).fit(X_train,y_train.values.ravel()) 

In [51]:
#model prediction proba - poder de generalização
yhat_proba = xgb_model_final.predict_proba( X_test )

#model prediction - poder de generalização
yhat_class = xgb_model_final.predict( X_test )

In [52]:
model_df = val.copy()
model_df['model_score'] = yhat_proba[:,1].tolist()
# sorte by score
model_df = model_df.sort_values('model_score',ascending=False)

In [53]:
xgb_performace = performace("XGBoost",model_df,y_test,yhat_class,20000)

In [54]:
xgb_performace

Unnamed: 0,Model Name,Precison,Recall,Balanced_acc,k,Precison at k,Recall at k
0,XGBoost,0.0,0.0,0.5,20000,0.125894,0.179844


In [55]:
model_df

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response,model_score
0,196319,0,0.461538,33.0,124.0,0,0.597270,0.498270,3,1,0,0.5
76055,206612,1,0.769231,8.0,26.0,0,-0.140460,0.799308,2,1,0,0.5
76066,99903,1,0.507692,1.0,26.0,0,-1.622718,0.089965,3,1,0,0.5
76065,289557,0,0.430769,28.0,26.0,0,-1.622718,0.346021,2,1,0,0.5
76064,224629,0,0.046154,9.0,152.0,1,-0.345256,0.093426,1,0,0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
38027,2972,0,0.138462,39.0,124.0,1,-0.193503,0.712803,1,0,0,0.5
38026,299296,1,0.000000,42.0,160.0,1,0.115172,0.871972,1,0,0,0.5
38025,245297,0,0.430769,41.0,26.0,0,1.383861,0.273356,2,1,0,0.5
38024,366938,0,0.061538,28.0,152.0,0,-1.622718,0.280277,1,1,0,0.5


In [56]:
#pickle.dump(xgb_model_final,open('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/models/xgb_model_final.pkl','wb'))