# Modelagem

## Loading Data and Libraries

In [1]:
import unidecode
import os
import sys

sys.path.append('../src/')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

%matplotlib inline
%load_ext autoreload
%autoreload 2

DATAPATH = '../data/'

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from bayes_opt import BayesianOptimization

In [2]:
dataset = pd.read_csv('../data/raw/preprocessed/dataset.csv')
print(dataset.shape)
dataset.head(5)

(37646, 45)


Unnamed: 0,uf_res_dif_paciente,mun_res_dif_paciente,cod_idade,num_idade,sexo,trat_anter,car_atend,rf_tomografos_computadorizados,rf_mamografos,rfressonancia_magnetica,equipes_de_saude_equipes_saude_da_familia,rh_medicos,equipes_de_saude_nucleos_de_apoio_a_saude_da_familia_nasf,rf_leitos_de_internacao,rh_enfermeiros,rf_raios_x,CAPITAL,IDHM,IDHM_Longevidade,IDHM_Educacao,AREA,ESTIMATED_POP,GVA_SERVICES,GDP,POP_GDP,GDP_CAPITA,COMP_Q,distancia_paciente_estab,tardio,tempo_atend_identificacao,tempo_autorizar,qtd_estab_uf,qtd_estab_mun,Intermediário Adjacente,Intermediário Remoto,Rural Adjacente,Rural Remoto,Sem classificação,Urbano,raca_cor_1,raca_cor_2,raca_cor_3,raca_cor_4,raca_cor_5,raca_cor_99
0,0.0,1.0,4.0,71.0,0,1,1.0,4.0,4.0,2.0,4.0,179.0,1.0,188.0,93.0,27.0,0,0.782,0.845,0.728,539087.0,81893.0,1651459.19,3331621.74,79869.0,41713.58,222.0,442.43055,1,-1,4.0,62,2.0,0,0,0,0,0,1,0,0,1,0,0,0
1,0.0,1.0,4.0,70.0,1,0,1.0,1.0,3.0,1.0,6.0,235.0,1.0,184.0,141.0,11.0,0,0.78,0.858,0.707,689.09,83173.0,1650768.35,3048217.08,83089.0,36686.17,124.0,0.306397,1,0,0.0,70,1.0,0,0,0,0,0,1,0,0,1,0,0,0
2,0.0,1.0,4.0,81.0,0,0,1.0,2.0,3.0,1.0,0.0,112.0,0.0,94.0,56.0,7.0,0,0.77,0.84,0.707,665515.0,61949.0,1571.42,2645.23,62193.0,42532.6,71.0,0.641971,1,0,0.0,70,1.0,0,0,0,0,0,1,1,0,0,0,0,0
3,0.0,1.0,4.0,30.0,0,0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,28.0,7.0,1.0,0,0.597,0.737,0.512,63279.0,4095.0,7929.41,30269.1,4204.0,7200.07,2.0,120.39575,1,0,1.0,10,0.0,0,0,1,0,0,0,0,0,0,0,0,1
4,0.0,1.0,4.0,3.0,1,0,1.0,12.0,11.0,5.0,22.0,2121.0,3.0,2076.0,810.0,81.0,1,0.763,0.835,0.694,167401.0,877640.0,12122912.17,21845480.68,877662.0,24890.54,1165.0,9.062047,1,0,0.0,10,5.0,0,0,0,0,0,1,0,0,0,0,0,1


## Separação dos conjuntos de treino e teste

In [3]:
X = dataset.drop('tardio', axis=1)
y = dataset['tardio']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Teste inicial com diferentes algoritmos

In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import lightgbm as lgb
import xgboost as xgb

In [5]:
classifiers = [('RF', RandomForestClassifier()), ('GB', GradientBoostingClassifier()), ('AB', AdaBoostClassifier()),
              ('LB', lgb.LGBMClassifier()), ('XB', xgb.XGBClassifier())]

In [6]:
for clf in classifiers:
    print(clf[0])
    
    scores = cross_val_score(estimator=clf[1], X=X_train, y=y_train, scoring='roc_auc', cv=5, n_jobs=-1)
    print('%.3f (%.3f)' % (scores.mean(), scores.std()))
    
    print('\n')

RF
0.669 (0.008)


GB
0.683 (0.005)


AB
0.637 (0.006)


LB
0.717 (0.003)


XB
0.683 (0.002)




## Otimização dos hiperparâmetros
De maneira breve, utilizando os parâmetros padrões das respectivas libs, foram testados alguns modelos que são baseados em ensemble de árvores. Depois de realizar uma validação cruzada, o modelo com melhor resultado (LightGBM) será selecionado e terá seus parâmetros otimizados com a técnica de Otimização Bayesiana.

In [7]:
n_folds = 5

In [8]:
train_set = lgb.Dataset(X_train, y_train)

In [9]:
# Parâmetros com intervalos que serão otimizados 
parameters = {
    'num_leaves': (7, 100),
    'max_depth': (3, 32),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'min_child_samples': (12, 300),
}

In [10]:
# Parâmetro para balanceamento
scale_pos_weight  = y_train[y_train==0].count() / y_train[y_train==1].count()

In [11]:
def lgb_optimization(num_leaves, max_depth, subsample, colsample_bytree, min_child_samples):
    params = {
        'objective': 'binary',
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_samples': int(min_child_samples),
        'scale_pos_weight': scale_pos_weight,
        'n_jobs': -1
    }
    
    cv_results = lgb.cv(params, train_set, nfold=n_folds, stratified=False, shuffle=True, num_boost_round = 4000, 
                        early_stopping_rounds=50, metrics='auc', seed = 50)
    
    return np.max(cv_results['auc-mean'])

In [None]:
BO = BayesianOptimization(lgb_optimization, parameters, verbose=10)
BO.maximize(init_points=2, n_iter=300, acq='ei')

|   iter    |  target   | colsam... | max_depth | min_ch... | num_le... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7166  [0m | [0m 0.9358  [0m | [0m 10.95   [0m | [0m 33.15   [0m | [0m 47.47   [0m | [0m 0.5515  [0m |
| [0m 2       [0m | [0m 0.7164  [0m | [0m 0.5636  [0m | [0m 18.87   [0m | [0m 72.71   [0m | [0m 21.35   [0m | [0m 0.6343  [0m |
| [0m 3       [0m | [0m 0.7153  [0m | [0m 0.8843  [0m | [0m 31.74   [0m | [0m 300.0   [0m | [0m 98.33   [0m | [0m 0.6309  [0m |
| [0m 4       [0m | [0m 0.7162  [0m | [0m 0.5522  [0m | [0m 30.9    [0m | [0m 13.06   [0m | [0m 99.95   [0m | [0m 0.5838  [0m |


In [None]:
BO.max

### Carregar modelo com os melhores parâmetros

In [None]:
best_params = BO.max['params']
colsample_bytree = best_params['colsample_bytree']
subsample = best_params['subsample']
max_depth = int(best_params['max_depth'])
num_leaves = int(best_params['num_leaves'])

model = lgb.LGBMClassifier(colsample_bytree=colsample_bytree, subsample=subsample, max_depth=max_depth, num_leaves=num_leaves)

In [None]:
model.fit(X_train, y_train)

In [None]:
pred = model.predict(X_test)

print('Acurácia:', accuracy_score(y_test, pred))
print('AUC ROC:', roc_auc_score(y_test, pred))

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
eval_set = [(X_val, y_val)]

In [None]:
model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds=50, eval_metric='auc', verbose=False)

In [None]:
pred = model.predict(X_test)

print('Acurácia:', accuracy_score(y_test, pred))
print('AUC ROC:', roc_auc_score(y_test, pred))