# 2) IACOV - Model Development for Prognosis - ICU
## Strategy 6) Training on 70% of a specific hospital plus same absolute number of all other hospitals

In [1]:
# !pip install npm nodejs

In [2]:
# !jupyter lab clean
# !jupyter lab build

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# para evitarmos a exibição dos dados em notacao científica
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#comment next line to not use MLFlow for cleaning data
from MLFlow_Classification import *
from MLFlow_Utils import *

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### This code uses just 14 hospitals, since ICU or MV has no variation for 4 hospitals
#### - The code that prepares the data to run all strategies for ICU and MV is: PREPARING_DATA_TO_UCI_MV_ANALYSES
#### - df_iacov_en.csv (used for death) was replaced here by df_iacov_model_lean_14hosp.csv
#### - put df_iacov_model_lean_14hosp.csv in same folder as this notebook

In [4]:
#Put csv in same folder as this notebook
dataset = pd.read_csv("df_iacov_model_lean_14hosp.csv", delimiter=";")
df_iacov_model = pd.DataFrame(dataset)
df_iacov_model.shape

(6046, 30)

In [5]:
df_iacov_model.columns

Index(['city_hospital', 'age', 'male', 'race', 'heart_rate', 'resp_rate',
       'sys_press', 'dias_press', 'mean_press', 'temp', 'hemoglobin',
       'platelets', 'hematocrit', 'red_cells_count', 'hcm', 'rdw', 'mcv',
       'leukocytes', 'neutrophil', 'lymphocytes', 'basophils', 'eosinophils',
       'monocytes', 'crp', 'death', 'icu', 'mv', 'state', 'region', 'excluir'],
      dtype='object')

## 2.1) Assign binary outcome ICU to be predicted

### 2.1.1) Maintain essential variables (LEAN Model)

In [6]:
#22 predictors + outcome + Region/State
df_iacov_model_lean = df_iacov_model[['city_hospital'
                                      ,'age'
                                      ,'male'
                                      ,'heart_rate'
                                      ,'resp_rate'
                                      ,'sys_press'
                                      ,'dias_press'
                                      ,'mean_press'
                                      ,'temp'
                                      ,'hemoglobin'
                                      ,'platelets'
                                      ,'hematocrit'
                                      ,'red_cells_count'
                                      ,'hcm'
                                      ,'rdw'
                                      ,'mcv'
                                      ,'leukocytes'
                                      ,'neutrophil'
                                      ,'lymphocytes'
                                      ,'basophils'
                                      ,'eosinophils'
                                      ,'monocytes'
                                      ,'crp'
                                      ,'icu'
                                      ,'region' #for filtering purposes
                                      ,'state' #for filtering purposes
                                     ]]
df_iacov_model_lean.shape

(6046, 26)

In [7]:
df_iacov_model_lean.city_hospital.value_counts()

city_hospital
HC_USP                        1500
HOSPPORTUGUES_SALVADOR        1359
CEARA_UNIMED                   845
HRL_2021_02                    539
HMV_POA_02                     456
RIO_DE_JANEIRO_UNIMED          449
HOSPSANTAJULIA_MANAUS          247
HOSPSANTACATARINABLUMENAU      148
HOSPSAOFRANCISCO_MOGIGUACU     124
PERNAMBUCO_FULL                112
PELOTAS                         91
CEARA_HUWC                      73
HEVV                            56
AMAZONAS_HUGV                   47
Name: count, dtype: int64

In [8]:
df_iacov_model_lean.to_csv('df_iacov_model_lean.csv', sep=';', index=False)

### 2.1.2) Filter a specific hospital

In [9]:
#Put csv in same folder as this notebook
df_all_hospitals = pd.read_csv("df_iacov_model_lean.csv", delimiter=";", decimal=".")
df_all_hospitals.shape

(6046, 26)

In [10]:
df_all_hospitals.columns

Index(['city_hospital', 'age', 'male', 'heart_rate', 'resp_rate', 'sys_press',
       'dias_press', 'mean_press', 'temp', 'hemoglobin', 'platelets',
       'hematocrit', 'red_cells_count', 'hcm', 'rdw', 'mcv', 'leukocytes',
       'neutrophil', 'lymphocytes', 'basophils', 'eosinophils', 'monocytes',
       'crp', 'icu', 'region', 'state'],
      dtype='object')

In [11]:
df_all_hospitals.region.value_counts()

region
NORDESTE       2389
SUDESTE        2129
SUL             695
CENTROOESTE     539
NORTE           294
Name: count, dtype: int64

In [12]:
column_summaries(df_all_hospitals)

Unnamed: 0,Missing Values,% missing of Total Values,# Unique Values,# Values as [0],datatype,skew,count,mean,std,min,25%,50%,75%,max
city_hospital,0,0.0,14,0,object,,,,,,,,,
age,0,0.0,98,0,float64,-0.06,6046.0,57.521,17.869,18.0,44.0,58.0,71.0,105.0
male,0,0.0,2,2790,float64,-0.155,6046.0,0.539,0.499,0.0,0.0,1.0,1.0,1.0
heart_rate,809,13.381,183,0,float64,0.611,5237.0,85.689,17.209,13.0,75.0,85.0,96.0,233.0
resp_rate,1457,24.099,69,1,float64,2.897,4589.0,21.881,6.38,0.0,18.0,20.0,24.0,108.0
sys_press,1336,22.097,215,0,float64,-0.279,4710.0,124.054,22.791,10.0,110.0,122.0,138.0,242.0
dias_press,1325,21.915,150,0,float64,-0.361,4721.0,74.608,14.626,6.0,67.0,76.0,81.0,141.0
mean_press,1366,22.593,1452,326,float64,-1.751,4680.0,83.731,28.475,0.0,78.981,90.0,98.648,166.0
temp,1173,19.401,128,1,float64,62.972,4873.0,36.43,4.976,0.0,36.0,36.3,36.8,372.0
hemoglobin,1904,31.492,226,0,float64,-0.628,4142.0,12.752,2.189,1.4,11.692,13.0,14.2,30.6


In [13]:
import ipywidgets as widgets
from IPython.display import clear_output

In [14]:
df_all_hospitals.city_hospital.value_counts()

city_hospital
HC_USP                        1500
HOSPPORTUGUES_SALVADOR        1359
CEARA_UNIMED                   845
HRL_2021_02                    539
HMV_POA_02                     456
RIO_DE_JANEIRO_UNIMED          449
HOSPSANTAJULIA_MANAUS          247
HOSPSANTACATARINABLUMENAU      148
HOSPSAOFRANCISCO_MOGIGUACU     124
PERNAMBUCO_FULL                112
PELOTAS                         91
CEARA_HUWC                      73
HEVV                            56
AMAZONAS_HUGV                   47
Name: count, dtype: int64

In [15]:
# Hospitals excluded for ICU and MV analyses: 'HOSPSANTACASASP_FULL', 'HOSPGRUPOSANTA', 'HUTRIN_2021_02', 'RIO_DE_JANEIRO_HUCFF'
dropdown_hospital = widgets.Dropdown(options = ['Choose a hospital ...'
                                                ,'HC_USP'                     #SP
                                                ,'HOSPPORTUGUES_SALVADOR'     #BA
                                                ,'CEARA_UNIMED'               #CE
                                                ,'HRL_2021_02'                #GO - Luiziania
                                                ,'HMV_POA_02'                 #RS - Moinhos de Vento
                                                ,'RIO_DE_JANEIRO_UNIMED'      #RJ
                                                ,'HOSPSANTAJULIA_MANAUS'      #AM
                                                ,'HOSPSANTACATARINABLUMENAU'  #SC
                                                ,'HOSPSAOFRANCISCO_MOGIGUACU' #SP
                                                ,'PERNAMBUCO_FULL'            #PE
                                                ,'PELOTAS'                    #RS
                                                ,'CEARA_HUWC'                 #CE
                                                ,'HEVV'                       #ES - Vila Velha
                                                ,'AMAZONAS_HUGV'              #AM
#                                                 ,'GHC_02'                     #RS 
                                               ])

In [16]:
specific_hospital = 'Choose a hospital'
df_iacov_model_lean = df_all_hospitals

def dropdown_hospital_eventhandler(change):
    global specific_hospital 
    specific_hospital = change.new
    dropdown_hospital.observe(dropdown_hospital_eventhandler, names='value')

In [17]:
dropdown_hospital.observe(dropdown_hospital_eventhandler, names='value')

**Select Hospital to train**

In [18]:
# !pip install ipywidgets

In [19]:
# !jupyter nbextension enable --py widgetsnbextension

In [20]:
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [22]:
display(dropdown_hospital)

Dropdown(index=11, options=('Choose a hospital ...', 'HC_USP', 'HOSPPORTUGUES_SALVADOR', 'CEARA_UNIMED', 'HRL_…

**Filter all hospitals except the selected above**

In [23]:
df_all_hospitals.region.value_counts()

region
NORDESTE       2389
SUDESTE        2129
SUL             695
CENTROOESTE     539
NORTE           294
Name: count, dtype: int64

**Split train/test of specific Hospital**

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
#Filter the specific hospital
df_iacov_model_lean_specific = df_all_hospitals[df_all_hospitals.city_hospital == specific_hospital]
df_iacov_model_lean_specific.shape

(91, 26)

In [26]:
X_specific = df_iacov_model_lean_specific.drop(['icu'],axis=1)
X_specific.shape

(91, 25)

In [27]:
y_specific = df_iacov_model_lean_specific['icu']
y_specific.shape

(91,)

In [28]:
X_train_specific, X_test_specific, y_train_specific, y_test_specific = train_test_split(X_specific, y_specific, test_size=0.30, random_state=42, stratify=y_specific)

In [29]:
X_train_specific.shape

(63, 25)

In [30]:
X_test_specific.shape

(28, 25)

**Filter same absolute number of specific hospital from all hospitals**

PS: Also removed GHC hospital due to very low positive cases

In [31]:
from random import sample

#Sample all hospitals except specific hospital
df_all_hospitals_except_specific = df_all_hospitals[ (~df_all_hospitals.index.isin(X_specific.index)) & 
                                                     (df_all_hospitals.city_hospital != 'GHC_02')]

#Then, sample all hospital data with same absolute number as specific train data
df_iacov_model_lean = df_all_hospitals_except_specific.sample(X_train_specific.shape[0], random_state=42)
df_iacov_model_lean.shape

(63, 26)

In [32]:
#check data without specific
df_iacov_model_lean.city_hospital.value_counts()

city_hospital
HOSPPORTUGUES_SALVADOR        16
HC_USP                        15
CEARA_UNIMED                  11
HRL_2021_02                    6
HMV_POA_02                     3
RIO_DE_JANEIRO_UNIMED          3
HOSPSANTAJULIA_MANAUS          2
HOSPSAOFRANCISCO_MOGIGUACU     2
HOSPSANTACATARINABLUMENAU      2
PERNAMBUCO_FULL                2
CEARA_HUWC                     1
Name: count, dtype: int64

**Append 70% data of specific hospital**

In [33]:
df_iacov_model_lean = pd.concat([df_iacov_model_lean, X_train_specific], axis=0)
df_iacov_model_lean.shape

(126, 26)

In [34]:
#Check after apending data of specific hospital
df_iacov_model_lean.city_hospital.value_counts()

city_hospital
PELOTAS                       63
HOSPPORTUGUES_SALVADOR        16
HC_USP                        15
CEARA_UNIMED                  11
HRL_2021_02                    6
HMV_POA_02                     3
RIO_DE_JANEIRO_UNIMED          3
HOSPSANTAJULIA_MANAUS          2
HOSPSAOFRANCISCO_MOGIGUACU     2
HOSPSANTACATARINABLUMENAU      2
PERNAMBUCO_FULL                2
CEARA_HUWC                     1
Name: count, dtype: int64

In [35]:
print("Specific: " + specific_hospital )
print("(training - all regions - same absolute number) dataset: " + str(df_iacov_model_lean.shape))

print("\n(training - specific) dataset: {}".format(X_train_specific.shape))

print("\n(test - specific) dataset: {}".format(X_test_specific.shape))

Specific: PELOTAS
(training - all regions - same absolute number) dataset: (126, 26)

(training - specific) dataset: (63, 25)

(test - specific) dataset: (28, 25)


### 2.1.3) Dropping unused variables (All hospitals)

In [36]:
df_iacov_model_lean = df_iacov_model_lean.drop(['city_hospital'],axis=1)
df_iacov_model_lean.shape

(126, 25)

In [37]:
#Preencho NAs com 0 
df_iacov_model_lean['icu'] = df_iacov_model_lean['icu'].fillna(0)
df_iacov_model_lean.icu.value_counts()

icu
0.000    102
1.000     24
Name: count, dtype: int64

In [38]:
(df_iacov_model_lean.icu.value_counts()/len(df_iacov_model_lean.icu))*100

icu
0.000   80.952
1.000   19.048
Name: count, dtype: float64

### From now on the outcome will call CLASS

In [39]:
df_iacov_model_lean['class'] = df_iacov_model_lean['icu'].astype('int')
df_iacov_model_lean = df_iacov_model_lean.drop(['icu'],axis=1)

### 2.1.4) Recalculating null mean_press

In [40]:
def isNullMeanPressure(row):

    if pd.isnull(row['mean_press']):
    
        if pd.notnull(row['sys_press']) and pd.notnull(row['dias_press']):
            return (row['sys_press']+row['dias_press'])/2
        else:
            return row['mean_press']
    else:
        return row['mean_press']

In [41]:
#Check mean_press missing before recalculating
column_summaries(df_iacov_model_lean)

Unnamed: 0,Missing Values,% missing of Total Values,# Unique Values,# Values as [0],datatype,skew,count,mean,std,min,25%,50%,75%,max
age,0,0.0,61,0,float64,-0.131,126.0,58.151,17.921,20.0,45.25,58.0,70.0,96.0
male,0,0.0,2,57,float64,-0.194,126.0,0.548,0.5,0.0,0.0,1.0,1.0,1.0
heart_rate,9,7.143,53,0,float64,1.141,117.0,85.167,18.598,53.0,73.0,82.0,94.0,160.0
resp_rate,22,17.46,24,0,float64,1.011,104.0,22.399,5.401,12.0,19.0,21.0,25.25,40.0
sys_press,18,14.286,47,0,float64,0.028,108.0,125.194,25.329,11.0,112.75,123.0,140.0,242.0
dias_press,18,14.286,45,0,float64,-0.523,108.0,75.394,15.407,7.0,69.0,77.5,80.0,120.0
mean_press,77,61.111,41,2,float64,-1.363,49.0,86.897,28.035,0.0,80.0,91.5,106.0,161.0
temp,14,11.111,37,0,float64,0.682,112.0,36.473,0.994,34.4,35.8,36.3,36.925,39.0
hemoglobin,24,19.048,58,0,float64,-0.274,102.0,12.504,2.303,6.6,10.925,12.65,13.975,17.8
platelets,12,9.524,102,0,float64,0.408,114.0,235353.886,122831.181,222.0,148000.0,222500.0,304500.0,516000.0


In [42]:
df_iacov_model_lean['mean_press'] = df_iacov_model_lean.apply(isNullMeanPressure, axis=1)

In [43]:
#Check mean_press missing after recalculating
#column_summaries(df_iacov_model_lean)

In [44]:
X_test_specific['mean_press'] = X_test_specific.apply(isNullMeanPressure, axis=1)

In [45]:
#column_summaries(X_test_specific)

### Put X_test and y_test in separate files

In [46]:
x_test_specific_name = 'X_test_all_regions_abs_number_' + X_test_specific.region.iloc[0]  \
                        + '_' + X_test_specific.state.iloc[0]  \
                        + '_' + X_test_specific.city_hospital.iloc[0] +  ".csv"
x_test_specific_name

'X_test_all_regions_abs_number_SUL_RS_PELOTAS.csv'

In [47]:
y_test_specific_name = 'y_icu_all_regions_abs_number_' + X_test_specific.region.iloc[0]  \
                        + '_' + X_test_specific.state.iloc[0]  \
                        + '_' + X_test_specific.city_hospital.iloc[0] +  ".csv"
y_test_specific_name

'y_icu_all_regions_abs_number_SUL_RS_PELOTAS.csv'

In [48]:
X_test_specific.to_csv(x_test_specific_name, sep=';')
X_test_specific.shape

(28, 25)

In [49]:
y_test_specific.to_csv(y_test_specific_name, sep=';')
y_test_specific.shape

(28,)

## 2.2) Prepare Experiment 

### All hospitals except specific

In [50]:
df_iacov_model_lean.shape

(126, 25)

In [51]:
df_iacov_model_train = df_iacov_model_lean

In [52]:
df_iacov_model_train = df_iacov_model_train.drop(['region'],axis=1)
df_iacov_model_train = df_iacov_model_train.drop(['state'],axis=1)
df_iacov_model_train.columns

Index(['age', 'male', 'heart_rate', 'resp_rate', 'sys_press', 'dias_press',
       'mean_press', 'temp', 'hemoglobin', 'platelets', 'hematocrit',
       'red_cells_count', 'hcm', 'rdw', 'mcv', 'leukocytes', 'neutrophil',
       'lymphocytes', 'basophils', 'eosinophils', 'monocytes', 'crp', 'class'],
      dtype='object')

In [53]:
df_iacov_model_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126 entries, 2291 to 4477
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              126 non-null    float64
 1   male             126 non-null    float64
 2   heart_rate       117 non-null    float64
 3   resp_rate        104 non-null    float64
 4   sys_press        108 non-null    float64
 5   dias_press       108 non-null    float64
 6   mean_press       110 non-null    float64
 7   temp             112 non-null    float64
 8   hemoglobin       102 non-null    float64
 9   platelets        114 non-null    float64
 10  hematocrit       98 non-null     float64
 11  red_cells_count  34 non-null     float64
 12  hcm              111 non-null    float64
 13  rdw              111 non-null    float64
 14  mcv              111 non-null    float64
 15  leukocytes       113 non-null    float64
 16  neutrophil       37 non-null     float64
 17  lymphocytes      

In [54]:
exp = setup(df_iacov_model_train
            , target='class'
            , categorical_features = ['male']
            , numeric_features = ['crp','basophils','eosinophils','red_cells_count','monocytes','hemoglobin','resp_rate','neutrophil','hematocrit']
            , normalize=True
            #, remove_multicollinearity=True
            ,numeric_imputation='median'
#             ,numeric_imputation='ignore'
            #,multicollinearity_threshold=0.9
            , resample=True
            , resample_method='random_over'
#             , train_size = 1.0 #100pct for training
           )

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(126, 23)"
4,Missing Values,True
5,Numeric Features,21
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [55]:
X, y, X_train, X_test, y_train, y_test, seed, prep_pipe, _ = exp

In [56]:
X_train.shape

(142, 22)

In [57]:
X_test.shape

(38, 22)

In [58]:
X_test.columns

Index(['age', 'heart_rate', 'resp_rate', 'sys_press', 'dias_press',
       'mean_press', 'temp', 'hemoglobin', 'platelets', 'hematocrit',
       'red_cells_count', 'hcm', 'rdw', 'mcv', 'leukocytes', 'neutrophil',
       'lymphocytes', 'basophils', 'eosinophils', 'monocytes', 'crp',
       'male_1.0'],
      dtype='object')

In [59]:
seed

42

In [60]:
# Xtrain.head()

### Filtering only 5 pre-selected models

In [61]:
# print(inspect.getsource(compare_models))

In [62]:
#Modelos que suportam missing (1) - Modelos pre-selecionados(2)
compare_models(blacklist = ["lr","knn","nb","dt","svm","rbfsvm","gpc","ridge","qda","ada","gbc","lda","et","mlp","rf"] , turbo = False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,Light Gradient Boosting Machine,0.9233,0.9959,0.9857,0.8625,0.8849,0.9297,0.8474,0.6704,0.4712,0.784
1,CatBoost Classifier,0.951,0.9959,0.9571,0.9446,0.9542,0.9506,0.9018,0.9325,0.4565,0.8605
2,Extreme Gradient Boosting,0.9362,0.9816,0.9857,0.8857,0.9052,0.9415,0.8722,0.7035,0.4179,0.8092


### Tune selected models by AUC (and hyperopt bayesian optimization) to use on test data

In [63]:
cattuned = tune_model('catboost', optimize='AUC', n_iter=20)

Unnamed: 0,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,0.933,1.0,1.0,0.875,0.875,0.933,0.867,0.911,0.329,0.796
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.41,0.968
2,0.857,1.0,0.714,1.0,1.0,0.833,0.714,0.897,0.742,0.744
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.608,0.996
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.996,0.434,0.933
5,0.857,0.918,0.857,0.857,0.857,0.857,0.714,0.0,0.129,0.609
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.617,0.996
7,0.929,1.0,1.0,0.857,0.875,0.933,0.857,0.906,0.733,0.808
8,0.857,1.0,1.0,0.714,0.778,0.875,0.714,0.735,0.856,0.692
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.356,0.9


In [64]:
lgbmtuned = tune_model('lightgbm', optimize='AUC', n_iter=20)

Unnamed: 0,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,0.8,1.0,1.0,0.625,0.7,0.824,0.609,0.087,0.011,0.465
1,0.933,1.0,1.0,0.857,0.889,0.941,0.865,0.988,0.792,0.878
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.706,0.999
3,0.929,1.0,1.0,0.857,0.875,0.933,0.857,0.93,0.178,0.838
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.996,0.3,0.928
5,0.857,0.959,0.857,0.857,0.857,0.857,0.714,0.0,0.0,0.442
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.879,1.0
7,0.929,1.0,1.0,0.857,0.875,0.933,0.857,0.817,0.538,0.78
8,0.857,1.0,1.0,0.714,0.778,0.875,0.714,0.0,0.0,0.446
9,0.929,1.0,1.0,0.857,0.875,0.933,0.857,0.982,0.98,0.872


In [65]:
xgbtuned = tune_model('xgboost', optimize='AUC', n_iter=20)

Unnamed: 0,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,0.533,0.732,1.0,0.125,0.5,0.667,0.118,0.01,0.191,-0.043
1,0.533,1.0,1.0,0.0,0.533,0.696,0.0,0.232,0.953,0.272
2,0.786,1.0,1.0,0.571,0.7,0.824,0.571,0.303,0.31,0.377
3,0.786,1.0,1.0,0.571,0.7,0.824,0.571,0.37,0.412,0.442
4,0.714,0.959,1.0,0.429,0.636,0.778,0.429,0.408,0.344,0.335
5,0.786,0.898,0.857,0.714,0.75,0.8,0.571,0.431,0.47,0.34
6,0.786,1.0,1.0,0.571,0.7,0.824,0.571,0.488,0.171,0.523
7,0.714,1.0,1.0,0.429,0.636,0.778,0.429,0.239,0.576,0.353
8,0.714,1.0,1.0,0.429,0.636,0.778,0.429,0.17,0.805,0.307
9,0.786,1.0,1.0,0.571,0.7,0.824,0.571,0.259,0.463,0.354


### Select best algorithm based on AUC in predicts

In [66]:
preds_cat = predict_model(cattuned)

Unnamed: 0,Model,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,CatBoost Classifier,0.816,0.899,0.571,0.871,0.5,0.533,0.419,0.84,0.693,0.329


In [67]:
preds_lgb = predict_model(lgbmtuned)

Unnamed: 0,Model,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,Light Gradient Boosting Machine,0.868,0.876,0.714,0.903,0.625,0.667,0.585,0.0,0.009,0.226


In [68]:
preds_xgb = predict_model(xgbtuned)

Unnamed: 0,Model,Accuracy,AUC,Recall,Specificity,Prec.,F1,Kappa,Hosmer_lemeshow,Spiegelhalter,Scaled_brier
0,Extreme Gradient Boosting,0.5,0.926,1.0,0.387,0.269,0.424,0.189,0.0,0.357,-0.641


In [69]:
# Put in PREDS the best algorithm chosen in the previous lines: preds_cat OR preds_lgb OR preds_xgb
#preds = preds_cat
#preds = preds_lgb
preds = preds_xgb

### Best model

In [70]:
npv(y_test, preds['Label'])

1.0

In [71]:
#ROC AUC (INTERVALO)
["{0:0.2f}".format(i) for i in AUC_CI(y_test, preds['Score'].values)]

['0.83', '1.00']

In [74]:
import MLFlow_Utils
binary_classification_metrics(y_test, preds["Label"], preds["Score"])

ZeroDivisionError: float division by zero

### Understand best algorithm features and results
#### Run INTERPRETE_MODEL only for the best algorithm chosen in the previous lines

In [None]:
#evaluate_model(lgbmtuned)

In [None]:
#interpret_model(cattuned)
#interpret_model(lgbmtuned)
interpret_model(xgbtuned)

In [None]:
#plot_model(lgbmtuned, plot='ktops')

### 2.3.1) Build a model with all data
#### Run FINALIZE MODEL only for the best algorithm chosen in the previous lines

In [None]:
#final_model = finalize_model(cattuned)
#final_model = finalize_model(lgbmtuned)
final_model = finalize_model(xgbtuned)

In [None]:
str(final_model.__class__.__name__)

In [None]:
final_model_name = str(final_model.__class__.__name__) + '_all_regions_abs_number_' \
                    + X_test_specific.region.iloc[0]  \
                    + '_' + X_test_specific.state.iloc[0]  \
                    + '_' + X_test_specific.city_hospital.iloc[0]
final_model_name

In [None]:
save_model(final_model, final_model_name ,verbose=True)

In [None]:
#final_model.get_all_params()
final_model

# Use the saved model to predict on specific hospital
## ALL_REGIONS_ABS_NUMBER

In [None]:
import pickle
import joblib

#Carrego o modelo unico
print('loaded: ' + final_model_name)
all_regions_abs_number_model_pkl = final_model_name + ".pkl"
prep_pipe_all_regions_abs_number, all_regions_abs_number_model = joblib.load(all_regions_abs_number_model_pkl)

In [None]:
all_regions_abs_number_model.__class__.__name__

In [None]:
x_test_specific_name

In [None]:
#Carrego o X_test do modelo único
X_test_single = pd.read_csv(x_test_specific_name, delimiter=";",index_col=0)
X_test_single.shape

In [None]:
X_test_single.columns

In [None]:
#Carrego o Y_test do modelo único
y_test_single = pd.read_csv(y_test_specific_name, delimiter=";",index_col=0)
y_test_single = y_test_single["icu"]
y_test_single.shape

In [None]:
pd.set_option('display.max_columns', None)
X_test_single.head()

In [None]:
X_test_single["class"] = y_test_single

In [None]:
X_test_single.columns

In [None]:
X_test_single = X_test_single.drop(['city_hospital'],axis=1)
X_test_single = X_test_single.drop(['region'],axis=1)
X_test_single = X_test_single.drop(['state'],axis=1)

In [None]:
X_test_single_transformed = prep_pipe_all_regions_abs_number.fit_transform(X_test_single)
X_test_single_transformed.head()

In [None]:
X_test_single_transformed = X_test_single_transformed.drop(['class'],axis=1)

In [None]:
X_test.columns

In [None]:
X_test_single_transformed.columns

In [None]:
X_test.columns

In [None]:
#Show distinct columns
distinct_columns = X_test[X_test.columns.difference(X_test_single_transformed.columns)].columns.tolist()
distinct_columns

### Add missing columns to some hospitals that didn't collect all columns
e.g. HC_USP (missing: red_cells_count and eosinophils)

In [None]:
for column in distinct_columns:
    if(column=='male_0.0'):
        X_test_single_transformed['male_0.0'] = X_test_single_transformed['male_1.0'].apply(lambda x: 1.0 if x==0.0 else 0.0)
        X_test_single_transformed = X_test_single_transformed.drop('male_1.0', axis=1)
    elif(column=='male_1.0'):
        X_test_single_transformed['male_1.0'] = X_test_single_transformed['male_0.0'].apply(lambda x: 1.0 if x==0.0 else 0.0)
        X_test_single_transformed = X_test_single_transformed.drop('male_0.0', axis=1)
    else:
        X_test_single_transformed[column]=np.NaN

In [None]:
# X_test_single_transformed['male_1.0'] = X_test_single_transformed['male_0.0'].apply(lambda x: 1.0 if x==0.0 else 0.0)

In [None]:
X_test_single_transformed.columns

In [None]:
X_test_single_transformed.shape

In [None]:
X_test_single_transformed.shape

In [None]:
X_test.columns

In [None]:
X_test_single_transformed.columns

**Reorder columns**

In [None]:
# Reorder columns
X_test_single_transformed = X_test_single_transformed[X_test.columns]
X_test_single_transformed.shape

In [None]:
#Vejo a curva ROC do modelo INDIVIDUAL e confirmo o valor da AUC
from sklearn.metrics import (roc_curve,auc)

y_pred_prob1 = all_regions_abs_number_model.predict_proba(X_test_single_transformed)[:,1]
fpr1 , tpr1, thresholds1 = roc_curve(y_test_single, y_pred_prob1)

auc_ind = auc(fpr1, tpr1)
print(auc_ind)
plt.plot(fpr1, tpr1, label= "Single Model - best model - AUC " +  str(auc_ind), color="yellow")

In [None]:
y_pred_prob1.shape

In [None]:
y_pred = all_regions_abs_number_model.predict(X_test_single_transformed)

In [None]:
import MLFlow_Utils
binary_classification_metrics(y_test_single, y_pred, pd.Series(y_pred_prob1))

In [None]:
fprate, tprate, thresholds = roc_curve(y_test_single, y_pred_prob1, pos_label=1)
auc(fprate, tprate)

In [None]:
#ROC_AUC
["{0:0.2f}".format(i) for i in AUC_CI(y_test_single, y_pred_prob1)]

In [None]:
#recall
tpr(y_test_single, y_pred)

In [None]:
#specificity
tnr(y_test_single, pd.Series(y_pred))

In [None]:
#accuracy
accuracy(y_test_single, y_pred)

In [None]:
ppv(y_test_single, y_pred)

In [None]:
npv(y_test_single, y_pred)

# End of File