<a href="https://colab.research.google.com/github/itsGab/previsao_uti_em_covid/blob/main/notebooks/5_TESTE_FINAL_nb_para_teste_com_dados_externos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1 align='center'> Previsão de Admissão na UTI por COVID-19 </h1>
<h2 align='center'> Notebook TESTE FINAL</h2>

# DADOS E BASE DE TREINO

## Bibliotecas e funções

In [8]:
# Importação de bibliotecas
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# Funções
def preparando_janela(grupo):
    ''' Preparar primeira janela com a marcador da variável alvo. '''
    if any(grupo['ICU']):
        grupo.loc[grupo['WINDOW'] == '0-2', 'ICU'] = 1
    return grupo

def arrumando_listas(lista):
    ''' As listas foram salvas como string, fiz esse função para transformar em
    listas novamento. '''
    caracteres = "[],'"
    for caracter in caracteres:
        lista = lista.replace(caracter, '')
    return lista.split(' ')

def semente(num=185542):
    np.random.seed(num)

## Importação dos dados


In [9]:
# Importação dos dados
data_raw = 'https://github.com/itsGab/previsao_uti_em_covid/blob/main/dados/Kaggle_Sirio_Libanes_ICU_Prediction.xlsx?raw=true'
info_col = 'https://github.com/itsGab/previsao_uti_em_covid/raw/main/dados/Colunas_por_tipo'

# Lendo os dados com pandas
df = pd.read_excel(data_raw)

## Função de tratamento dos dados

In [10]:
def tratando_dados(df):
    # TRATAMENTO INICIAL

    # Removendo espaco no nome das colunas
    df.columns = df.columns.str.replace(' ', '_')
    # Abrindo lista de colunas por tipo
    s_colunas = pd.read_csv(info_col, squeeze=True, index_col=0)
    s_colunas = s_colunas.apply(arrumando_listas)


    # REMOVENDO LINHAS DA PRIMEIRA JANELA NA UTI

    # Listando os pacientes que entraram na uti na primeira janela (0 a 2 horas)
    lista_pacientes_uti_primeira_janela = df.query('WINDOW == "0-2" & ICU == 1').index.tolist()
    # Mantendo apenas os pacientes que não estão na lista de entrada na primeira janela.
    df_tratado = df.query('PATIENT_VISIT_IDENTIFIER not in @lista_pacientes_uti_primeira_janela')
    # Aplicando função de preparar primeira janela com a marcador da variável alvo.
    df_tratado = df_tratado.groupby('PATIENT_VISIT_IDENTIFIER').apply(preparando_janela)
    index_linhas_remover = df_tratado.query('WINDOW != "0-2" & ICU == 1').index.tolist()
    df_tratado = df_tratado.drop(index=index_linhas_remover)


    # MISSING VALUES

    # Quantidade de linhas com valores faltantes (missing values) ANTES do tratamento
    n_linhas_sem_prenche = df_tratado.query('WINDOW == "0-2"').dropna().describe().loc['count', :]['PATIENT_VISIT_IDENTIFIER']
    # Realizando o tratamento e preenchimento
    df_tratado.update(df_tratado.groupby('PATIENT_VISIT_IDENTIFIER', as_index=False).fillna(method='bfill', axis=0))
    df_tratado = df_tratado.query('WINDOW == "0-2"').dropna()
    # Quantidade de linhas com valores faltantes (missing values) APÓS do tratamento
    n_linhas_com_prenche = df_tratado.dropna().describe().loc['count', :]['PATIENT_VISIT_IDENTIFIER']
    # Transformando dados categoricas binarios em integer
    for col in df_tratado.columns:
        if len(df_tratado[col].unique()) == 2:
            df_tratado[col] = df_tratado[col].astype('int64')
    # Juntando as colunas de exames laboratoriais com as de sinais vistais
    cols_exames_e_vitais = s_colunas['exames_laboratoriais'] + s_colunas['sinais_vitais']


    # CORRELAÇÃO

    # Computando a correlaçãoda matrix
    matrix_corr = df_tratado[cols_exames_e_vitais].corr()
    # Gerando a mascara para a parte superior do triangulo
    mask = np.triu(np.ones_like(matrix_corr, dtype=bool))
    # Computando a triangulo da matrix de correlação absoluta
    matrix_corr_abs = matrix_corr.mask(mask).abs()
    # Listando as colunas com alta correlação ou sem variancia
    taxa_corte_corr = 0.90
    excluir_cols = [col for col in matrix_corr_abs.columns if any(matrix_corr_abs[col]>taxa_corte_corr) or matrix_corr_abs[col].sum() == 0]
    # Listando colunas exames e sinais vitais usaveis
    cols_exames_e_vitais_usaveis = [col for col in cols_exames_e_vitais if col not in excluir_cols]
    # Listando todas as colunas usaveis
    cols_usaveis = s_colunas['informativas'] + s_colunas['demograficas'] + s_colunas['comorbidades'] + cols_exames_e_vitais_usaveis
    # Fatiando DataFrame com colunas usaveis
    df_tratado_sem_corr = df_tratado[cols_usaveis]


    # REMOVAÇÃO DE COLUNAS DESNECESSARIAS

    # Removendo WINDOW
    df_tratado_sem_corr = df_tratado_sem_corr.drop('WINDOW', axis=1)
    # Transformando age percentil em dados categoricos
    col_age_percentil_catcodes = df_tratado_sem_corr['AGE_PERCENTIL'].astype('category').cat.codes
    df_tratado_sem_corr['AGE_PERCENTIL'] = col_age_percentil_catcodes
    # Usando ID como indice
    df_tratado_sem_corr.set_index('PATIENT_VISIT_IDENTIFIER', inplace=True)


    # DATAFRAME FINAL

    return df_tratado_sem_corr

# DADOS E BASE DE TESTE

## TREINANDO

In [11]:
final = tratando_dados(df)

X_train = final.drop(columns=['ICU'])
y_train = final['ICU']

modelo = NearestCentroid().fit(X_train,y_train)
#modelo = AdaBoostClassifier().fit(X,y)
#modelo = LGBMClassifier().fit(X,y)

In [12]:
final.to_csv('df_tratado')

## TESTANDO

Insira o link dos dados de teste `dados_externo` abaixo.

In [13]:
dados_externos = 'https://github.com/itsGab/previsao_uti_em_covid/blob/main/dados/Kaggle_Sirio_Libanes_ICU_Prediction.xlsx?raw=true'# LINK AQUI
df_teste = pd.read_excel(dados_externos)
df_teste.head()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,HTN,IMMUNOCOMPROMISED,OTHER,ALBUMIN_MEDIAN,ALBUMIN_MEAN,ALBUMIN_MIN,ALBUMIN_MAX,ALBUMIN_DIFF,BE_ARTERIAL_MEDIAN,BE_ARTERIAL_MEAN,BE_ARTERIAL_MIN,BE_ARTERIAL_MAX,BE_ARTERIAL_DIFF,BE_VENOUS_MEDIAN,BE_VENOUS_MEAN,BE_VENOUS_MIN,BE_VENOUS_MAX,BE_VENOUS_DIFF,BIC_ARTERIAL_MEDIAN,BIC_ARTERIAL_MEAN,BIC_ARTERIAL_MIN,BIC_ARTERIAL_MAX,BIC_ARTERIAL_DIFF,BIC_VENOUS_MEDIAN,BIC_VENOUS_MEAN,BIC_VENOUS_MIN,BIC_VENOUS_MAX,BIC_VENOUS_DIFF,BILLIRUBIN_MEDIAN,BILLIRUBIN_MEAN,...,DIMER_MAX,DIMER_DIFF,BLOODPRESSURE_DIASTOLIC_MEAN,BLOODPRESSURE_SISTOLIC_MEAN,HEART_RATE_MEAN,RESPIRATORY_RATE_MEAN,TEMPERATURE_MEAN,OXYGEN_SATURATION_MEAN,BLOODPRESSURE_DIASTOLIC_MEDIAN,BLOODPRESSURE_SISTOLIC_MEDIAN,HEART_RATE_MEDIAN,RESPIRATORY_RATE_MEDIAN,TEMPERATURE_MEDIAN,OXYGEN_SATURATION_MEDIAN,BLOODPRESSURE_DIASTOLIC_MIN,BLOODPRESSURE_SISTOLIC_MIN,HEART_RATE_MIN,RESPIRATORY_RATE_MIN,TEMPERATURE_MIN,OXYGEN_SATURATION_MIN,BLOODPRESSURE_DIASTOLIC_MAX,BLOODPRESSURE_SISTOLIC_MAX,HEART_RATE_MAX,RESPIRATORY_RATE_MAX,TEMPERATURE_MAX,OXYGEN_SATURATION_MAX,BLOODPRESSURE_DIASTOLIC_DIFF,BLOODPRESSURE_SISTOLIC_DIFF,HEART_RATE_DIFF,RESPIRATORY_RATE_DIFF,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,0.08642,-0.230769,-0.283019,-0.59322,-0.285714,0.736842,0.08642,-0.230769,-0.283019,-0.586207,-0.285714,0.736842,0.237113,0.0,-0.162393,-0.5,0.208791,0.89899,-0.247863,-0.459459,-0.432836,-0.636364,-0.42029,0.736842,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
1,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,0.333333,-0.230769,-0.132075,-0.59322,0.535714,0.578947,0.333333,-0.230769,-0.132075,-0.586207,0.535714,0.578947,0.443299,0.0,-0.025641,-0.5,0.714286,0.838384,-0.076923,-0.459459,-0.313433,-0.636364,0.246377,0.578947,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2-4,0
2,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.605263,0.605263,0.605263,0.605263,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.317073,-0.317073,-0.317073,-0.317073,-1.0,-0.317073,-0.317073,-0.317073,-0.317073,-1.0,-0.93895,-0.93895,...,-0.994912,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4-6,0
3,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,-0.107143,0.736842,,,,,-0.107143,0.736842,,,,,0.318681,0.89899,,,,,-0.275362,0.736842,,,,,-1.0,-1.0,,,,,-1.0,-1.0,6-12,0
4,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,-0.871658,-0.871658,-0.871658,-0.871658,-1.0,-0.863874,-0.863874,-0.863874,-0.863874,-1.0,-0.317073,-0.317073,-0.317073,-0.317073,-1.0,-0.414634,-0.414634,-0.414634,-0.414634,-1.0,-0.979069,-0.979069,...,-0.996762,-1.0,-0.243021,-0.338537,-0.213031,-0.317859,0.033779,0.665932,-0.283951,-0.376923,-0.188679,-0.37931,0.035714,0.631579,-0.340206,-0.4875,-0.57265,-0.857143,0.098901,0.79798,-0.076923,0.286486,0.298507,0.272727,0.362319,0.947368,-0.33913,0.325153,0.114504,0.176471,-0.238095,-0.818182,-0.389967,0.407558,-0.230462,0.096774,-0.242282,-0.814433,ABOVE_12,1


In [14]:
teste = tratando_dados(df_teste)
X_test, y_test = teste.drop(columns=['ICU']), teste['ICU']
print(classification_report(y_test, modelo.predict(X_test)))

              precision    recall  f1-score   support

           0       0.75      0.65      0.69       186
           1       0.51      0.63      0.56       110

    accuracy                           0.64       296
   macro avg       0.63      0.64      0.63       296
weighted avg       0.66      0.64      0.64       296

