In [1]:
import os
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# Load data

In [2]:
path = os.getcwd()+os.sep+'data'
file_1 = 'dataset.xlsx'


pathNs = os.getcwd()+os.sep+'dataNs'
file_2 = 'kaggle_einstein.xlsx'

In [3]:
data_df = pd.read_excel(os.path.join(path,file_1), encoding= 'latin-1')

In [4]:
explain_df = pd.read_excel( os.path.join(pathNs,file_2), encoding= 'latin-1')

# Cleaning

- Criar padrões (abstrato mesmo)
- Colunas com mesmo tipo (str, float) tirar (nan)
- Renomear (colunas com nome maisculo ... exemplo)

In [5]:
#list of features with medical meaning
sel_feat_lst = explain_df[explain_df['clinical_meaning'] == 1]['variable_einstein'].tolist()

In [6]:
data_df.rename(columns = {"Patient addmited to regular ward (1=yes, 0=no)":"REGULAR",
                          "Patient addmited to semi-intensive unit (1=yes, 0=no)":"SEMI",
                          "Patient addmited to intensive care unit (1=yes, 0=no)":"UTI"
                         }, inplace = True)

In [7]:
df = data_df.drop(columns=['Patient ID'] )

In [8]:
df['CRITICO'] = ((df['SEMI'] == 1) | (df['UTI'] == 1)).astype(int)

## Creating new features

In [9]:
df['Neutrophils/Lymphocytes ratio'] = df['Neutrophils']/df['Lymphocytes']

In [13]:
df['Gaso performed '] = ~pd.isna(df[['pH (arterial blood gas analysis)',
                                     'HCO3 (arterial blood gas analysis)',
                                     'pO2 (arterial blood gas analysis)',
                                     'pH (venous blood gas analysis)',
                                     'HCO3 (venous blood gas analysis)',
                                     'pO2 (venous blood gas analysis)']]).all(axis=1)

In [16]:
#Curiosidade Laura:
df['Gaso performed '] = df['Gaso performed '].apply(lambda x: 0 if x is False else 1)
#Tambem funciona:
#df['Gaso performed '] = df['Gaso performed '].astype(int)

In [11]:
df['Urine - Leukocytes'] = df['Urine - Leukocytes'].apply(lambda x: 1 if x is not np.nan else 0)

## Selecting features with medical meaning

In [17]:
sel_feat_lst.extend(['REGULAR','SEMI','UTI','CRITICO'])

In [18]:
df = df[sel_feat_lst]

# Explore (Report Generator)

In [None]:
profile = ProfileReport(features_df, title='Kaggle Einstein Feature Data', html={'style':{'full_width':True}})

In [None]:
profile.to_file(output_file="featureReport.html")

# Data Selection (Features and instances)

In [19]:
lst1 = ['REGULAR', 'SEMI', 'UTI', 'CRITICO',
        'Hematocrit','Patient age quantile',
        'Neutrophils/Lymphocytes ratio',
        'Proteina C reativa mg/dL',
        'Platelets', 'Urine - Leukocytes',
        'Creatinine', 'Urea', 'Gaso performed ']

df1 = df[lst1]

lst2 = ['REGULAR', 'SEMI', 'UTI', 'CRITICO',
        'Hematocrit','Patient age quantile',            
        'Neutrophils/Lymphocytes ratio',         
        'Proteina C reativa mg/dL',              
        'Platelets', 'Urine - Leukocytes', 'Gaso performed ']

df2 = df[lst2]

In [20]:
df1 = df1[~pd.isna(df1).any(axis=1)]
df1.reset_index(drop = True, inplace = True)

In [21]:
df2 = df2[~pd.isna(df2).any(axis=1)]
df2.reset_index(drop = True, inplace = True)

# Models

In [22]:
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

In [None]:
# Rodar um RandomForest

In [60]:
def prettyReport(dic):
    
    for i in dic.keys():
        print(f'{i}- Treino: {round(dic[i][0],3)} Teste:{round(dic[i][1],3)}')
        print()

In [62]:
def makeModelScores(skf, model, X, y):
    
    scores_dic = {'AUC' : [],
                  'PRECISION' : [],
                  'RECALL' : [],
                  'F1SCORE' : []}
    
    auc_train = 0
    precision_train = 0
    recall_train = 0
    f1Score_train = 0
    
    auc_test = 0
    precision_test = 0
    recall_test = 0
    f1Score_test = 0
        
    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train,y_train)

        y_train_hat = model.predict(X_train)

        auc_train += roc_auc_score(y_train, y_train_hat)
        precision_train += precision_score(y_train, y_train_hat)
        recall_train += recall_score(y_train, y_train_hat)
        f1Score_train += f1_score(y_train, y_train_hat)
        
        y_hat = model.predict(X_test)
        
        auc_test += roc_auc_score(y_test, y_hat)
        precision_test += precision_score(y_test, y_hat)
        recall_test += recall_score(y_test, y_hat)
        f1Score_test += f1_score(y_test, y_hat)

    n_fold = skf.get_n_splits(X, y)
       
    #COLOCA TREINO
    scores_dic['AUC'].append(auc_train/n_fold)
    scores_dic['PRECISION'].append(precision_train/n_fold)
    scores_dic['RECALL'].append(recall_train/n_fold)
    scores_dic['F1SCORE'].append(f1Score_train/n_fold)
    
    #COLOCA TEST
    scores_dic['AUC'].append(auc_test/n_fold)
    scores_dic['PRECISION'].append(precision_test/n_fold)
    scores_dic['RECALL'].append(recall_test/n_fold)
    scores_dic['F1SCORE'].append(f1Score_test/n_fold)
        
    return scores_dic

In [23]:
logreg = LogisticRegression(
                solver   = 'sag',
                penalty  = 'l2',
                max_iter = 10000,
                #l1_ratio = 0.9,
                n_jobs   = -1)

In [26]:
skf = StratifiedKFold(n_splits = 5)

## Com ureia e creatina

In [25]:
y1_regular = df1['REGULAR']
y1_semi    = df1['SEMI']
y1_uti     = df1['UTI']
y1_critico = df1['CRITICO']
X1 = df1.drop(columns=['REGULAR','SEMI','UTI','CRITICO'])

### UTI

In [55]:
score_list = makeModelScores(skf, logreg, X1, y1_critico)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [61]:
prettyReport(score_list)

AUC- Treino: 0.636 Teste:0.589

PRECISION- Treino: 0.685 Teste:0.595

RECALL- Treino: 0.291 Teste:0.221

F1SCORE- Treino: 0.407 Teste:0.253



### UTI

## Sem ureia e creatina

In [None]:
y_regular = df2['REGULAR']
y_semi    = df2['SEMI']
y_uti     = df2['UTI']
y_critico = df2['CRITICO']
X = df2.drop(columns=['REGULAR','SEMI','UTI','CRITICO'])