In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw export (semicolon-separated, ISO-8859-1 encoded) and keep only
# the COVID-19 patients, i.e. the rows where PCR_SARS2 == 1.
all_data = pd.read_csv(
    "data/INFLU20-04052020.csv",
    sep=';',
    encoding="ISO-8859-1",
).loc[lambda frame: frame['PCR_SARS2'] == 1]

In [3]:
# Cohort size as (rows, columns) after keeping only rows with PCR_SARS2 == 1.
print(all_data.shape)

(19940, 138)


### Remove patients without ethnicity recorded

In [4]:
# Ethnicity must be recorded: drop rows whose CS_RACA is missing or equal to 9
# (the code this notebook treats as "not recorded").
all_data = all_data[all_data['CS_RACA'].notna() & all_data['CS_RACA'].ne(9)]

In [5]:
# Cohort size as (rows, columns) once rows without a usable CS_RACA are removed.
print(all_data.shape)

(12221, 138)


### Keep only hospitalized patients

In [6]:
# Restrict the cohort to hospitalized patients (rows with HOSPITAL == 1).
hospitalized_patients = all_data.loc[all_data['HOSPITAL'].eq(1)]

In [7]:
# Size of the hospitalized cohort as (rows, columns).
print(hospitalized_patients.shape)

(11321, 138)


## Data pre-processing

In [8]:
# Map the dataset's original column names to the English names used in the rest
# of the notebook. The target spellings are kept exactly as they are because
# later cells refer to these column names verbatim.
translate = dict(
    # demographics and outcome
    NU_IDADE_N='Age',
    CS_SEXO='Sex',
    EVOLUCAO='Evolution',
    CS_RACA='Race',
    # symptoms
    FEBRE='Fever',
    TOSSE='Cough',
    GARGANTA='Sore_throat',
    DISPNEIA='Shortness_of_breath',
    DESC_RESP='Respiratory_discomfort',
    SATURACAO='SPO2',
    DIARREIA='Dihareea',
    VOMITO='Vomitting',
    # comorbidities
    CARDIOPATI='Cardiovascular',
    HEPATICA='Liver',
    ASMA='Asthma',
    DIABETES='Diabetis',
    NEUROLOGIC='Neurologic',
    PNEUMOPATI='Pulmonary',
    IMUNODEPRE='Immunosuppresion',
    RENAL='Renal',
    OBESIDADE='Obesity',
)
hospitalized_patients = hospitalized_patients.rename(columns=translate)

In [9]:
# Column groups used throughout the notebook. The order inside each list is
# significant: it determines the column order of the exported dataset.

# Patient demographics (SG_UF_NOT keeps its original, untranslated name).
demographics = [
    'Age',
    'Sex',
    'Race',
    'SG_UF_NOT',
]

# Symptom indicator columns.
symptoms = [
    'Fever',
    'Cough',
    'Sore_throat',
    'Shortness_of_breath',
    'Respiratory_discomfort',
    'SPO2',
    'Dihareea',
    'Vomitting',
]

# Comorbidity indicator columns.
comorbidities = [
    'Cardiovascular',
    'Asthma',
    'Diabetis',
    'Pulmonary',
    'Immunosuppresion',
    'Obesity',
    'Liver',
    'Neurologic',
    'Renal',
]

# Outcome column.
outcome = ['Evolution']

# Race category names; each one also becomes a one-hot indicator column.
races = [
    'Branca',
    'Preta',
    'Amarela',
    'Parda',
    'Indigena',
]

# Date columns that are parsed into datetimes.
event_dates = [
    'DT_SIN_PRI',
    'DT_COLETA',
    'DT_PCR',
    'DT_INTERNA',
    'DT_ENTUTI',
    'DT_EVOLUCA',
    'DT_ENCERRA',
]

# One-hot age-band indicator columns.
age_groups = [
    'Age_40',
    'Age_40_50',
    'Age_50_60',
    'Age_60_70',
    'Age_70',
]

In [10]:
# Numeric race codes of the dataset and the category name each one stands for.
race_encoding = {1.0: 'Branca', 2.0: 'Preta', 3.0: 'Amarela', 4.0: 'Parda', 5.0: 'Indigena'}

# Decode the codes into names. Values that are not keys of `race_encoding`
# (in particular names that were already decoded) pass through unchanged, so
# re-running this cell no longer fails with a KeyError on the decoded strings.
decoded_race = hospitalized_patients['Race'].map(lambda code: race_encoding.get(code, code))

# Still fail loudly, before touching the frame, on any value that is neither a
# known code nor a known category name.
unknown_races = decoded_race[~decoded_race.isin(races)].unique()
if len(unknown_races) > 0:
    raise KeyError(f"Unexpected race values: {list(unknown_races)}")

hospitalized_patients['Race'] = decoded_race

# One-hot encode: one 0/1 int32 indicator column per race category.
for race in races:
    hospitalized_patients[race] = (hospitalized_patients['Race'] == race).astype(np.int32)

In [11]:
# One-hot encode age into five bands: <40, [40, 50), [50, 60), [60, 70), >=70.
# NOTE(review): this assumes 'Age' is expressed in years — confirm against the
# data dictionary.
age = hospitalized_patients['Age']
age_band_masks = {
    'Age_40': age < 40,
    'Age_40_50': (age >= 40) & (age < 50),
    'Age_50_60': (age >= 50) & (age < 60),
    'Age_60_70': (age >= 60) & (age < 70),
    'Age_70': age >= 70,
}
for band_name, in_band in age_band_masks.items():
    # Store each boolean mask as a 0/1 int32 indicator column.
    hospitalized_patients[band_name] = in_band.astype(np.int32)

In [12]:
# Parse every event-date column from day/month/year text into datetimes.
hospitalized_patients = hospitalized_patients.assign(**{
    date_column: pd.to_datetime(hospitalized_patients[date_column], format="%d/%m/%Y")
    for date_column in event_dates
})

In [13]:
# 0/1 int32 indicators: Sex_male is 1 where Sex == 'M', is_dead is 1 where
# Evolution == 2.
hospitalized_patients['Sex_male'] = hospitalized_patients['Sex'].eq('M').astype(np.int32)
hospitalized_patients['is_dead'] = hospitalized_patients['Evolution'].eq(2).astype(np.int32)

In [None]:
# Recode every comorbidity and symptom column so that only a positive record
# stays non-zero:
#   * missing values and the code 9 are treated as "not present" (0);
#   * the code 2, which in the data means "not present", also becomes 0.
#
# The assignment goes through a single `.loc[rows, column]` call. The previous
# chained form `frame[column][mask] = 0` writes through an intermediate object,
# which raises SettingWithCopyWarning and has no effect at all once pandas
# Copy-on-Write is enabled.
for flag_column in comorbidities + symptoms:
    flag_values = hospitalized_patients[flag_column]
    not_present = flag_values.isna() | flag_values.isin([2, 9])
    hospitalized_patients.loc[not_present, flag_column] = 0

### Compute number of days between hospital admission and outcome

In [15]:
# Whole days from hospital admission (DT_INTERNA) to outcome (DT_EVOLUCA).
# The result is missing wherever either date is missing.
hospitalized_patients['Days_hospital_to_outcome'] = (
    hospitalized_patients['DT_EVOLUCA']
    .sub(hospitalized_patients['DT_INTERNA'])
    .dt.days
)

### Keep patients that have an outcome recorded (indicated by a valid number of days between hospitalization and outcome)

In [16]:
# Keep only the patients for whom the number of days to outcome could be computed.
patients_with_outcome = hospitalized_patients.loc[
    hospitalized_patients['Days_hospital_to_outcome'].notna()
]

## Save data to perform Cox regression in R

In [17]:
# Columns exported for the survival analysis, in output order: the event flag
# and time-to-event first, then the covariate groups.
feature_to_save = [
    'is_dead',
    'Days_hospital_to_outcome',
    'Sex_male',
    *demographics,
    *age_groups,
    *symptoms,
    *comorbidities,
    *races,
]

In [18]:
# Keep only the exported columns, in the order given by `feature_to_save`.
data_save = patients_with_outcome[feature_to_save]

# Write the dataset without the row index. `index=False` is the documented
# boolean form; the previous `index=None` only worked because None is falsy.
data_save.to_csv('data/data_for_cox_regression_in_hospital_mortality.csv', index=False)

In [19]:
# Size of the exported dataset as (rows, columns); shown as the cell's output.
data_save.shape

(6882, 34)