In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from tensorflow import keras
from xgboost import XGBClassifier

In [14]:
brazil_sars_df = pd.read_csv('SRAG_01-06.csv', sep=';')

In [15]:
idx = [12, 14, 15, 17, 18, 32, 33, 34, 35, 36, 37, 38, 39, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 126]
cols = ['Sex','Age', 'Age Type', 'Pregnant', 'Race', 'Fever', 'Cough', 'Throat infection', 'Breathing difficulty', 'Respiratory', 'Low oxygen levels', 
        'Diarrhea', 'Vomiting', 'Additional Symptoms', 'Cardiovascular', 'Hematologic (Blood Disease)', 'Down Syndrome', 'Liver disease', 'Asthma',
        'Diabetes', 'Neurological', 'Pneumonia', 'Immunosuppression', 'Kidney disease', 'Obesity', 'COVID-19']

In [16]:
brazil_sars_df = brazil_sars_df.iloc[:, idx]
brazil_sars_df.columns = cols

In [17]:
brazil_sars_df['COVID-19'] = brazil_sars_df['COVID-19'] == 5.0
brazil_sars_df['Age'].where(brazil_sars_df['Age Type'] == 3, inplace=True, other=1)
del brazil_sars_df['Age Type']
brazil_sars_df['Myalgia'] = brazil_sars_df['Additional Symptoms'] == "MIALGIA"
brazil_sars_df['Headache'] = brazil_sars_df['Additional Symptoms'] == "CEFALEIA"
del brazil_sars_df['Additional Symptoms']
brazil_sars_df['Sex'].where(brazil_sars_df['Sex'] == 'I', inplace=True, other='M')
brazil_sars_df['Male'] = brazil_sars_df['Sex'] == 'M'
del brazil_sars_df['Sex']
brazil_sars_df['Pregnant'].where(brazil_sars_df['Pregnant'] < 5, inplace=True, other=False)
brazil_sars_df['Pregnant'] = brazil_sars_df['Pregnant'] != 0
brazil_sars_df.iloc[:, 3:-4] = brazil_sars_df.iloc[:, 3:-4] == 1.
brazil_sars_df['Race'].where(brazil_sars_df['Race'] != 9, inplace=True, other=2)
brazil_sars_df['Race'].where(brazil_sars_df['Race'] != 5, inplace=True, other=4)
brazil_sars_df['Race'].replace([1.,2.,3.,4.], ["White", "Black", "Asian", "Hispanic/Latino"], inplace=True)
brazil_sars_df = pd.get_dummies(brazil_sars_df, drop_first=True)

In [18]:
brazil_sars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217925 entries, 0 to 217924
Data columns (total 28 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   Age                          217925 non-null  int64
 1   Pregnant                     217925 non-null  bool 
 2   Fever                        217925 non-null  bool 
 3   Cough                        217925 non-null  bool 
 4   Throat infection             217925 non-null  bool 
 5   Breathing difficulty         217925 non-null  bool 
 6   Respiratory                  217925 non-null  bool 
 7   Low oxygen levels            217925 non-null  bool 
 8   Diarrhea                     217925 non-null  bool 
 9   Vomiting                     217925 non-null  bool 
 10  Cardiovascular               217925 non-null  bool 
 11  Hematologic (Blood Disease)  217925 non-null  bool 
 12  Down Syndrome                217925 non-null  bool 
 13  Liver disease                

In [19]:
def train_and_evaluate(clf, df, train=True):
  X, y = brazil_sars_df.drop(['COVID-19'], axis=1), brazil_sars_df['COVID-19']
  if train:
    clf.fit(X, y)
  y_pred = clf.predict(X)
  print('Precision Score: {}%'.format(round(precision_score(y_pred, y) * 100, 2)))
  print('Recall Score: {}%'.format(round(recall_score(y_pred, y) * 100, 2)))

In [21]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
brazil_rf = KNeighborsClassifier(n_neighbors = 3)
train_and_evaluate(brazil_rf, brazil_sars_df)

Precision Score: 47.61%
Recall Score: 64.27%


In [23]:
# Random forest
from sklearn.ensemble import RandomForestClassifier
brazil_rf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=300)
train_and_evaluate(brazil_rf, brazil_sars_df)

Precision Score: 61.93%
Recall Score: 80.3%


In [26]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
brazil_rf = AdaBoostClassifier(n_estimators=100)
train_and_evaluate(brazil_rf, brazil_sars_df)

Precision Score: 4.25%
Recall Score: 51.03%


In [30]:
# MLPClassifier
from sklearn.neural_network import MLPClassifier
brazil_rf = MLPClassifier(random_state=42)
train_and_evaluate(brazil_rf, brazil_sars_df)

Precision Score: 5.85%
Recall Score: 54.57%
