In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from tensorflow import keras
from xgboost import XGBClassifier

In [2]:
mexico_df = pd.read_csv('covid.csv')

In [3]:
cols = ['sex', 'age', 'pneumonia', 'pregnancy', 'diabetes', 'copd', 'asthma', 'hypertension', 'inmsupr', 'other_disease', 
                       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'covid_res']
mexico_df = mexico_df[cols]
cols.remove('sex')
cols.remove('age')
mexico_df[cols] = mexico_df[mexico_df[cols] < 3][cols]
mexico_df['pregnancy'] = mexico_df['pregnancy'].fillna(1)
mexico_df.dropna(inplace=True)
mexico_df[cols] = mexico_df[cols] == 1
mexico_df['male'] = mexico_df['sex'] == 2
del mexico_df['sex']
mexico_df.rename(columns={'inmsupr': 'Immunosuppression',
                          'renal_chronic': 'Kidney Disease',
                          'copd': "Breathing difficulty",
                          'other_disease': 'Other diseases',
                          'pregnancy': 'Pregnant'}, inplace=True)
mexico_df.columns = [s.capitalize() for s in mexico_df.columns]
mexico_df.rename(columns={'Covid_res': 'COVID-19'}, inplace=True)

In [4]:
mexico_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496291 entries, 0 to 499691
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Age                   496291 non-null  int64
 1   Pneumonia             496291 non-null  bool 
 2   Pregnant              496291 non-null  bool 
 3   Diabetes              496291 non-null  bool 
 4   Breathing difficulty  496291 non-null  bool 
 5   Asthma                496291 non-null  bool 
 6   Hypertension          496291 non-null  bool 
 7   Immunosuppression     496291 non-null  bool 
 8   Other diseases        496291 non-null  bool 
 9   Cardiovascular        496291 non-null  bool 
 10  Obesity               496291 non-null  bool 
 11  Kidney disease        496291 non-null  bool 
 12  Tobacco               496291 non-null  bool 
 13  COVID-19              496291 non-null  bool 
 14  Male                  496291 non-null  bool 
dtypes: bool(14), int64(1)
memory usage

In [5]:
def train_and_evaluate(clf, df, train=True):
  X, y = mexico_df.drop(['COVID-19'], axis=1), mexico_df['COVID-19']
  if train:
    clf.fit(X, y)
  y_pred = clf.predict(X)
  print('Precision Score: {}%'.format(round(precision_score(y_pred, y) * 100, 2)))
  print('Recall Score: {}%'.format(round(recall_score(y_pred, y) * 100, 2)))

In [32]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
mexico_rf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=125)
train_and_evaluate(mexico_rf, mexico_df)

Precision Score: 37.33%
Recall Score: 67.93%


In [9]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
mexico_rf = KNeighborsClassifier(n_neighbors = 3)
train_and_evaluate(mexico_rf, mexico_df)

Precision Score: 54.61%
Recall Score: 50.47%


In [30]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
mexico_rf = AdaBoostClassifier()
train_and_evaluate(mexico_rf, mexico_df)

Precision Score: 34.72%
Recall Score: 63.24%


In [27]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
mexico_rf = LogisticRegression(solver='lbfgs', max_iter=300)
train_and_evaluate(mexico_rf, mexico_df)

Precision Score: 34.39%
Recall Score: 63.04%


In [31]:
# MLPClassifier
from sklearn.neural_network import MLPClassifier
mexico_rf = MLPClassifier(random_state=42)
train_and_evaluate(mexico_rf, mexico_df)

Precision Score: 30.14%
Recall Score: 66.21%
