In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [2]:
mexico_df = pd.read_csv('covid.csv')

In [3]:
cols = ['sex', 'age', 'pneumonia', 'pregnancy', 'diabetes', 'copd', 'asthma', 'hypertension', 'inmsupr', 'other_disease', 
                       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'covid_res']
mexico_df = mexico_df[cols]
cols.remove('sex')
cols.remove('age')
mexico_df[cols] = mexico_df[mexico_df[cols] < 3][cols]
mexico_df['pregnancy'] = mexico_df['pregnancy'].fillna(1)
mexico_df.dropna(inplace=True)
mexico_df[cols] = mexico_df[cols] == 1
mexico_df['male'] = mexico_df['sex'] == 2
del mexico_df['sex']
mexico_df.rename(columns={'inmsupr': 'Immunosuppression',
                          'renal_chronic': 'Kidney Disease',
                          'copd': "Breathing difficulty",
                          'other_disease': 'Other diseases',
                          'pregnancy': 'Pregnant'}, inplace=True)
mexico_df.columns = [s.capitalize() for s in mexico_df.columns]
mexico_df.rename(columns={'Covid_res': 'COVID-19'}, inplace=True)

In [4]:
mexico_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496291 entries, 0 to 499691
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Age                   496291 non-null  int64
 1   Pneumonia             496291 non-null  bool 
 2   Pregnant              496291 non-null  bool 
 3   Diabetes              496291 non-null  bool 
 4   Breathing difficulty  496291 non-null  bool 
 5   Asthma                496291 non-null  bool 
 6   Hypertension          496291 non-null  bool 
 7   Immunosuppression     496291 non-null  bool 
 8   Other diseases        496291 non-null  bool 
 9   Cardiovascular        496291 non-null  bool 
 10  Obesity               496291 non-null  bool 
 11  Kidney disease        496291 non-null  bool 
 12  Tobacco               496291 non-null  bool 
 13  COVID-19              496291 non-null  bool 
 14  Male                  496291 non-null  bool 
dtypes: bool(14), int64(1)
memory usage

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mexico_df.drop('COVID-19', axis=1), mexico_df['COVID-19'], test_size=0.1, random_state=42)

In [6]:
print("How data is split")
print("-----------------")
print("X_test: ", len(X_test))
print("y_test: ", len(y_test))
print("X_train:", len(X_train))
print("y_train:", len(y_train))

How data is split
-----------------
X_test:  49630
y_test:  49630
X_train: 446661
y_train: 446661


In [7]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rf = accuracy_score(y_test, y_pred)
rf2 = f1_score(y_test, y_pred)
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('-------------------------------------')
print('Precision score:', precision_score(y_test, y_pred))
print('Recall(true positive) score:', recall_score(y_test, y_pred))
print('ROC score:', roc_auc_score(y_test, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())

Accuracy score: 0.6159379407616361
F1 score: 0.44018913918176744
-------------------------------------
Precision score: 0.6224252491694352
Recall(true positive) score: 0.3404970693807079
ROC score: 0.5879560760538093
Misclassified samples: 19061


In [8]:
# Decsion tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
dt = accuracy_score(y_test, y_pred)
dt2 = f1_score(y_test, y_pred)
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())

Accuracy score: 0.6160185371750957
F1 score: 0.433821563326302
Misclassified samples: 19057


In [9]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
#Fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
knn = accuracy_score(y_test, y_pred)
knn2 = f1_score(y_test, y_pred)
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())

Accuracy score: 0.5667338303445497
F1 score: 0.4901723688266117
Misclassified samples: 21503


In [10]:
# SVC
# from sklearn.svm import SVC
# model = SVC()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# svm = accuracy_score(y_test, y_pred)
# svm2 = f1_score(y_test, y_pred)
# print('Accuracy score:', accuracy_score(y_test, y_pred))
# print('F1 score:', f1_score(y_test, y_pred))
# print('Precision score:', precision_score(y_test, y_pred))
# print('Recall score:', recall_score(y_test, y_pred))
# print('ROC score:', roc_auc_score(y_test, y_pred))
# print('Misclassified samples: %d' % (y_test != y_pred).sum())

In [11]:
# XG Boost
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
gb = accuracy_score(y_test, y_pred)
gb2 = f1_score(y_test, y_pred)
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())

Accuracy score: 0.6241789240378803
F1 score: 0.4443848674411677
Misclassified samples: 18652


In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
#Fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#Score/Accuracy
acc_logreg=model.score(X_test, y_test)
acc_logreg
lr2 = f1_score(y_test, y_pred)
print('Accuracy score:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())

Accuracy score: 0.6203102961918194
F1 score: 0.44664356610089856
Misclassified samples: 18844


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 
              'Random Forest', 'Decision Tree', 'Gradient Boosting Classifier'],
    'Accuracy Score': [knn, acc_logreg, rf, dt, gb],
    'F1 Score': [knn2, lr2, rf2, dt2, gb2]})
models.sort_values(by='Accuracy Score', ascending=False)

Unnamed: 0,Model,Accuracy Score,F1 Score
4,Gradient Boosting Classifier,0.624179,0.444385
1,Logistic Regression,0.62031,0.446644
3,Decision Tree,0.616019,0.433822
2,Random Forest,0.615938,0.440189
0,KNN,0.566734,0.490172
