In [15]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.ensemble import EasyEnsembleClassifier

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [16]:
file = 'clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [17]:
clean_df = clean_df.sample(30000)

In [18]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
1285746,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3
1083141,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,5
834190,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,4
345676,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
303295,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,4


In [19]:
Counter(clean_df.death)

Counter({0: 26867, 1: 3133})

In [20]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [22]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled_s, y_resampled_s = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 20150, 1: 20150})

In [24]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
sc = "balanced_accuracy"

clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.9002233250620348 [0.87171216 0.90756824 0.90533499 0.91191067 0.90459057]
Balanced Accuracy 0.9002233250620346 [0.87171216 0.90756824 0.90533499 0.91191067 0.90459057]
Recall 0.9198014888337468 [0.85657568 0.93275434 0.93796526 0.93970223 0.93200993]
Precision 0.8851239753153692 [0.88331627 0.88802268 0.88050314 0.89022097 0.88355681]
F1 0.90186705728607 [0.86974049 0.90983904 0.90832632 0.91429261 0.90713682]


In [25]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8720843672456576 [0.86861042 0.87295285 0.87617866 0.87406948 0.86861042]
Balanced Accuracy 0.8720843672456576 [0.86861042 0.87295285 0.87617866 0.87406948 0.86861042]
Recall 0.8713151364764269 [0.85930521 0.87171216 0.87940447 0.87369727 0.87245658]
Precision 0.8726786225792729 [0.87560051 0.8738806  0.87376726 0.87434815 0.8657966 ]
F1 0.8719769160633749 [0.86737633 0.87279503 0.8765768  0.87402259 0.86911383]


In [26]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8727543424317619 [0.87245658 0.87258065 0.87320099 0.8764268  0.8691067 ]
Balanced Accuracy 0.8727543424317619 [0.87245658 0.87258065 0.87320099 0.8764268  0.8691067 ]
Recall 0.8743424317617867 [0.86724566 0.86997519 0.87369727 0.87990074 0.8808933 ]
Precision 0.8716355820039805 [0.87637914 0.8745323  0.87283094 0.87382947 0.86060606]
F1 0.8729572541293695 [0.87178848 0.87224779 0.87326389 0.8768546  0.87063151]


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "XGBoost",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8935980148883376 [0.87171216 0.89950372 0.89416873 0.90583127 0.89677419]
Balanced Accuracy 0.8935980148883376 [0.87171216 0.89950372 0.89416873 0.90583127 0.89677419]
Recall 0.9131017369727047 [0.85558313 0.92382134 0.92382134 0.93647643 0.92580645]
Precision 0.8789145754968477 [0.88410256 0.88097492 0.87210119 0.8823942  0.875     ]
F1 0.8954065883291605 [0.86960908 0.90188953 0.89721653 0.90863127 0.89968652]


In [28]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 13, random_state = 42)

sk_folds = StratifiedKFold(n_splits = 5)


scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.9010669975186104 [0.87890819 0.90682382 0.90384615 0.91191067 0.90384615]
Balanced Accuracy 0.9010669975186104 [0.87890819 0.90682382 0.90384615 0.91191067 0.90384615]
Recall 0.9231265508684864 [0.86699752 0.93275434 0.93846154 0.94168734 0.93573201]
Precision 0.884202605961978 [0.88815455 0.88676575 0.87769784 0.88875878 0.87963611]
F1 0.902993153903763 [0.87744852 0.90917886 0.9070632  0.91445783 0.90681736]


In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=3)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "KNN",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.769032258064516 [0.6130273  0.8101737  0.80409429 0.80669975 0.81116625]
Balanced Accuracy 0.769032258064516 [0.6130273  0.8101737  0.80409429 0.80669975 0.81116625]
Recall 0.5944913151364764 [0.24739454 0.68560794 0.67096774 0.68287841 0.68560794]
Precision 0.9142553036851734 [0.92059095 0.91308658 0.91444031 0.90765172 0.91550696]
F1 0.7021186545776279 [0.38998631 0.78316327 0.77400887 0.77938261 0.78405221]


In [30]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,   X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8752000000000001 [0.88688889 0.876      0.878      0.87511111 0.86      ]
Balanced Accuracy 0.8651032152473471 [0.8785835  0.86028721 0.86704239 0.85979093 0.85981205]
Recall 0.8523404255319148 [0.86808511 0.84042553 0.85319149 0.84042553 0.85957447]
Precision 0.4494889699654596 [0.47719298 0.4498861  0.45516459 0.4478458  0.41735537]
F1 0.5883495713236038 [0.61584906 0.58605341 0.59363434 0.58431953 0.56189152]


In [31]:
pd.DataFrame(results).sort_values("Balanced_Accuracy", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
4,Random Forest,0.901067,0.901067,0.923127,0.884203,0.902993
0,Decision Tree,0.900223,0.900223,0.919801,0.885124,0.901867
3,XGBoost,0.893598,0.893598,0.913102,0.878915,0.895407
2,Logistic Regression,0.872754,0.872754,0.874342,0.871636,0.872957
1,Support Vector Machine,0.872084,0.872084,0.871315,0.872679,0.871977
6,Easy Ensemble,0.8752,0.865103,0.85234,0.449489,0.58835
5,KNN,0.769032,0.769032,0.594491,0.914255,0.702119


## Validate the top 3 models with the test data

In [32]:
ml = {
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators = 13, random_state = 42),
    "XGBoost":GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

}
newline = '\n'
results = []
for x in ml:
    model = ml[x]
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model {x}:{newline}')
    print(f'Confusion Matrix: {newline}{confusion_matrix(y_test,y_pred_s)}{newline}')
    print(f'Classification Report: {newline}{ classification_report_imbalanced(y_test, y_pred_s)}{newline}')
    results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision":precision,
    "Recall":recall
})
    
pd.DataFrame(results).sort_values("Recall", ascending=False)

model Decision Tree Classifier:

Confusion Matrix: 
[[5919  798]
 [ 211  572]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.88      0.73      0.92      0.80      0.65      6717
          1       0.42      0.73      0.88      0.53      0.80      0.63       783

avg / total       0.91      0.87      0.75      0.88      0.80      0.65      7500


model Random Forest:

Confusion Matrix: 
[[5920  797]
 [ 192  591]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.88      0.75      0.92      0.82      0.67      6717
          1       0.43      0.75      0.88      0.54      0.82      0.66       783

avg / total       0.91      0.87      0.77      0.88      0.82      0.67      7500


model XGBoost:

Confusion Matrix: 
[[5899  818]
 [ 154  629]]

Classification Report: 
                   pre       rec       spe  

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
2,XGBoost,0.8704,0.84077,0.434692,0.803321
1,Random Forest,0.868133,0.818068,0.425793,0.754789
0,Decision Tree Classifier,0.865467,0.80586,0.417518,0.730524


In [33]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ml2 = {
    "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
    "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)

}
newline = '\n'
results2 = []
for x in ml2:
    model = ml2[x]
    model.fit(X_train_scaled, y_train)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model {x}:{newline}')
    print(f'Confusion Matrix: {newline}{confusion_matrix(y_test,y_pred_s)}{newline}')
    print(f'Classification Report: {newline}{ classification_report_imbalanced(y_test, y_pred_s)}{newline}')
    results2.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision":precision,
    "Recall":recall
})
    
pd.DataFrame(results2).sort_values("Recall", ascending=False)

model Balanced Random Forest:

Confusion Matrix: 
[[5702 1015]
 [ 113  670]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.85      0.86      0.91      0.85      0.73      6717
          1       0.40      0.86      0.85      0.54      0.85      0.73       783

avg / total       0.92      0.85      0.85      0.87      0.85      0.73      7500


model Easy Ensemble:

Confusion Matrix: 
[[5903  814]
 [ 145  638]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.88      0.81      0.92      0.85      0.72      6717
          1       0.44      0.81      0.88      0.57      0.85      0.71       783

avg / total       0.92      0.87      0.82      0.89      0.85      0.72      7500




Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
0,Balanced Random Forest,0.8496,0.852287,0.397626,0.855683
1,Easy Ensemble,0.872133,0.846815,0.439394,0.814815
