In [1]:
# Import libraries

# Standard library
from collections import Counter

# Data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# scikit-learn
import sklearn as skl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.svm import SVC

# imbalanced-learn
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

# TensorFlow
import tensorflow as tf

In [2]:
# Location of the cleaned dataset (a relative path, so it is resolved against
# the notebook's working directory).
file = 'clean_ml_data_0625.csv'
clean_df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# Work on a 30,000-row random subset of the data.
# The draw is seeded so that Restart & Run All reproduces the same rows (and
# therefore the same split, resampling and scores downstream).
# NOTE: this rebinds `clean_df`; re-run the loading cell first if this cell
# has to be executed again, otherwise the subset is re-shuffled.
clean_df = clean_df.sample(n=30000, random_state=42)

In [4]:
# Preview the first five sampled rows.
clean_df.head(n=5)

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
102182,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2
650451,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3
19471,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,5
1228977,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
799306,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3


In [5]:
# Class balance of the target column: number of rows per `death` value.
Counter(clean_df['death'])

Counter({0: 26955, 1: 3045})

In [6]:
# Target vector: the `death` column.
y = clean_df.loc[:, 'death']
# Feature matrix: everything except the target and the ICU / intubation columns.
# NOTE(review): ICU and intubation are presumably excluded because they are
# outcomes rather than predictors - confirm.
X = clean_df.drop(['ICU', 'intubation', 'death'], axis=1)

In [7]:
# Seeded hold-out split, stratified on the target so both parts keep the same
# class proportions. test_size=0.25 is the library default, spelled out here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [8]:
# Learn the standardisation parameters from the training split only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply that single fitted transform to both the training and the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Disabled alternative: resample the training data with SMOTE over-sampling.
# Kept for reference only; the following cell builds X_resampled_s /
# y_resampled_s with RandomUnderSampler instead.
# from imblearn.over_sampling import SMOTE
# X_resampled_s, y_resampled_s = SMOTE(random_state=1,
#     sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [10]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the scaled training split with a seeded random under-sampler.
# NOTE: despite the name `ros`, this object is an under-sampler.
ros = RandomUnderSampler(random_state=1)
X_resampled_s, y_resampled_s = ros.fit_resample(X=X_train_scaled, y=y_train)

# Per-class row counts after resampling.
Counter(y_resampled_s)

Counter({0: 2284, 1: 2284})

In [11]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# One summary record (mean of each metric) per model; the model cells that
# follow append to this list.
results = []
# Scorer name for balanced accuracy, reused by the model cells that follow.
sc = "balanced_accuracy"

clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold. Calling cross_val_score once
# per metric refits the model for each metric (25 fits instead of 5) and
# returns the same per-fold numbers.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(clf, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8443508667652507 [0.85886214 0.84354486 0.83588621 0.8422782  0.84118291]
Balanced Accuracy 0.8443524703443511 [0.85886214 0.84354486 0.83588621 0.84227562 0.84119352]
Recall 0.8415054320703289 [0.84682713 0.85557987 0.83369803 0.83991228 0.83150985]
Precision 0.8464744695666152 [0.867713   0.83547009 0.83736264 0.84361233 0.84821429]
F1 0.843922365124168 [0.85714286 0.84540541 0.83552632 0.84175824 0.83977901]


In [12]:
# Linear-kernel support vector classifier.
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# model once per metric with separate cross_val_score calls.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(svm, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8612097085377515 [0.86323851 0.86652079 0.84792123 0.86308872 0.8652793 ]
Balanced Accuracy 0.8612096433644286 [0.86323851 0.86652079 0.84792123 0.86308975 0.86527794]
Recall 0.8559580022265731 [0.8380744  0.8643326  0.84682713 0.86403509 0.86652079]
Precision 0.8652155598025694 [0.88248848 0.86813187 0.84868421 0.86214442 0.86462882]
F1 0.860470681440549 [0.85970819 0.86622807 0.84775465 0.86308872 0.86557377]


In [13]:
# Logistic regression with a raised iteration cap and a fixed seed.
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# model once per metric with separate cross_val_score calls.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(lr, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8625242965096911 [0.86105033 0.86870897 0.84682713 0.86966046 0.86637459]
Balanced Accuracy 0.8625239932435026 [0.86105033 0.86870897 0.84682713 0.8696567  0.86637683]
Recall 0.855521325194825 [0.8380744  0.87308534 0.83588621 0.86622807 0.8643326 ]
Precision 0.8677265612316493 [0.87844037 0.86550976 0.85458613 0.87196468 0.86813187]
F1 0.8615023045480024 [0.85778275 0.86928105 0.84513274 0.86908691 0.86622807]


In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: despite the variable name and the "XGBoost" label below, this is
# scikit-learn's GradientBoostingClassifier, not the xgboost library. The
# label is left unchanged so the results tables keep their existing row names.
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# model once per metric with separate cross_val_score calls.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(xgb, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "XGBoost",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8612092291984729 [0.86323851 0.8654267  0.8512035  0.86637459 0.85980285]
Balanced Accuracy 0.8612173212023494 [0.86323851 0.8654267  0.8512035  0.86640322 0.85981468]
Recall 0.8690995815578333 [0.84901532 0.88402626 0.87089716 0.89254386 0.84901532]
Precision 0.8560029802067983 [0.87387387 0.85232068 0.83789474 0.84791667 0.86800895]
F1 0.8622583418095751 [0.86126526 0.867884   0.85407725 0.86965812 0.85840708]


In [15]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest with 13 trees.
rf = RandomForestClassifier(n_estimators = 13, random_state = 42)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# model once per metric with separate cross_val_score calls.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(rf, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8550787194930507 [0.87089716 0.85667396 0.8380744  0.85323111 0.85651698]
Balanced Accuracy 0.8550760105954163 [0.87089716 0.85667396 0.8380744  0.85323813 0.85649641]
Recall 0.8734615532266113 [0.88840263 0.88183807 0.86214442 0.85964912 0.87527352]
Precision 0.8425695925185197 [0.85835095 0.83958333 0.82254697 0.84848485 0.84388186]
F1 0.8577024619698392 [0.87311828 0.8601921  0.84188034 0.8540305  0.85929108]


In [16]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training points.
knn= KNeighborsClassifier(n_neighbors=3)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# model once per metric with separate cross_val_score calls.
# NOTE(review): the folds come from the under-sampled (class-balanced)
# training set, so precision and F1 are measured at a 50/50 class ratio.
cv_out = cross_validate(knn, X_resampled_s, y_resampled_s, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "KNN",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.7092838431506012 [0.71772429 0.69912473 0.702407   0.71303395 0.71412924]
Balanced Accuracy 0.7092853852355177 [0.71772429 0.69912473 0.702407   0.71279368 0.71437723]
Recall 0.4820540135897732 [0.49452954 0.46827133 0.46608315 0.49342105 0.48796499]
Precision 0.8835846017967903 [0.89328063 0.8699187  0.88381743 0.87890625 0.892     ]
F1 0.6237222468300944 [0.63661972 0.60881935 0.61031519 0.63202247 0.63083451]


In [20]:
from imblearn.ensemble import EasyEnsembleClassifier

# Seeded Easy Ensemble with 100 estimators.
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

# Score every metric from a single fit per fold instead of refitting the
# 100-estimator ensemble once per metric with separate cross_val_score calls.
# NOTE(review): unlike the preceding model cells, this one is cross-validated
# on the scaled training split without prior under-sampling, so its folds keep
# the original class imbalance. Its precision and F1 are therefore not
# directly comparable with the rows scored on the balanced, resampled data.
cv_out = cross_validate(eec, X_train_scaled, y_train, cv=sk_folds,
                        scoring=["accuracy", sc, "recall", "precision", "f1"])
scores = cv_out["test_accuracy"]
balanced_scores = cv_out[f"test_{sc}"]
recall = cv_out["test_recall"]
precision = cv_out["test_precision"]
f1 = cv_out["test_f1"]

print('Accuracy', np.mean(scores), scores)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
print('Recall', np.mean(recall), recall)
print('Precision', np.mean(precision), precision)
print('F1', np.mean(f1), f1)

# Record the fold means for the comparison table.
results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8714666666666666 [0.876      0.87311111 0.868      0.87377778 0.86644444]
Balanced Accuracy 0.8612822787422495 [0.85707221 0.87212926 0.85181725 0.8647369  0.86065577]
Recall 0.848504741064916 [0.83333333 0.87089716 0.83150985 0.85339168 0.85339168]
Precision 0.43232412013003163 [0.44083527 0.43736264 0.42363434 0.43771044 0.42207792]
F1 0.5727342474814148 [0.57663126 0.582297   0.56129985 0.57863501 0.56480811]


In [24]:
# Tabulate the per-model cross-validation means, best recall first.
cv_summary = pd.DataFrame(results)
cv_summary.sort_values(by="Recall", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
4,Random Forest,0.855079,0.855076,0.873462,0.84257,0.857702
3,XGBoost,0.861209,0.861217,0.8691,0.856003,0.862258
1,Support Vector Machine,0.86121,0.86121,0.855958,0.865216,0.860471
2,Logistic Regression,0.862524,0.862524,0.855521,0.867727,0.861502
6,Easy Ensemble,0.871467,0.861282,0.848505,0.432324,0.572734
0,Decision Tree,0.844351,0.844352,0.841505,0.846474,0.843922
5,KNN,0.709284,0.709285,0.482054,0.883585,0.623722


## Validate the top 3 models (ranked by cross-validated recall) with the test data

In [26]:
# Candidate models, re-created with the same classes and hyperparameters as
# the SVM, random-forest and gradient-boosting cross-validation cells.
# NOTE: "XGBoost" labels scikit-learn's GradientBoostingClassifier.
ml = {
    "Support Vector Machine": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators = 13, random_state = 42),
    "XGBoost":GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
}
# Held in a variable so it can be interpolated inside the f-strings below.
newline = '\n'
# NOTE(review): this rebinds `results`, discarding the cross-validation
# records collected earlier; re-running the CV summary cell after this one
# would display the test-set metrics instead.
results = []
for x, model in ml.items():
    # Train on the under-sampled training data, evaluate on the untouched,
    # scaled test split.
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    # Reuse the predictions for accuracy; model.score() would run a second,
    # identical predict pass over the test set.
    accuracy = accuracy_score(y_test, y_pred_s)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model {x}:{newline}')
    print(f'Confusion Matrix: {newline}{confusion_matrix(y_test,y_pred_s)}{newline}')
    print(f'Classification Report: {newline}{classification_report_imbalanced(y_test, y_pred_s)}{newline}')
    results.append({
        "name": x,
        "Accuracy":accuracy,
        "Balanced_Accuracy":balanced_accuracy,
        "Precision":precision,
        "Recall":recall
    })

# Test-set metrics per model, best recall first.
pd.DataFrame(results).sort_values("Recall", ascending=False)

model Support Vector Machine:

Confusion Matrix: 
[[5808  931]
 [  93  668]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.86      0.88      0.92      0.87      0.76      6739
          1       0.42      0.88      0.86      0.57      0.87      0.76       761

avg / total       0.93      0.86      0.88      0.88      0.87      0.76      7500


model Random Forest:

Confusion Matrix: 
[[5666 1073]
 [  90  671]]

Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.84      0.88      0.91      0.86      0.74      6739
          1       0.38      0.88      0.84      0.54      0.86      0.74       761

avg / total       0.92      0.84      0.88      0.87      0.86      0.74      7500


model XGBoost:

Confusion Matrix: 
[[5751  988]
 [  91  670]]

Classification Report: 
                   pre       rec       spe    

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
1,Random Forest,0.844933,0.861256,0.384748,0.881735
2,XGBoost,0.856133,0.866906,0.404101,0.88042
0,Support Vector Machine,0.863467,0.869821,0.417761,0.877792
