In [22]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.ensemble import EasyEnsembleClassifier

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
file = 'clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(30000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
1247333,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
232051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
527550,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,4
331633,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,4
580987,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3


In [5]:
Counter(clean_df.death)

Counter({0: 26937, 1: 3063})

In [6]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled_s, y_resampled_s = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 20203, 1: 20203})

In [11]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
sc = "balanced_accuracy"

clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8962290923430212 [0.86352388 0.90780844 0.90038362 0.9053335  0.90409603]
Balanced Accuracy 0.896229164675933 [0.86352388 0.90781103 0.90038663 0.90533131 0.90409298]
Recall 0.9104109609179181 [0.84682009 0.92871287 0.92475248 0.92303885 0.92873051]
Precision 0.8851800150315459 [0.87608807 0.89142314 0.88175596 0.8914914  0.88514151]
F1 0.8974074397942623 [0.86120549 0.90968602 0.90274254 0.90699088 0.90641227]


In [15]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8632629640970304 [0.85993566 0.86672442 0.86226952 0.86424947 0.86313575]
Balanced Accuracy 0.8632631492547919 [0.85993566 0.86672406 0.86226895 0.86425056 0.86313651]
Recall 0.8578431840956926 [0.85523385 0.86386139 0.85767327 0.85548132 0.8569661 ]
Precision 0.8672488079078274 [0.86335249 0.86880757 0.8656008  0.87078086 0.86770233]
F1 0.8625168587876935 [0.85927399 0.86632742 0.8616188  0.86306329 0.8623008 ]


In [12]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8670247981944504 [0.86302895 0.86994184 0.86214577 0.86845687 0.87155055]
Balanced Accuracy 0.8670246434442997 [0.86302895 0.86994265 0.86214461 0.86845676 0.87155024]
Recall 0.866455293636268 [0.8596882  0.87648515 0.85272277 0.86933927 0.87404108]
Precision 0.8674501161321903 [0.86547085 0.8651356  0.86907164 0.86783597 0.86973652]
F1 0.8669266540955206 [0.86256983 0.87077339 0.86081959 0.86858697 0.87188349]


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "XGBoost",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8898436582509784 [0.86698837 0.89815617 0.8966712  0.89135008 0.89605247]
Balanced Accuracy 0.8898436753474901 [0.86698837 0.89815894 0.89667339 0.89134806 0.89604962]
Recall 0.9040749765399703 [0.85869834 0.92054455 0.91435644 0.90769611 0.91907944]
Precision 0.8789932017303688 [0.87317564 0.88107084 0.88309825 0.87898394 0.87863733]
F1 0.8912440198365145 [0.86587648 0.90037526 0.89845555 0.89310933 0.89840348]


In [14]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 13, random_state = 42)

sk_folds = StratifiedKFold(n_splits = 5)


scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8959320259016901 [0.86649344 0.90558099 0.90075486 0.90459102 0.90223982]
Balanced Accuracy 0.8959322146023065 [0.86649344 0.90558382 0.90075816 0.90458907 0.90223657]
Recall 0.9123412987178451 [0.8569661  0.92846535 0.92747525 0.92031675 0.92848305]
Precision 0.8832605140888081 [0.87361251 0.88781065 0.88040414 0.89227447 0.8822008 ]
F1 0.8974096530935285 [0.86520924 0.907683   0.9033269  0.90607869 0.90475042]


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=3)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "KNN",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.7362301996663881 [0.61878248 0.77960648 0.75646578 0.76488058 0.76141567]
Balanced Accuracy 0.7362303101134167 [0.61878248 0.77958579 0.75644155 0.76490315 0.76143857]
Recall 0.5189395209008651 [0.26280624 0.61237624 0.56064356 0.58252908 0.57634249]
Precision 0.9172096207123686 [0.91237113 0.92004463 0.92148088 0.91702376 0.9151277 ]
F1 0.6520518024515136 [0.40806916 0.73532471 0.69713758 0.71246973 0.70725782]


In [17]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8754149316125235 [0.8596882  0.88256404 0.87452048 0.87909912 0.88120282]
Balanced Accuracy 0.8754150158891167 [0.8596882  0.88256362 0.87451873 0.879101   0.88120352]
Recall 0.8611600892828705 [0.82677555 0.87920792 0.86039604 0.86389508 0.87552586]
Precision 0.8864317632000847 [0.88503311 0.88512335 0.88537952 0.89101582 0.88560701]
F1 0.8735122393392606 [0.854913   0.88215572 0.87270901 0.87724589 0.88053758]


In [18]:
pd.DataFrame(results).sort_values("Balanced_Accuracy", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.896229,0.896229,0.910411,0.88518,0.897407
3,Random Forest,0.895932,0.895932,0.912341,0.883261,0.89741
2,XGBoost,0.889844,0.889844,0.904075,0.878993,0.891244
6,Easy Ensemble,0.875415,0.875415,0.86116,0.886432,0.873512
1,Logistic Regression,0.867025,0.867025,0.866455,0.86745,0.866927
4,Support Vector Machine,0.863263,0.863263,0.857843,0.867249,0.862517
5,KNN,0.73623,0.73623,0.51894,0.91721,0.652052


## Validate the top 3 models with the test data

In [23]:
ml = {
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators = 13, random_state = 42),
    "XGBoost":GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
}

results = []
for x in ml:
    model = ml[x]
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
    results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision":precision,
    "Recall":recall
})
    
pd.DataFrame(results).sort_values("Recall", ascending=False)

modelDecision Tree Classifier: [[5942  792]
 [ 179  587]]
modelRandom Forest: [[5955  779]
 [ 157  609]]
modelXGBoost: [[5931  803]
 [ 130  636]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
2,XGBoost,0.8756,0.855521,0.441974,0.830287
1,Random Forest,0.8752,0.839679,0.438761,0.795039
0,Decision Tree Classifier,0.870533,0.824353,0.425671,0.766319
