In [19]:
# Import libraries

import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
file = 'clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(30000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
247510,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,2
135608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1307306,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,3
278856,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3
872001,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3


In [5]:
Counter(clean_df.ICU)

Counter({0: 29492, 1: 508})

In [6]:
y= clean_df['ICU']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Resample the training data with SMOTE
X_resampled_s, y_resampled_s = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 22119, 1: 22119})

In [11]:
results = []
sc = "balanced_accuracy"

clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.9446631565220359 [0.93433544 0.94495931 0.94597649 0.9491353  0.94890923]
Balanced Accuracy 0.944663115377083 [0.93433544 0.94495931 0.94597649 0.94913906 0.94890527]
Recall 0.9741402515782411 [0.94529837 0.98214286 0.97694394 0.98236491 0.98395118]
Precision 0.9199593407153829 [0.92501659 0.91415948 0.91996594 0.92113632 0.91951838]
F1 0.9461978763681052 [0.93504751 0.94693255 0.94759921 0.95076586 0.95064425]


In [16]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "SVM",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8695239735540351 [0.87319168 0.86900995 0.86482821 0.86718662 0.87340341]
Balanced Accuracy 0.8695239294514658 [0.87319168 0.86900995 0.86482821 0.86718798 0.87340183]
Recall 0.8849402923809006 [0.88675407 0.88562387 0.88562387 0.87926747 0.88743219]
Precision 0.8584950050346783 [0.86333627 0.85714286 0.85026042 0.8584989  0.86323659]
F1 0.8715094649095085 [0.87488849 0.87115064 0.86758193 0.86875908 0.87516719]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.944663,0.944663,0.97414,0.919959,0.946198
1,Logistic Regression,0.872124,0.872124,0.890366,0.859053,0.874422
2,XGBoost,0.930535,0.930535,0.965912,0.902124,0.932899
3,Random Forest,0.945771,0.945771,0.973643,0.922256,0.9472
4,Random Forest,0.945771,0.945771,0.973643,0.922256,0.9472
5,SVM,0.869524,0.869524,0.88494,0.858495,0.871509


In [12]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.8721235894678083 [0.875      0.87127034 0.86674955 0.87159489 0.87600316]
Balanced Accuracy 0.8721235913781282 [0.875      0.87127034 0.86674955 0.87159673 0.87600134]
Recall 0.8903656355750129 [0.89082278 0.89059675 0.89037071 0.88785892 0.89217902]
Precision 0.8590526933363506 [0.86349693 0.85745375 0.85020505 0.85986424 0.86424349]
F1 0.874421803038374 [0.87694704 0.87371105 0.86982445 0.87363737 0.8779891 ]


In [13]:
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "XGBoost",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.9305350390024019 [0.92325949 0.93218807 0.92710217 0.9315022  0.93862326]
Balanced Accuracy 0.9305351689896518 [0.92325949 0.93218807 0.92710217 0.9315065  0.93861962]
Recall 0.9659118208738718 [0.94846293 0.97174503 0.96903255 0.96947773 0.97084087]
Precision 0.9021240086491321 [0.90294814 0.90050272 0.89405631 0.90102963 0.91208324]
F1 0.9328993751462893 [0.92514607 0.93476843 0.9300358  0.93400131 0.94054528]


In [15]:
rf = RandomForestClassifier(n_estimators = 13, random_state = 42
sk_folds = StratifiedKFold(n_splits = 5

scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})

Accuracy 0.9457708204434191 [0.93456148 0.94586347 0.94733273 0.95071776 0.95037866]
Balanced Accuracy 0.9457707716404344 [0.93456148 0.94586347 0.94733273 0.95072108 0.95037509]
Recall 0.9736428618445665 [0.94733273 0.98146474 0.97739602 0.980104   0.98191682]
Precision 0.9222562463875891 [0.92373815 0.91622705 0.92196162 0.92568866 0.92366575]
F1 0.9472003050724951 [0.93538668 0.94772454 0.94886987 0.95211948 0.95190095]


In [17]:
pd.DataFrame(results).sort_values("Balanced_Accuracy", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
3,Random Forest,0.945771,0.945771,0.973643,0.922256,0.9472
4,Random Forest,0.945771,0.945771,0.973643,0.922256,0.9472
0,Decision Tree,0.944663,0.944663,0.97414,0.919959,0.946198
2,XGBoost,0.930535,0.930535,0.965912,0.902124,0.932899
1,Logistic Regression,0.872124,0.872124,0.890366,0.859053,0.874422
5,SVM,0.869524,0.869524,0.88494,0.858495,0.871509


## Validate the top 3 models with the test data

In [20]:
ml = {
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators = 13, random_state = 42),
    "XGBoost":GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
}

results = []
for x in ml:
    model = ml[x]
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
    results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision":precision,
    "Recall":recall
})
    
pd.DataFrame(results).sort_values("Recall", ascending=False)

modelDecision Tree Classifier: [[6766  607]
 [  67   60]]
modelRandom Forest: [[6765  608]
 [  66   61]]
modelXGBoost: [[6638  735]
 [  50   77]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
2,XGBoost,0.895333,0.753306,0.094828,0.606299
1,Random Forest,0.910133,0.698926,0.091181,0.480315
0,Decision Tree Classifier,0.910133,0.695057,0.089955,0.472441
