In [1]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
file = '../clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(50000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
1201457,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3
1073651,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
247712,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
869662,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,2
1243653,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,5


In [5]:
Counter(clean_df.death)

Counter({0: 44841, 1: 5159})

In [6]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled_s, y_resampled_s = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 33631, 1: 33631})

In [10]:
# # Use SMOTEENN resampling method
# from imblearn.combine import SMOTEENN 
# smote_enn = SMOTEENN(random_state=0)
# X_resampled_s, y_resampled_s = smote_enn.fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [11]:
keep_col = ['gender',
         'pneumonia', 'new_age',
       'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosup', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'another_complication', 'closed_contact']

In [12]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8895515911509545 [0.86434253 0.89563666 0.89614927 0.89451383 0.89711567]
Balanced Accuracy 0.8895510350330401 [0.8643413  0.8956351  0.89614927 0.89451383 0.89711567]
Recall 0.9039870174032941 [0.84790366 0.91660473 0.92714838 0.91183467 0.91644365]
Precision 0.8786199429767741 [0.87671022 0.87972607 0.87302254 0.88130479 0.8823361 ]
F1 0.8909002586671066 [0.86206636 0.89778684 0.89927176 0.89630983 0.89906651]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909


In [13]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8708186013136914 [0.86828217 0.87259347 0.87042819 0.86961047 0.87317871]
Balanced Accuracy 0.8708186142882383 [0.86828298 0.87259272 0.87042819 0.86961047 0.87317871]
Recall 0.8821622744138782 [0.87927446 0.88271146 0.88432947 0.87912578 0.88537021]
Precision 0.862594906136431 [0.86034332 0.86521929 0.86040793 0.86270791 0.86429608]
F1 0.8722668277159678 [0.86970588 0.87387785 0.87220471 0.87083947 0.87470623]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267


In [14]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8681127580235011 [0.86605218 0.86984316 0.86782635 0.8661909  0.8706512 ]
Balanced Accuracy 0.8681127345250728 [0.86605195 0.86984327 0.86782635 0.8661909  0.8706512 ]
Recall 0.8669679366054778 [0.86292001 0.86829196 0.86946179 0.86351472 0.8706512 ]
Precision 0.868958626586487 [0.86834231 0.87101103 0.86662715 0.86816143 0.8706512 ]
F1 0.8679594485443107 [0.86562267 0.86964937 0.86804216 0.86583184 0.8706512 ]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959


In [15]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "GradientBoostingClassifier",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8859387554238621 [0.86939716 0.88991303 0.89399346 0.88648528 0.88990485]
Balanced Accuracy 0.8859383396497205 [0.86939675 0.88991137 0.89399346 0.88648528 0.88990485]
Recall 0.9036897920386074 [0.86381207 0.91229374 0.92283675 0.90469819 0.91480821]
Precision 0.8727175016473698 [0.87355285 0.8732214  0.87250492 0.87290202 0.87140632]
F1 0.8878092468183467 [0.86865515 0.89233006 0.89696532 0.88851573 0.89257997]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809


In [16]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest2",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8554905594280626 [0.84850963 0.85646324 0.85779066 0.85518882 0.85950045]
Balanced Accuracy 0.8554905137939649 [0.84850929 0.85646335 0.85779066 0.85518882 0.85950045]
Recall 0.8588208426496673 [0.84403806 0.85506169 0.86143324 0.87317871 0.86039251]
Precision 0.8532069911198515 [0.85163516 0.8574836  0.85520295 0.84285304 0.8588602 ]
F1 0.8559541814499682 [0.84781959 0.85627093 0.85830679 0.85774792 0.85962567]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954


In [17]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf4 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest4",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8703428687830026 [0.86144354 0.87720211 0.87035385 0.87109723 0.8716176 ]
Balanced Accuracy 0.8703425436021666 [0.86144355 0.87720047 0.87035385 0.87109723 0.8716176 ]
Recall 0.8893280043969604 [0.86158192 0.89921213 0.88596491 0.89785905 0.902022  ]
Precision 0.8568677110033892 [0.8613258  0.86131283 0.85914072 0.85224386 0.85031535]
F1 0.8727035703855707 [0.86145384 0.87985455 0.87234665 0.87445699 0.87540581]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954
5,Random Forest4,0.870343,0.870343,0.889328,0.856868,0.872704


In [18]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf6 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest6",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8801553006459265 [0.87296514 0.88151342 0.88157895 0.88046387 0.88425513]
Balanced Accuracy 0.8801551688706943 [0.87296665 0.88151125 0.88157895 0.88046387 0.88425513]
Recall 0.9115697098263393 [0.89325007 0.91065854 0.91629497 0.91897116 0.9186738 ]
Precision 0.8576986276224442 [0.85840834 0.86051412 0.85680523 0.85325787 0.85950758]
F1 0.8837827767269673 [0.8754827  0.8848765  0.88555212 0.88489621 0.88810636]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954
5,Random Forest4,0.870343,0.870343,0.889328,0.856868,0.872704
6,Random Forest6,0.880155,0.880155,0.91157,0.857699,0.883783


In [19]:
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Balanced Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8455733333333333 [0.84706667 0.8472     0.83946667 0.84666667 0.84746667]
Balanced Accuracy 0.8618649568546868 [0.8534887  0.86335913 0.85618922 0.87277993 0.8635078 ]
Recall 0.8823931726786807 [0.86157827 0.88372093 0.87726098 0.90568475 0.88372093]
Precision 0.3902126523021826 [0.39038687 0.39310345 0.37975391 0.39426322 0.39355581]
F1 0.5410959700576266 [0.53731343 0.54415274 0.53005464 0.54937304 0.54458599]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954
5,Random Forest4,0.870343,0.870343,0.889328,0.856868,0.872704
6,Random Forest6,0.880155,0.880155,0.91157,0.857699,0.883783
7,Balanced Random Forest,0.845573,0.861865,0.882393,0.390213,0.541096


In [20]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8728533333333333 [0.87066667 0.87573333 0.8716     0.8728     0.87346667]
Balanced Accuracy 0.8595754161425733 [0.84603453 0.85983122 0.86038502 0.87648878 0.85513753]
Recall 0.8428462549013709 [0.81500647 0.83979328 0.84625323 0.88113695 0.83204134]
Precision 0.43941414470668283 [0.43239533 0.44581619 0.43695797 0.44170984 0.44019139]
F1 0.5775999937469813 [0.56502242 0.58243728 0.57633084 0.58843831 0.57577112]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954
5,Random Forest4,0.870343,0.870343,0.889328,0.856868,0.872704
6,Random Forest6,0.880155,0.880155,0.91157,0.857699,0.883783
7,Balanced Random Forest,0.845573,0.861865,0.882393,0.390213,0.541096
8,Easy Ensemble,0.872853,0.859575,0.842846,0.439414,0.5776


In [21]:
pd.DataFrame(results).sort_values("Recall", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
6,Random Forest6,0.880155,0.880155,0.91157,0.857699,0.883783
0,Decision Tree,0.889552,0.889551,0.903987,0.87862,0.8909
3,GradientBoostingClassifier,0.885939,0.885938,0.90369,0.872718,0.887809
5,Random Forest4,0.870343,0.870343,0.889328,0.856868,0.872704
7,Balanced Random Forest,0.845573,0.861865,0.882393,0.390213,0.541096
1,Support Vector Machine,0.870819,0.870819,0.882162,0.862595,0.872267
2,Logistic Regression,0.868113,0.868113,0.866968,0.868959,0.867959
4,Random Forest2,0.855491,0.855491,0.858821,0.853207,0.855954
8,Easy Ensemble,0.872853,0.859575,0.842846,0.439414,0.5776


### Validation using test dataset

In [22]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
ml = {
    "Support Vector Machines": SVC(kernel='linear'),
    "LogisticRegression": LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1),
    "Random Forest_2": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2),
    "Random Forest_4": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4),
    "Random Forest_6": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "GradientBoostingClassifier":  GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
}

test_results = []
for x in ml:
    model = ml[x]
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    #calculating precision and reall
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
    test_results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision": precision,
    "Recall": recall
})
pd.DataFrame(test_results).sort_values("Recall", ascending=False)

modelSupport Vector Machines: [[9560 1650]
 [ 175 1115]]
modelLogisticRegression: [[9674 1536]
 [ 196 1094]]
modelRandom Forest_2: [[9586 1624]
 [ 218 1072]]
modelRandom Forest_4: [[9534 1676]
 [ 163 1127]]
modelRandom Forest_6: [[9553 1657]
 [ 157 1133]]
modelDecision Tree: [[9759 1451]
 [ 300  990]]
modelGradientBoostingClassifier: [[9697 1513]
 [ 204 1086]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
4,Random Forest_6,0.85488,0.86524,0.406093,0.878295
3,Random Forest_4,0.85288,0.862067,0.402069,0.873643
0,Support Vector Machines,0.854,0.858576,0.403255,0.864341
1,LogisticRegression,0.86144,0.855521,0.41597,0.848062
6,GradientBoostingClassifier,0.86264,0.853446,0.417853,0.84186
2,Random Forest_2,0.85264,0.843069,0.397626,0.831008
5,Decision Tree,0.85992,0.819002,0.405571,0.767442


In [23]:
from imblearn.ensemble import BalancedRandomForestClassifier
BE = {
    "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
    "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)
}

for x in BE:
    model2 = BE[x]
    model2.fit(X_train_scaled, y_train)
    y_pred_s2 = model2.predict(X_test_scaled)
    accuracy = model2.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s2)
    #calculating precision and reall
    precision = precision_score(y_test, y_pred_s2)
    recall = recall_score(y_test, y_pred_s2)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s2)}')
    test_results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision": precision,
    "Recall": recall
})
pd.DataFrame(test_results).sort_values("Recall", ascending=False)

modelBalanced Random Forest: [[9352 1858]
 [ 154 1136]]
modelEasy Ensemble: [[9789 1421]
 [ 216 1074]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
7,Balanced Random Forest,0.83904,0.857438,0.379426,0.88062
4,Random Forest_6,0.85488,0.86524,0.406093,0.878295
3,Random Forest_4,0.85288,0.862067,0.402069,0.873643
0,Support Vector Machines,0.854,0.858576,0.403255,0.864341
1,LogisticRegression,0.86144,0.855521,0.41597,0.848062
6,GradientBoostingClassifier,0.86264,0.853446,0.417853,0.84186
8,Easy Ensemble,0.86904,0.852898,0.430461,0.832558
2,Random Forest_2,0.85264,0.843069,0.397626,0.831008
5,Decision Tree,0.85992,0.819002,0.405571,0.767442


In [None]:
# # # Create a method that creates a new Sequential model with hyperparameter options
# def create_model(hp):
#     nn_model = tf.keras.models.Sequential()

#     # Allow kerastuner to decide which activation function to use in hidden layers
#     activation = hp.Choice('activation',['relu','tanh','sigmoid'])
#     first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    
#     # Allow kerastuner to decide number of neurons in first layer
#     nn_model.add(tf.keras.layers.Dense(units=first_units, activation=activation, input_dim=len(X_train_scaled[0])))

#     # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
#     for i in range(hp.Int('num_layers', 1, 6)):
#         next_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

#         nn_model.add(tf.keras.layers.Dense(units=next_units, activation=activation))
    
#     nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#     # Compile the model
#     nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["Accuracy"])

#     return nn_model

In [None]:
# import keras_tuner as kt

# tuner = kt.Hyperband(
#     create_model, # function name
#     objective="val_accuracy",
#     max_epochs=20,
#     hyperband_iterations=2)

In [None]:
# # Run the kerastuner search for best hyperparameters
# tuner.search(X_resampled_s, y_resampled_s,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# # Get best model hyperparameters
# best_hyper = tuner.get_best_hyperparameters(1)[0]
# best_hyper.values

In [None]:
# best_model = tuner.get_best_models(1)[0]
# fit_model = best_model.fit(X_resampled_s, y_resampled_s, epochs=20)

In [None]:
# # Evaluate best model against full test data
# model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# import numpy as np
# from sklearn.feature_selection import SelectKBest, chi2, f_classif

# # Perform feature selection
# selector = SelectKBest(f_classif, k=10)
# selector.fit(X, y)

# # Get the raw p-values for each feature, and transform from p-values into scores
# scores = -np.log10(selector.pvalues_)

# # Plot the scores.
# plt.bar(range(len(keep_col)), scores)
# plt.xticks(range(len(keep_col)), X, rotation='vertical')
# plt.show()