In [1]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
file = '../clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(50000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
511070,0,1,0,1,1,0,0,1,1,0,0,0,0,0,1,1,0,5
1154373,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,5
1110804,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4
679791,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2
1413101,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [5]:
Counter(clean_df.death)

Counter({1: 5207, 0: 44793})

In [6]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# # Resample the training data with SMOTE
# from imblearn.over_sampling import SMOTE
# X_resampled_s, y_resampled_s = SMOTE(random_state=1,
# sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [10]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled_s, y_resampled_s = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled_s)

Counter({0: 3905, 1: 3905})

In [11]:
# # Use SMOTEENN resampling method
# from imblearn.combine import SMOTEENN 
# smote_enn = SMOTEENN(random_state=0)
# X_resampled_s, y_resampled_s = smote_enn.fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [12]:
keep_col = ['gender',
         'pneumonia', 'new_age',
       'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosup', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'another_complication', 'closed_contact']

In [13]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8500640204865556 [0.85339309 0.85339309 0.84891165 0.85083227 0.84379001]
Balanced Accuracy 0.8500640204865556 [0.85339309 0.85339309 0.84891165 0.85083227 0.84379001]
Recall 0.8522407170294495 [0.85787452 0.8553137  0.8553137  0.84891165 0.84379001]
Precision 0.8485540716645545 [0.85025381 0.85204082 0.84450063 0.85218509 0.84379001]
F1 0.8503858585682089 [0.85404716 0.85367412 0.84987277 0.85054522 0.84379001]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386


In [14]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8620998719590268 [0.86299616 0.868758   0.85979513 0.87003841 0.84891165]
Balanced Accuracy 0.8620998719590268 [0.86299616 0.868758   0.85979513 0.87003841 0.84891165]
Recall 0.8573623559539053 [0.85019206 0.87195903 0.85915493 0.86683739 0.83866837]
Precision 0.8655673182796646 [0.87253614 0.86641221 0.86025641 0.87242268 0.85620915]
F1 0.8614140689663976 [0.8612192  0.86917677 0.85970532 0.86962107 0.84734799]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414


In [15]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8609475032010243 [0.85915493 0.87195903 0.85659411 0.86555698 0.85147247]
Balanced Accuracy 0.8609475032010243 [0.85915493 0.87195903 0.85659411 0.86555698 0.85147247]
Recall 0.8606914212548016 [0.84891165 0.87451985 0.86555698 0.86299616 0.85147247]
Precision 0.8611912329945272 [0.86666667 0.87006369 0.85031447 0.86743887 0.85147247]
F1 0.8609071328005179 [0.85769728 0.87228608 0.85786802 0.86521181 0.85147247]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907


In [16]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "GradientBoostingClassifier",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8597951344430218 [0.86683739 0.86171575 0.85659411 0.87259923 0.84122919]
Balanced Accuracy 0.8597951344430218 [0.86683739 0.86171575 0.85659411 0.87259923 0.84122919]
Recall 0.8747759282970552 [0.88476312 0.87451985 0.87451985 0.89244558 0.84763124]
Precision 0.8492733808043272 [0.85414091 0.85268414 0.84425216 0.85837438 0.8369153 ]
F1 0.8618167021001113 [0.86918239 0.86346397 0.8591195  0.87507847 0.84223919]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817


In [17]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest2",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8419974391805377 [0.84314981 0.85659411 0.83866837 0.84699104 0.82458387]
Balanced Accuracy 0.8419974391805377 [0.84314981 0.85659411 0.83866837 0.84699104 0.82458387]
Recall 0.819718309859155 [0.81434059 0.82714469 0.82586428 0.83354673 0.79769526]
Precision 0.8580442116179426 [0.86413043 0.87891156 0.84756899 0.85657895 0.84303112]
F1 0.8383916801060562 [0.83849703 0.85224274 0.83657588 0.84490591 0.81973684]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392


In [18]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf4 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest4",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8606914212548015 [0.86171575 0.87259923 0.86299616 0.86299616 0.84314981]
Balanced Accuracy 0.8606914212548015 [0.86171575 0.87259923 0.86299616 0.86299616 0.84314981]
Recall 0.8755441741357235 [0.87708067 0.87195903 0.89372599 0.89628681 0.83866837]
Precision 0.8505152503232762 [0.85093168 0.87307692 0.84197829 0.84033613 0.84625323]
F1 0.8626521152145123 [0.86380832 0.87251762 0.86708075 0.86741016 0.84244373]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392
5,Random Forest4,0.860691,0.860691,0.875544,0.850515,0.862652


In [19]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf6 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest6",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8620998719590268 [0.85979513 0.86939821 0.85979513 0.86619718 0.8553137 ]
Balanced Accuracy 0.8620998719590268 [0.85979513 0.86939821 0.85979513 0.86619718 0.8553137 ]
Recall 0.8891165172855313 [0.88732394 0.89116517 0.89116517 0.89372599 0.8822023 ]
Precision 0.8435659576005594 [0.84101942 0.85398773 0.83855422 0.84708738 0.83718104]
F1 0.8657351238333508 [0.8635514  0.87218045 0.86405959 0.86978193 0.85910224]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392
5,Random Forest4,0.860691,0.860691,0.875544,0.850515,0.862652
6,Random Forest6,0.8621,0.8621,0.889117,0.843566,0.865735


In [20]:
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Balanced Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8507466666666668 [0.84973333 0.8496     0.84786667 0.854      0.85253333]
Balanced Accuracy 0.8662305320646496 [0.86521234 0.86683529 0.86473631 0.86815944 0.86620928]
Recall 0.8857874519846349 [0.88476312 0.88860435 0.88604353 0.88604353 0.88348271]
Precision 0.40178072250523683 [0.39988426 0.4        0.39678899 0.40753828 0.40469208]
F1 0.5528006855935836 [0.55081706 0.55166932 0.54811881 0.55828963 0.55510861]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392
5,Random Forest4,0.860691,0.860691,0.875544,0.850515,0.862652
6,Random Forest6,0.8621,0.8621,0.889117,0.843566,0.865735
7,Balanced Random Forest,0.850747,0.866231,0.885787,0.401781,0.552801


In [21]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8745333333333333 [0.87466667 0.87253333 0.87573333 0.87733333 0.8724    ]
Balanced Accuracy 0.859137912076499 [0.85706233 0.86549009 0.86274976 0.8608138  0.84957358]
Recall 0.8396927016645327 [0.83482714 0.85659411 0.84635083 0.83994878 0.82074264]
Precision 0.44566337747121365 [0.4456596  0.44216788 0.44874406 0.452102   0.43964335]
F1 0.5822538757571811 [0.58110517 0.58326068 0.58651287 0.58781362 0.57257704]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392
5,Random Forest4,0.860691,0.860691,0.875544,0.850515,0.862652
6,Random Forest6,0.8621,0.8621,0.889117,0.843566,0.865735
7,Balanced Random Forest,0.850747,0.866231,0.885787,0.401781,0.552801
8,Easy Ensemble,0.874533,0.859138,0.839693,0.445663,0.582254


In [22]:
pd.DataFrame(results).sort_values("Recall", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
6,Random Forest6,0.8621,0.8621,0.889117,0.843566,0.865735
7,Balanced Random Forest,0.850747,0.866231,0.885787,0.401781,0.552801
5,Random Forest4,0.860691,0.860691,0.875544,0.850515,0.862652
3,GradientBoostingClassifier,0.859795,0.859795,0.874776,0.849273,0.861817
2,Logistic Regression,0.860948,0.860948,0.860691,0.861191,0.860907
1,Support Vector Machine,0.8621,0.8621,0.857362,0.865567,0.861414
0,Decision Tree,0.850064,0.850064,0.852241,0.848554,0.850386
8,Easy Ensemble,0.874533,0.859138,0.839693,0.445663,0.582254
4,Random Forest2,0.841997,0.841997,0.819718,0.858044,0.838392


### Validation using test dataset

In [23]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
ml = {
    "Support Vector Machines": SVC(kernel='linear'),
    "LogisticRegression": LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1),
    "Random Forest_2": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2),
    "Random Forest_4": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4),
    "Random Forest_6": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "GradientBoostingClassifier":  GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
}

test_results = []
for x in ml:
    model = ml[x]
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    accuracy = model.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
    #calculating precision and reall
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
    test_results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision": precision,
    "Recall": recall
})
pd.DataFrame(test_results).sort_values("Accuracy", ascending=False)

modelSupport Vector Machines: [[9793 1405]
 [ 180 1122]]
modelLogisticRegression: [[9762 1436]
 [ 165 1137]]
modelRandom Forest_2: [[9723 1475]
 [ 217 1085]]
modelRandom Forest_4: [[9709 1489]
 [ 189 1113]]
modelRandom Forest_6: [[9486 1712]
 [ 134 1168]]
modelDecision Tree: [[9592 1606]
 [ 173 1129]]
modelGradientBoostingClassifier: [[9431 1767]
 [ 137 1165]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
0,Support Vector Machines,0.8732,0.868141,0.444005,0.861751
1,LogisticRegression,0.87192,0.872517,0.441897,0.873272
3,Random Forest_4,0.86576,0.860934,0.427748,0.854839
2,Random Forest_2,0.86464,0.850807,0.423828,0.833333
5,Decision Tree,0.85768,0.861855,0.412797,0.867127
4,Random Forest_6,0.85232,0.872098,0.405556,0.897081
6,GradientBoostingClassifier,0.84768,0.868491,0.39734,0.894777


In [24]:
from imblearn.ensemble import BalancedRandomForestClassifier
BE = {
    "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
    "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)
}

for x in BE:
    model2 = BE[x]
    model2.fit(X_train_scaled, y_train)
    y_pred_s2 = model2.predict(X_test_scaled)
    accuracy = model2.score(X_test_scaled, y_test)
    balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s2)
    #calculating precision and reall
    precision = precision_score(y_test, y_pred_s2)
    recall = recall_score(y_test, y_pred_s2)
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s2)}')
    test_results.append({
    "name": x,
    "Accuracy":accuracy,
    "Balanced_Accuracy":balanced_accuracy,
    "Precision": precision,
    "Recall": recall
})
pd.DataFrame(test_results).sort_values("Accuracy", ascending=False)

modelBalanced Random Forest: [[9522 1676]
 [ 129 1173]]
modelEasy Ensemble: [[9911 1287]
 [ 194 1108]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
8,Easy Ensemble,0.88152,0.868034,0.46263,0.850998
0,Support Vector Machines,0.8732,0.868141,0.444005,0.861751
1,LogisticRegression,0.87192,0.872517,0.441897,0.873272
3,Random Forest_4,0.86576,0.860934,0.427748,0.854839
2,Random Forest_2,0.86464,0.850807,0.423828,0.833333
5,Decision Tree,0.85768,0.861855,0.412797,0.867127
7,Balanced Random Forest,0.8556,0.875626,0.411723,0.900922
4,Random Forest_6,0.85232,0.872098,0.405556,0.897081
6,GradientBoostingClassifier,0.84768,0.868491,0.39734,0.894777


In [25]:
pd.DataFrame(test_results).sort_values("Recall", ascending=False)

Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
7,Balanced Random Forest,0.8556,0.875626,0.411723,0.900922
4,Random Forest_6,0.85232,0.872098,0.405556,0.897081
6,GradientBoostingClassifier,0.84768,0.868491,0.39734,0.894777
1,LogisticRegression,0.87192,0.872517,0.441897,0.873272
5,Decision Tree,0.85768,0.861855,0.412797,0.867127
0,Support Vector Machines,0.8732,0.868141,0.444005,0.861751
3,Random Forest_4,0.86576,0.860934,0.427748,0.854839
8,Easy Ensemble,0.88152,0.868034,0.46263,0.850998
2,Random Forest_2,0.86464,0.850807,0.423828,0.833333


In [None]:
# # # Create a method that creates a new Sequential model with hyperparameter options
# def create_model(hp):
#     nn_model = tf.keras.models.Sequential()

#     # Allow kerastuner to decide which activation function to use in hidden layers
#     activation = hp.Choice('activation',['relu','tanh','sigmoid'])
#     first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    
#     # Allow kerastuner to decide number of neurons in first layer
#     nn_model.add(tf.keras.layers.Dense(units=first_units, activation=activation, input_dim=len(X_train_scaled[0])))

#     # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
#     for i in range(hp.Int('num_layers', 1, 6)):
#         next_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

#         nn_model.add(tf.keras.layers.Dense(units=next_units, activation=activation))
    
#     nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#     # Compile the model
#     nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["Accuracy"])

#     return nn_model

In [None]:
# import keras_tuner as kt

# tuner = kt.Hyperband(
#     create_model, # function name
#     objective="val_accuracy",
#     max_epochs=20,
#     hyperband_iterations=2)

In [None]:
# # Run the kerastuner search for best hyperparameters
# tuner.search(X_resampled_s, y_resampled_s,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# # Get best model hyperparameters
# best_hyper = tuner.get_best_hyperparameters(1)[0]
# best_hyper.values

In [None]:
# best_model = tuner.get_best_models(1)[0]
# fit_model = best_model.fit(X_resampled_s, y_resampled_s, epochs=20)

In [None]:
# # Evaluate best model against full test data
# model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# import numpy as np
# from sklearn.feature_selection import SelectKBest, chi2, f_classif

# # Perform feature selection
# selector = SelectKBest(f_classif, k=10)
# selector.fit(X, y)

# # Get the raw p-values for each feature, and transform from p-values into scores
# scores = -np.log10(selector.pvalues_)

# # Plot the scores.
# plt.bar(range(len(keep_col)), scores)
# plt.xticks(range(len(keep_col)), X, rotation='vertical')
# plt.show()