In [1]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
file = '../clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(50000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
675367,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,5
173720,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,5
515104,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,2
12223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1072844,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4


In [5]:
Counter(clean_df.death)

Counter({1: 5017, 0: 44983})

In [6]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# # Resample the training data with SMOTE
# from imblearn.over_sampling import SMOTE
# X_resampled_s, y_resampled_s = SMOTE(random_state=1,
# sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [9]:
# Use SMOTEENN resampling method
from imblearn.combine import SMOTEENN 
smote_enn = SMOTEENN(random_state=0)
X_resampled_s, y_resampled_s = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 27484, 1: 10131})

In [None]:
keep_col = ['gender',
         'pneumonia', 'new_age',
       'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosup', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'another_complication', 'closed_contact']

In [10]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.996357835969693 [0.9896318  0.99906952 0.99853782 0.99827197 0.99627808]
Balanced Accuracy 0.9947964211612843 [0.98199691 0.99936329 0.99837612 0.99835004 0.99589575]
Recall 0.991412135577405 [0.96544916 1.         0.99802567 0.99851925 0.9950666 ]
Precision 0.995053893896028 [0.99592668 0.99655681 0.99655002 0.99508116 0.99115479]
F1 0.9931835898736787 [0.98045113 0.99827544 0.9972873  0.99679724 0.99310684]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184


In [11]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.9559750099694272 [0.9617174  0.95640037 0.95586867 0.95507112 0.95081749]
Balanced Accuracy 0.9413928986644698 [0.9530781  0.94071311 0.94175177 0.93840102 0.93302049]
Recall 0.9097833736170777 [0.93435341 0.90671273 0.91115499 0.90227048 0.89442526]
Precision 0.9255059909403714 [0.92431641 0.92965587 0.92392392 0.92886179 0.92077197]
F1 0.917524856384915 [0.92930781 0.91804098 0.91749503 0.91537306 0.90740741]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525


In [12]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.9562142762195933 [0.96158447 0.95520404 0.95733085 0.95520404 0.95174797]
Balanced Accuracy 0.9412449814968908 [0.95283131 0.93818032 0.94368731 0.93786865 0.93365732]
Recall 0.9087962067858832 [0.93385982 0.90128332 0.91411649 0.90029615 0.89442526]
Precision 0.9272191805914968 [0.92427943 0.93020886 0.92646323 0.93108729 0.92405708]
F1 0.9178487346839536 [0.92904493 0.91551767 0.92024845 0.91543287 0.90899975]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "GradientBoostingClassifier",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.985830120962382 [0.98178918 0.98923302 0.97753556 0.99056228 0.99003057]
Balanced Accuracy 0.9810171515619412 [0.96992953 0.98639902 0.97605716 0.98652945 0.98617059]
Recall 0.9705846199699906 [0.94422507 0.98025666 0.97285291 0.97778875 0.9777997 ]
Precision 0.9769682629809321 [0.98760971 0.97977306 0.94532374 0.98704534 0.98508946]
F1 0.9736324201739688 [0.96543023 0.9800148  0.95889078 0.98239524 0.98143105]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849
3,GradientBoostingClassifier,0.98583,0.981017,0.970585,0.976968,0.973632


In [14]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest2",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.945287784128672 [0.94895653 0.94191147 0.94736142 0.94510169 0.9431078 ]
Balanced Accuracy 0.9022948943626498 [0.90897198 0.89745035 0.90601048 0.90056835 0.89847332]
Recall 0.8091015126006221 [0.82230997 0.80108588 0.81638697 0.80404738 0.80167736]
Precision 0.9851005179369444 [0.98579882 0.97948099 0.98569726 0.99027356 0.98425197]
F1 0.8884454727113464 [0.89666308 0.88134673 0.89308855 0.88749659 0.88363241]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849
3,GradientBoostingClassifier,0.98583,0.981017,0.970585,0.976968,0.973632
4,Random Forest2,0.945288,0.902295,0.809102,0.985101,0.888445


In [15]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf4 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest4",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.9498072577429217 [0.95321016 0.95161505 0.95161505 0.94935531 0.94324073]
Balanced Accuracy 0.9210932475066675 [0.92824512 0.92372529 0.92372529 0.91875067 0.91101987]
Recall 0.8588508248224489 [0.87413623 0.86327739 0.86327739 0.85241856 0.84114455]
Precision 0.9499923205169623 [0.94807281 0.95261438 0.95261438 0.95467109 0.94198895]
F1 0.9020936372325845 [0.90960452 0.90574832 0.90574832 0.90065189 0.88871514]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849
3,GradientBoostingClassifier,0.98583,0.981017,0.970585,0.976968,0.973632
4,Random Forest2,0.945288,0.902295,0.809102,0.985101,0.888445
5,Random Forest4,0.949807,0.921093,0.858851,0.949992,0.902094


In [16]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf6 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest6",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.9694802605343613 [0.97394656 0.97062342 0.96995879 0.97288316 0.95998937]
Balanced Accuracy 0.9539997774907422 [0.95677131 0.95200402 0.95638005 0.96134197 0.94350154]
Recall 0.9204434604702264 [0.9195459  0.91164857 0.92694965 0.93632774 0.90774544]
Precision 0.9648218665305521 [0.98259494 0.97776601 0.9601227  0.96196755 0.94165814]
F1 0.9420371777419037 [0.9500255  0.94355045 0.9432446  0.94897449 0.92439086]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849
3,GradientBoostingClassifier,0.98583,0.981017,0.970585,0.976968,0.973632
4,Random Forest2,0.945288,0.902295,0.809102,0.985101,0.888445
5,Random Forest4,0.949807,0.921093,0.858851,0.949992,0.902094
6,Random Forest6,0.96948,0.954,0.920443,0.964822,0.942037


In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=3)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "KNN",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.9954805263857504 [0.99641101 0.9958793  0.99494882 0.99627808 0.99388542]
Balanced Accuracy 0.9939470885712381 [0.99567412 0.99390779 0.99327108 0.994804   0.99207847]
Recall 0.990623083924765 [0.994077   0.98963475 0.98963475 0.99160908 0.98815984]
Precision 0.9925854314223292 [0.9926072  0.99503722 0.99159248 0.99455446 0.9891358 ]
F1 0.9916019999204057 [0.99334155 0.99232863 0.99061265 0.99307958 0.98864758]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.996358,0.994796,0.991412,0.995054,0.993184
1,Support Vector Machine,0.955975,0.941393,0.909783,0.925506,0.917525
2,Logistic Regression,0.956214,0.941245,0.908796,0.927219,0.917849
3,GradientBoostingClassifier,0.98583,0.981017,0.970585,0.976968,0.973632
4,Random Forest2,0.945288,0.902295,0.809102,0.985101,0.888445
5,Random Forest4,0.949807,0.921093,0.858851,0.949992,0.902094
6,Random Forest6,0.96948,0.954,0.920443,0.964822,0.942037
7,KNN,0.995481,0.993947,0.990623,0.992585,0.991602


In [18]:
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)
results = []
results.append({
    "name": "Balanced Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.83872 [0.83853333 0.84186667 0.83866667 0.83573333 0.8388    ]
Balanced Accuracy 0.8601917922084679 [0.8624151  0.85481474 0.86903726 0.85029971 0.86439214]
Recall 0.8870553248000904 [0.89228723 0.87101064 0.90703851 0.8685259  0.89641434]
Precision 0.3725063993549553 [0.37257079 0.37557339 0.37465716 0.3659765  0.37375415]
F1 0.5246571583280268 [0.52565609 0.52483974 0.5302795  0.51496063 0.52754982]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Balanced Random Forest,0.83872,0.860192,0.887055,0.372506,0.524657


In [19]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.87104 [0.86986667 0.8744     0.86933333 0.87293333 0.86866667]
Balanced Accuracy 0.8565518279535315 [0.85501417 0.85812424 0.85953621 0.84796932 0.8621152 ]
Recall 0.8384260829024328 [0.83643617 0.83776596 0.84727756 0.81673307 0.85391766]
Precision 0.4274094333502165 [0.42442645 0.43448276 0.42448436 0.43006993 0.42358366]
F1 0.5661281150945705 [0.56311549 0.57220708 0.56560284 0.5634448  0.56627037]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Balanced Random Forest,0.83872,0.860192,0.887055,0.372506,0.524657
1,Easy Ensemble,0.87104,0.856552,0.838426,0.427409,0.566128


In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifi er
# clf = DecisionTreeClassifier(random_state=0)
# clf.fit(X_resampled_s, y_resampled_s)
# clf_pred = clf.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, clf_pred))
# print(confusion_matrix(y_test, clf_pred))
# print(classification_report_imbalanced(y_test, clf_pred))

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifier
# clf1 = DecisionTreeClassifier(random_state=0)
# clf1.fit(X_train_scaled,y_train)
# clf_pred1 = clf1.predict(X_test_scaled)
# print(accuracy_score(y_test, clf_pred1))
# print(balanced_accuracy_score(y_test, clf_pred1))
# print(confusion_matrix(y_test, clf_pred1))
# print(classification_report_imbalanced(y_test, clf_pred1))

In [None]:
# from sklearn import tree
# import graphviz
# target = list(y.astype(str).unique())
# feature_names = list(X.astype(str).columns)
# dot_data = tree.export_graphviz(clf,
#                                 out_file=None, 
#                       feature_names=feature_names,  
#                       class_names=target,  
#                       filled=True, rounded=True,  
#                       special_characters=True)  
# graph = graphviz.Source(dot_data)  

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# brfc.fit(X_train_scaled,y_train)
# brfc_pred = brfc.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, brfc_pred))
# print(confusion_matrix(y_test, brfc_pred))
# print(classification_report_imbalanced(y_test, brfc_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier(max_depth=2, random_state=0)
# rfc.fit(X_resampled_s, y_resampled_s)
# rfc_pred = rfc.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred))
# print(confusion_matrix(y_test, rfc_pred))
# print(classification_report_imbalanced(y_test, rfc_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc4 = RandomForestClassifier(max_depth=4, random_state=0)
# rfc4.fit(X_resampled_s, y_resampled_s)
# rfc_pred4 = rfc4.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred4))
# print(confusion_matrix(y_test, rfc_pred4))
# print(classification_report_imbalanced(y_test, rfc_pred4))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc6 = RandomForestClassifier(max_depth=6, random_state=0)
# rfc6.fit(X_resampled_s, y_resampled_s)
# rfc_pred6 = rfc6.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred6))
# print(confusion_matrix(y_test, rfc_pred6))
# print(classification_report_imbalanced(y_test, rfc_pred6))

In [None]:
# # Figure out the best scale_pos_weight for XGBoost classifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RepeatedStratifiedKFold
# from xgboost import XGBClassifier
# xgbc = XGBClassifier()
# # define grid
# weights = [1, 10, 25, 50, 75, 99, 100, 1000]
# param_grid = dict(scale_pos_weight=weights)
# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define grid search
# grid = GridSearchCV(estimator=xgbc, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# # execute the grid search
# grid_result = grid.fit(X_resampled_s, y_resampled_s)
# # report the best configuration
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# # report all configurations
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# # Train on XGBoost classifier
# XGB = XGBClassifier(scale_pos_weight=1)
# XGB.fit(X_resampled_s, y_resampled_s)
# XGB_pred = XGB.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, XGB_pred))
# print(confusion_matrix(y_test, XGB_pred))
# print(classification_report_imbalanced(y_test, XGB_pred))

In [None]:
# # Train on the EasyEnsembleClassifier
# from imblearn.ensemble import EasyEnsembleClassifier
# eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

# eec.fit(X_train_scaled,y_train)

# # Calculated the balanced accuracy score

# y_pred_eec = eec.predict(X_test_scaled)

# print(f"Use 'Death' as target, balanced_accuracy_score is: {balanced_accuracy_score(y_test, y_pred_eec)}")
# print(classification_report_imbalanced(y_test, y_pred_eec))
# print(confusion_matrix(y_test,y_pred_eec))

In [None]:
# pd.DataFrame(results).sort_values("Balanced_Accuracy", ascending=False)

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# ml = {
#     "Support Vector Machines": SVC(kernel='linear'),
#     "LogisticRegression": LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1),
#     "Random Forest_2": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2),
#     "Random Forest_4": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4),
#     "Random Forest_6": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6),
#     "Decision Tree": DecisionTreeClassifier(random_state=42),
#     "GradientBoostingClassifier":  GradientBoostingClassifier(n_estimators=20,
#                                         learning_rate=1,
#                                         max_features=5,
#                                         max_depth=3,
#                                         random_state=0),
#     "KNN": KNeighborsClassifier(n_neighbors=3)
# }

# results = []
# for x in ml:
#     model = ml[x]
#     model.fit(X_resampled_s, y_resampled_s)
#     y_pred_s = model.predict(X_test_scaled)
#     accuracy = model.score(X_test_scaled, y_test)
#     balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
#     #calculating precision and reall
#     precision = precision_score(y_test, y_pred_s)
#     recall = recall_score(y_test, y_pred_s)
#     print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
#     results.append({
#     "name": x,
#     "Accuracy":accuracy,
#     "Balanced_Accuracy":balanced_accuracy,
#     "Precision": precision,
#     "Recall": recall
# })
# pd.DataFrame(results).sort_values("Accuracy", ascending=False)

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# BE = {
#     "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
#     "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)
# }

# results = []
# for x in BE:
#     model2 = BE[x]
#     model2.fit(X_train_scaled, y_train)
#     y_pred_s2 = model2.predict(X_test_scaled)
#     accuracy = model2.score(X_test_scaled, y_test)
#     balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s2)
#     #calculating precision and reall
#     precision = precision_score(y_test, y_pred_s2)
#     recall = recall_score(y_test, y_pred_s2)
#     print(f'model{x}: {confusion_matrix(y_test,y_pred_s2)}')
#     results.append({
#     "name": x,
#     "Accuracy":accuracy,
#     "Balanced_Accuracy":balanced_accuracy,
#     "Precision": precision,
#     "Recall": recall
# })
# pd.DataFrame(results).sort_values("Accuracy", ascending=False)

In [None]:
# #Plotting Precision-Recall Curve
# disp = plot_precision_recall_curve(model, X_test_scaled, y_test)

In [None]:
# # # Create a method that creates a new Sequential model with hyperparameter options
# def create_model(hp):
#     nn_model = tf.keras.models.Sequential()

#     # Allow kerastuner to decide which activation function to use in hidden layers
#     activation = hp.Choice('activation',['relu','tanh','sigmoid'])
#     first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    
#     # Allow kerastuner to decide number of neurons in first layer
#     nn_model.add(tf.keras.layers.Dense(units=first_units, activation=activation, input_dim=len(X_train_scaled[0])))

#     # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
#     for i in range(hp.Int('num_layers', 1, 6)):
#         next_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

#         nn_model.add(tf.keras.layers.Dense(units=next_units, activation=activation))
    
#     nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#     # Compile the model
#     nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["Accuracy"])

#     return nn_model

In [None]:
# import keras_tuner as kt

# tuner = kt.Hyperband(
#     create_model, # function name
#     objective="val_accuracy",
#     max_epochs=20,
#     hyperband_iterations=2)

In [None]:
# # Run the kerastuner search for best hyperparameters
# tuner.search(X_resampled_s, y_resampled_s,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# # Get best model hyperparameters
# best_hyper = tuner.get_best_hyperparameters(1)[0]
# best_hyper.values

In [None]:
# best_model = tuner.get_best_models(1)[0]
# fit_model = best_model.fit(X_resampled_s, y_resampled_s, epochs=20)

In [None]:
# # Evaluate best model against full test data
# model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# import numpy as np
# from sklearn.feature_selection import SelectKBest, chi2, f_classif

# # Perform feature selection
# selector = SelectKBest(f_classif, k=10)
# selector.fit(X, y)

# # Get the raw p-values for each feature, and transform from p-values into scores
# scores = -np.log10(selector.pvalues_)

# # Plot the scores.
# plt.bar(range(len(keep_col)), scores)
# plt.xticks(range(len(keep_col)), X, rotation='vertical')
# plt.show()