In [1]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
file = '../clean_ml_data_0625.csv'
clean_df = pd.read_csv(file)

In [3]:
clean_df = clean_df.sample(50000)

In [4]:
clean_df.head()

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
1410996,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,5
60758,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4
987351,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
425418,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3
140556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


In [5]:
Counter(clean_df.death)

Counter({0: 44809, 1: 5191})

In [6]:
y= clean_df['death']
X = clean_df.drop(columns = ['ICU','intubation','death'])

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =42, stratify=y)

In [8]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled_s, y_resampled_s = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train_scaled,y_train)
Counter(y_resampled_s)

Counter({0: 33607, 1: 33607})

In [None]:
# # Use SMOTEENN resampling method
# from imblearn.combine import SMOTEENN 
# smote_enn = SMOTEENN(random_state=0)
# X_resampled_s, y_resampled_s = smote_enn.fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [None]:
keep_col = ['gender',
         'pneumonia', 'new_age',
       'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosup', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'another_complication', 'closed_contact']

In [10]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

results = []
clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(clf,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(clf, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Decision Tree",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8846669792939386 [0.86000149 0.89407126 0.8887897  0.8894592  0.89101324]
Balanced Accuracy 0.8846663623335334 [0.85999983 0.89407248 0.88878854 0.88945773 0.89101324]
Recall 0.8938903810174391 [0.83767297 0.91043    0.90449271 0.9092532  0.90760304]
Precision 0.8776894859509664 [0.87681047 0.88157326 0.87696524 0.87464224 0.87845622]
F1 0.885496871158281 [0.85679501 0.89576929 0.89051629 0.89161196 0.8927918 ]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497


In [11]:
svm = SVC(kernel='linear')

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(svm,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(svm, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Support Vector Machine",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8607433062584127 [0.85873689 0.86439039 0.85672841 0.86253069 0.86133016]
Balanced Accuracy 0.8607432281709186 [0.85873591 0.86438979 0.85672927 0.86253102 0.86133016]
Recall 0.8514297511284225 [0.8455587  0.85627139 0.84513538 0.85807795 0.85210534]
Precision 0.8675917901250987 [0.8684291  0.87038717 0.86521474 0.86580606 0.86812187]
F1 0.8594267638134794 [0.85684131 0.86327158 0.85505719 0.86192469 0.86003904]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427


In [12]:
lr = LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(lr,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)
precision = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)
f1 = cross_val_score(lr, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Logistic Regression",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.865072733543867 [0.86446478 0.86788663 0.86424161 0.86498549 0.86378515]
Balanced Accuracy 0.8650726289163432 [0.86446438 0.86788645 0.86424185 0.86498531 0.86378515]
Recall 0.8629748994666985 [0.85909835 0.86549621 0.86105326 0.86745016 0.86177652]
Precision 0.8666199352801115 [0.86840126 0.86963672 0.8665968  0.86321244 0.86525246]
F1 0.8647879035205351 [0.86372476 0.86756152 0.86381613 0.86532611 0.863511  ]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)
balanced_scores = cross_val_score(xgb,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)
recall = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(xgb, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "GradientBoostingClassifier",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.878998415756928 [0.86386967 0.88975675 0.87718515 0.8856654  0.8785151 ]
Balanced Accuracy 0.8789978220200989 [0.86386922 0.88975743 0.8771833  0.88566407 0.8785151 ]
Recall 0.8912422046544997 [0.85775926 0.89882458 0.90211247 0.90360012 0.8939146 ]
Precision 0.869994589753358 [0.86835367 0.88279994 0.85928865 0.87232515 0.86720554]
F1 0.8803977829850957 [0.86302395 0.89074019 0.88017998 0.88768725 0.88035754]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788
3,GradientBoostingClassifier,0.878998,0.878998,0.891242,0.869995,0.880398


In [14]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf2,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf2, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest2",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8486625056891002 [0.84318976 0.85367849 0.84512386 0.85100052 0.85031989]
Balanced Accuracy 0.8486623832781575 [0.84318832 0.85367717 0.84512511 0.85100142 0.85031989]
Recall 0.832267007524498 [0.82383574 0.83588752 0.8283249  0.83888724 0.83439964]
Precision 0.8604829111159621 [0.85698808 0.86670781 0.85714286 0.85973472 0.86184109]
F1 0.8461345260042975 [0.84008496 0.85101871 0.84248752 0.84918304 0.8478984 ]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788
3,GradientBoostingClassifier,0.878998,0.878998,0.891242,0.869995,0.880398
4,Random Forest2,0.848663,0.848662,0.832267,0.860483,0.846135


In [15]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf4 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf4,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf4, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest4",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8663522674049389 [0.85977832 0.87063899 0.86535744 0.86833296 0.86765362]
Balanced Accuracy 0.8663522468909036 [0.85977864 0.87063945 0.86535714 0.86833239 0.86765362]
Recall 0.8721694550614515 [0.86400833 0.87680405 0.86938411 0.87607855 0.87457224]
Precision 0.8621345569988564 [0.8567424  0.86610817 0.86245573 0.86273074 0.86263575]
F1 0.8671211543961086 [0.86036003 0.87142329 0.86590606 0.86935341 0.86856298]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788
3,GradientBoostingClassifier,0.878998,0.878998,0.891242,0.869995,0.880398
4,Random Forest2,0.848663,0.848662,0.832267,0.860483,0.846135
5,Random Forest4,0.866352,0.866352,0.872169,0.862135,0.867121


In [16]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
rf6 = RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6)

sk_folds = StratifiedKFold(n_splits = 5)
sc = "balanced_accuracy"

scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(rf6,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring='balanced_accuracy')
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf6, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Random Forest6",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8713958486715327 [0.8624563  0.8787473  0.87368891 0.87011828 0.87196846]
Balanced Accuracy 0.8713955016983498 [0.86245993 0.8787487  0.87368712 0.87011331 0.87196846]
Recall 0.9111788507124242 [0.91132272 0.89748549 0.89779827 0.93692353 0.91236423]
Precision 0.8444816409929065 [0.83017078 0.86505091 0.85651433 0.82650919 0.844163  ]
F1 0.8763390980932664 [0.86885595 0.88096977 0.87667054 0.87825966 0.87693958]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788
3,GradientBoostingClassifier,0.878998,0.878998,0.891242,0.869995,0.880398
4,Random Forest2,0.848663,0.848662,0.832267,0.860483,0.846135
5,Random Forest4,0.866352,0.866352,0.872169,0.862135,0.867121
6,Random Forest6,0.871396,0.871396,0.911179,0.844482,0.876339


In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=3)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(knn,  X_resampled_s, y_resampled_s, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(knn, X_resampled_s, y_resampled_s, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "KNN",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.7201926813763078 [0.59034442 0.75318009 0.7493863  0.75697389 0.75107871]
Balanced Accuracy 0.7201897638973104 [0.59031526 0.75316565 0.7494012  0.756988   0.75107871]
Recall 0.48483510387072526 [0.19833358 0.5589942  0.54909253 0.56724189 0.55051332]
Precision 0.9162931559012911 [0.91804408 0.91388956 0.91610822 0.91416926 0.91925466]
F1 0.6190452276223384 [0.32619601 0.69368538 0.6866338  0.70008262 0.68862833]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Decision Tree,0.884667,0.884666,0.89389,0.877689,0.885497
1,Support Vector Machine,0.860743,0.860743,0.85143,0.867592,0.859427
2,Logistic Regression,0.865073,0.865073,0.862975,0.86662,0.864788
3,GradientBoostingClassifier,0.878998,0.878998,0.891242,0.869995,0.880398
4,Random Forest2,0.848663,0.848662,0.832267,0.860483,0.846135
5,Random Forest4,0.866352,0.866352,0.872169,0.862135,0.867121
6,Random Forest6,0.871396,0.871396,0.911179,0.844482,0.876339
7,KNN,0.720193,0.72019,0.484835,0.916293,0.619045


In [18]:
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(brfc,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(brfc, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)
results = []
results.append({
    "name": "Balanced Random Forest",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.84536 [0.84013333 0.85226667 0.8488     0.8412     0.8444    ]
Balanced Accuracy 0.8589913949787583 [0.85739589 0.87155249 0.85832462 0.85635399 0.85132999]
Recall 0.8761938547541341 [0.87917738 0.89588689 0.8703466  0.87548139 0.86007702]
Precision 0.3909257664172755 [0.3823365  0.40429234 0.3962595  0.38400901 0.38773148]
F1 0.5406017938262109 [0.5329178  0.55715428 0.54457831 0.53385519 0.53450339]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Balanced Random Forest,0.84536,0.858991,0.876194,0.390926,0.540602


In [19]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds)
print('Accuracy', np.mean(scores), scores)

balanced_scores = cross_val_score(eec,  X_train_scaled, y_train, cv = sk_folds, scoring=sc)
print('Balanced Accuracy', np.mean(balanced_scores), balanced_scores)

recall = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(eec, X_train_scaled, y_train, cv = sk_folds, scoring='f1')
print('F1', np.mean(f1), f1)

results.append({
    "name": "Easy Ensemble",
    "Accuracy":np.mean(scores),
    "Balanced_Accuracy":np.mean(balanced_scores),
    "Recall":np.mean(recall),
    "Precision":np.mean(precision),
    "F1": np.mean(f1)
})
pd.DataFrame(results)

Accuracy 0.8714933333333332 [0.87586667 0.8704     0.872      0.87026667 0.86893333]
Balanced Accuracy 0.8569924520181175 [0.8545988  0.87200758 0.8514082  0.86008781 0.84685987]
Recall 0.8386910910104908 [0.8277635  0.87403599 0.8254172  0.84724005 0.81899872]
Precision 0.4379628336667857 [0.44691187 0.43758044 0.43830948 0.43593131 0.43108108]
F1 0.5753444541433108 [0.58044164 0.58319039 0.57257346 0.57566507 0.5648517 ]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Recall,Precision,F1
0,Balanced Random Forest,0.84536,0.858991,0.876194,0.390926,0.540602
1,Easy Ensemble,0.871493,0.856992,0.838691,0.437963,0.575344


In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# ml = {
#     "Support Vector Machines": SVC(kernel='linear'),
#     "LogisticRegression": LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1),
#     "Random Forest_2": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2),
#     "Random Forest_4": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4),
#     "Random Forest_6": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6),
#     "Decision Tree": DecisionTreeClassifier(random_state=42),
#     "GradientBoostingClassifier":  GradientBoostingClassifier(n_estimators=20,
#                                         learning_rate=1,
#                                         max_features=5,
#                                         max_depth=3,
#                                         random_state=0),
#     "KNN": KNeighborsClassifier(n_neighbors=3)
# }

# test_results = []
# for x in ml:
#     model = ml[x]
#     model.fit(X_resampled_s, y_resampled_s)
#     y_pred_s = model.predict(X_test_scaled)
#     accuracy = model.score(X_test_scaled, y_test)
#     balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s)
#     #calculating precision and reall
#     precision = precision_score(y_test, y_pred_s)
#     recall = recall_score(y_test, y_pred_s)
#     print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
#     test_results.append({
#     "name": x,
#     "Accuracy":accuracy,
#     "Balanced_Accuracy":balanced_accuracy,
#     "Precision": precision,
#     "Recall": recall
# })
# pd.DataFrame(test_results).sort_values("Accuracy", ascending=False)

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# BE = {
#     "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
#     "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)
# }

# for x in BE:
#     model2 = BE[x]
#     model2.fit(X_train_scaled, y_train)
#     y_pred_s2 = model2.predict(X_test_scaled)
#     accuracy = model2.score(X_test_scaled, y_test)
#     balanced_accuracy = balanced_accuracy_score(y_test,y_pred_s2)
#     #calculating precision and reall
#     precision = precision_score(y_test, y_pred_s2)
#     recall = recall_score(y_test, y_pred_s2)
#     print(f'model{x}: {confusion_matrix(y_test,y_pred_s2)}')
#     test_results.append({
#     "name": x,
#     "Accuracy":accuracy,
#     "Balanced_Accuracy":balanced_accuracy,
#     "Precision": precision,
#     "Recall": recall
# })
# pd.DataFrame(test_results).sort_values("Accuracy", ascending=False)

In [None]:
# pd.DataFrame(results).sort_values("Balanced_Accuracy", ascending=False)

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifi er
# clf = DecisionTreeClassifier(random_state=0)
# clf.fit(X_resampled_s, y_resampled_s)
# clf_pred = clf.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, clf_pred))
# print(confusion_matrix(y_test, clf_pred))
# print(classification_report_imbalanced(y_test, clf_pred))

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifier
# clf1 = DecisionTreeClassifier(random_state=0)
# clf1.fit(X_train_scaled,y_train)
# clf_pred1 = clf1.predict(X_test_scaled)
# print(accuracy_score(y_test, clf_pred1))
# print(balanced_accuracy_score(y_test, clf_pred1))
# print(confusion_matrix(y_test, clf_pred1))
# print(classification_report_imbalanced(y_test, clf_pred1))

In [None]:
# from sklearn import tree
# import graphviz
# target = list(y.astype(str).unique())
# feature_names = list(X.astype(str).columns)
# dot_data = tree.export_graphviz(clf,
#                                 out_file=None, 
#                       feature_names=feature_names,  
#                       class_names=target,  
#                       filled=True, rounded=True,  
#                       special_characters=True)  
# graph = graphviz.Source(dot_data)  

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
# brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# brfc.fit(X_train_scaled,y_train)
# brfc_pred = brfc.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, brfc_pred))
# print(confusion_matrix(y_test, brfc_pred))
# print(classification_report_imbalanced(y_test, brfc_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier(max_depth=2, random_state=0)
# rfc.fit(X_resampled_s, y_resampled_s)
# rfc_pred = rfc.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred))
# print(confusion_matrix(y_test, rfc_pred))
# print(classification_report_imbalanced(y_test, rfc_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc4 = RandomForestClassifier(max_depth=4, random_state=0)
# rfc4.fit(X_resampled_s, y_resampled_s)
# rfc_pred4 = rfc4.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred4))
# print(confusion_matrix(y_test, rfc_pred4))
# print(classification_report_imbalanced(y_test, rfc_pred4))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rfc6 = RandomForestClassifier(max_depth=6, random_state=0)
# rfc6.fit(X_resampled_s, y_resampled_s)
# rfc_pred6 = rfc6.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, rfc_pred6))
# print(confusion_matrix(y_test, rfc_pred6))
# print(classification_report_imbalanced(y_test, rfc_pred6))

In [None]:
# # Figure out the best scale_pos_weight for XGBoost classifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RepeatedStratifiedKFold
# from xgboost import XGBClassifier
# xgbc = XGBClassifier()
# # define grid
# weights = [1, 10, 25, 50, 75, 99, 100, 1000]
# param_grid = dict(scale_pos_weight=weights)
# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define grid search
# grid = GridSearchCV(estimator=xgbc, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# # execute the grid search
# grid_result = grid.fit(X_resampled_s, y_resampled_s)
# # report the best configuration
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# # report all configurations
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# # Train on XGBoost classifier
# XGB = XGBClassifier(scale_pos_weight=1)
# XGB.fit(X_resampled_s, y_resampled_s)
# XGB_pred = XGB.predict(X_test_scaled)
# print(balanced_accuracy_score(y_test, XGB_pred))
# print(confusion_matrix(y_test, XGB_pred))
# print(classification_report_imbalanced(y_test, XGB_pred))

In [None]:
# # Train on the EasyEnsembleClassifier
# from imblearn.ensemble import EasyEnsembleClassifier
# eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)

# eec.fit(X_train_scaled,y_train)

# # Calculated the balanced accuracy score

# y_pred_eec = eec.predict(X_test_scaled)

# print(f"Use 'Death' as target, balanced_accuracy_score is: {balanced_accuracy_score(y_test, y_pred_eec)}")
# print(classification_report_imbalanced(y_test, y_pred_eec))
# print(confusion_matrix(y_test,y_pred_eec))

In [None]:
# #Plotting Precision-Recall Curve
# disp = plot_precision_recall_curve(model, X_test_scaled, y_test)

In [None]:
# # # Create a method that creates a new Sequential model with hyperparameter options
# def create_model(hp):
#     nn_model = tf.keras.models.Sequential()

#     # Allow kerastuner to decide which activation function to use in hidden layers
#     activation = hp.Choice('activation',['relu','tanh','sigmoid'])
#     first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    
#     # Allow kerastuner to decide number of neurons in first layer
#     nn_model.add(tf.keras.layers.Dense(units=first_units, activation=activation, input_dim=len(X_train_scaled[0])))

#     # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
#     for i in range(hp.Int('num_layers', 1, 6)):
#         next_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

#         nn_model.add(tf.keras.layers.Dense(units=next_units, activation=activation))
    
#     nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#     # Compile the model
#     nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["Accuracy"])

#     return nn_model

In [None]:
# import keras_tuner as kt

# tuner = kt.Hyperband(
#     create_model, # function name
#     objective="val_accuracy",
#     max_epochs=20,
#     hyperband_iterations=2)

In [None]:
# # Run the kerastuner search for best hyperparameters
# tuner.search(X_resampled_s, y_resampled_s,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# # Get best model hyperparameters
# best_hyper = tuner.get_best_hyperparameters(1)[0]
# best_hyper.values

In [None]:
# best_model = tuner.get_best_models(1)[0]
# fit_model = best_model.fit(X_resampled_s, y_resampled_s, epochs=20)

In [None]:
# # Evaluate best model against full test data
# model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# import numpy as np
# from sklearn.feature_selection import SelectKBest, chi2, f_classif

# # Perform feature selection
# selector = SelectKBest(f_classif, k=10)
# selector.fit(X, y)

# # Get the raw p-values for each feature, and transform from p-values into scores
# scores = -np.log10(selector.pvalues_)

# # Plot the scores.
# plt.bar(range(len(keep_col)), scores)
# plt.xticks(range(len(keep_col)), X, rotation='vertical')
# plt.show()