In [1]:
# Import libraries

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.svm import SVC
import sklearn as skl
import tensorflow as tf

In [2]:
# Location of the cleaned dataset, relative to this notebook
file = '../clean_ml_data_0625.csv'

# Read the CSV into the DataFrame used by the rest of the analysis
clean_df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# Work on a subsample of at most 50,000 rows (presumably to keep training time
# manageable -- confirm). The fixed random_state makes the draw repeatable, so a
# Restart & Run All selects the same rows and reproduces the same metrics.
# min() keeps the cell from raising if the file holds fewer rows than requested.
clean_df = clean_df.sample(n=min(50000, len(clean_df)), random_state=42)

In [4]:
# Preview the first five sampled rows to sanity-check the columns
clean_df.head(5)

Unnamed: 0,gender,pneumonia,pregnant,diabetes,copd,asthma,immunosup,hypertension,cardiovascular,obesity,renal_chronic,tobacco,closed_contact,another_complication,death,intubation,ICU,new_age
837839,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,5
23188,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2
751949,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
306780,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,3
455168,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,2


In [5]:
# Tally the target classes to see how imbalanced `death` is
Counter(clean_df['death'])

Counter({1: 5281, 0: 44719})

In [6]:
# Target vector: the `death` column
y = clean_df.death

# Feature matrix: every column except the target plus `ICU` and `intubation`
# (presumably dropped because they are outcomes as well -- confirm)
X = clean_df.drop(['ICU', 'intubation', 'death'], axis=1)

In [7]:
# Hold out a test set (train_test_split's default fraction). Stratifying on y
# keeps the death/survival ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [8]:
# Fit the scaler on the training rows only, so no statistic from the test
# partition influences the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Standardise both partitions with the fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [9]:
# Balance the training classes with SMOTE oversampling.
# Only the training partition is resampled; the test set is left untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled_s, y_resampled_s = smote.fit_resample(X_train_scaled, y_train)

# Show the class counts after resampling
Counter(y_resampled_s)

Counter({0: 33539, 1: 33539})

In [None]:
# # Use SMOTEENN resampling method
# from imblearn.combine import SMOTEENN 
# smote_enn = SMOTEENN(random_state=0)
# X_resampled_s, y_resampled_s = smote_enn.fit_resample(X_train_scaled,y_train)
# Counter(y_resampled_s)

In [None]:
# keep_col = ['gender',
#          'pneumonia', 'new_age',
#        'pregnant', 'diabetes', 'copd',
#        'asthma', 'immunosup', 'hypertension',
#        'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'another_complication', 'closed_contact']

In [10]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, keyed by the label shown in the results table.
ml = {
    "Support Vector Machines": SVC(kernel='linear'),
    "LogisticRegression": LogisticRegression(max_iter=1000,solver='lbfgs', random_state=1),
    "Random Forest_2": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 2),
    "Random Forest_4": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 4),
    "Random Forest_6": RandomForestClassifier(n_estimators = 13, random_state = 42, max_depth = 6),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "GradientBoostingClassifier":  GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=1,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0),
    "KNN": KNeighborsClassifier(n_neighbors=3)
}

# Fit each candidate on the SMOTE-balanced training data and evaluate it on
# the untouched test set.
results = []
for x, model in ml.items():
    model.fit(X_resampled_s, y_resampled_s)
    y_pred_s = model.predict(X_test_scaled)
    # Score the predictions we already have: model.score() would run predict()
    # over the whole test set a second time (costly for SVC and KNN) only to
    # return this same accuracy.
    accuracy = accuracy_score(y_test, y_pred_s)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred_s)
    # Precision and recall for the positive class (death == 1)
    precision = precision_score(y_test, y_pred_s)
    recall = recall_score(y_test, y_pred_s)
    # Confusion matrix: rows are true classes, columns are predicted classes
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s)}')
    results.append({
        "name": x,
        "Accuracy": accuracy,
        "Balanced_Accuracy": balanced_accuracy,
        "Precision": precision,
        "Recall": recall,
    })

# Summary table, highest plain accuracy first
pd.DataFrame(results).sort_values("Accuracy", ascending=False)

modelSupport Vector Machines: [[9653 1527]
 [ 182 1138]]
modelLogisticRegression: [[9706 1474]
 [ 199 1121]]
modelRandom Forest_2: [[9370 1810]
 [ 178 1142]]
modelRandom Forest_4: [[9623 1557]
 [ 181 1139]]
modelRandom Forest_6: [[9429 1751]
 [ 158 1162]]
modelDecision Tree: [[9647 1533]
 [ 275 1045]]
modelGradientBoostingClassifier: [[9715 1465]
 [ 214 1106]]
modelKNN: [[10482   698]
 [  657   663]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
7,KNN,0.8916,0.71992,0.487142,0.502273
1,LogisticRegression,0.86616,0.8587,0.431985,0.849242
6,GradientBoostingClassifier,0.86568,0.853421,0.430183,0.837879
0,Support Vector Machines,0.86328,0.862769,0.427017,0.862121
3,Random Forest_4,0.86096,0.861806,0.422478,0.862879
5,Decision Tree,0.85536,0.827273,0.405353,0.791667
4,Random Forest_6,0.84728,0.861842,0.398901,0.880303
2,Random Forest_2,0.84096,0.851628,0.386856,0.865152


In [11]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Ensemble classifiers from imbalanced-learn, keyed by the label shown in the
# results table.
BE = {
    "Balanced Random Forest": BalancedRandomForestClassifier(n_estimators=100, random_state=42),
    "Easy Ensemble": EasyEnsembleClassifier(random_state=42,n_estimators = 100)
}

# These models are fit on the scaled training data as-is (not the SMOTE
# output) and evaluated on the same test set as the other classifiers.
results = []
for x, model2 in BE.items():
    model2.fit(X_train_scaled, y_train)
    y_pred_s2 = model2.predict(X_test_scaled)
    # Score the predictions we already have: model2.score() would run
    # predict() through every ensemble member a second time only to return
    # this same accuracy.
    accuracy = accuracy_score(y_test, y_pred_s2)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred_s2)
    # Precision and recall for the positive class (death == 1)
    precision = precision_score(y_test, y_pred_s2)
    recall = recall_score(y_test, y_pred_s2)
    # Confusion matrix: rows are true classes, columns are predicted classes
    print(f'model{x}: {confusion_matrix(y_test,y_pred_s2)}')
    results.append({
        "name": x,
        "Accuracy": accuracy,
        "Balanced_Accuracy": balanced_accuracy,
        "Precision": precision,
        "Recall": recall,
    })

# Summary table, highest plain accuracy first
pd.DataFrame(results).sort_values("Accuracy", ascending=False)

modelBalanced Random Forest: [[9323 1857]
 [ 148 1172]]
modelEasy Ensemble: [[9818 1362]
 [ 212 1108]]


Unnamed: 0,name,Accuracy,Balanced_Accuracy,Precision,Recall
1,Easy Ensemble,0.87408,0.858785,0.448583,0.839394
0,Balanced Random Forest,0.8396,0.860889,0.386926,0.887879


In [12]:
# Define the model: a feed-forward network with two hidden ReLU layers and a
# single sigmoid output unit for the binary `death` target.

# Seed TensorFlow's global RNG before any layer is created so the initial
# weights are repeatable from run to run.
tf.random.set_seed(42)

# One input per feature column of the training matrix
number_of_features = X_resampled_s.shape[1]
first_layer_unit = 80
second_layer_unit = 30
first_activation = 'relu'
second_activation = 'relu'

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(
    tf.keras.layers.Dense(units = first_layer_unit,input_dim = number_of_features,activation = first_activation))

# Second hidden layer
nn.add(
    tf.keras.layers.Dense(units = second_layer_unit,activation = second_activation))

# Output layer: sigmoid yields a value in (0, 1) for the positive class
nn.add(
    tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                1280      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 3,741
Trainable params: 3,741
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Compile the model.
# Accuracy is requested as an explicit BinaryAccuracy instance rather than the
# string "Accuracy": depending on the Keras release, the capitalised string can
# resolve to the exact-match `Accuracy` metric (y_pred == y_true), which is
# meaningless for sigmoid probabilities. name="Accuracy" keeps the reported
# metric name unchanged.
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="Accuracy"),
        "TruePositives",
        "FalseNegatives",
    ],
)

In [14]:
# Fit the network on the SMOTE-balanced training data for 50 epochs and keep
# the returned training history.
fit_model = nn.fit(
    x=X_resampled_s,
    y=y_resampled_s,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# Score the trained network on the held-out test set.
# evaluate() returns the loss followed by the compiled metrics, in order;
# recall can be derived from them as tp / (tp + fn).
results = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(results)

391/391 - 2s - loss: 0.3616 - Accuracy: 0.8435 - true_positives: 1124.0000 - false_negatives: 196.0000 - 2s/epoch - 5ms/step
[0.36164140701293945, 0.8435199856758118, 1124.0, 196.0]


In [None]:
# #Plotting Precision-Recall Curve
# disp = plot_precision_recall_curve(model, X_test_scaled, y_test)

In [None]:
# # # Create a method that creates a new Sequential model with hyperparameter options
# def create_model(hp):
#     nn_model = tf.keras.models.Sequential()

#     # Allow kerastuner to decide which activation function to use in hidden layers
#     activation = hp.Choice('activation',['relu','tanh','sigmoid'])
#     first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    
#     # Allow kerastuner to decide number of neurons in first layer
#     nn_model.add(tf.keras.layers.Dense(units=first_units, activation=activation, input_dim=len(X_train_scaled[0])))

#     # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
#     for i in range(hp.Int('num_layers', 1, 6)):
#         next_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

#         nn_model.add(tf.keras.layers.Dense(units=next_units, activation=activation))
    
#     nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#     # Compile the model
#     nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["Accuracy"])

#     return nn_model

In [None]:
# import keras_tuner as kt

# tuner = kt.Hyperband(
#     create_model, # function name
#     objective="val_accuracy",
#     max_epochs=20,
#     hyperband_iterations=2)

In [None]:
# # Run the kerastuner search for best hyperparameters
# tuner.search(X_resampled_s, y_resampled_s,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# # Get best model hyperparameters
# best_hyper = tuner.get_best_hyperparameters(1)[0]
# best_hyper.values

In [None]:
# best_model = tuner.get_best_models(1)[0]
# fit_model = best_model.fit(X_resampled_s, y_resampled_s, epochs=20)

In [None]:
# # Evaluate best model against full test data
# model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# import numpy as np
# from sklearn.feature_selection import SelectKBest, chi2, f_classif

# # Perform feature selection
# selector = SelectKBest(f_classif, k=10)
# selector.fit(X, y)

# # Get the raw p-values for each feature, and transform from p-values into scores
# scores = -np.log10(selector.pvalues_)

# # Plot the scores.
# plt.bar(range(len(keep_col)), scores)
# plt.xticks(range(len(keep_col)), X, rotation='vertical')
# plt.show()