In [None]:
# import libraries
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [None]:
# import files (new)
folder = "../../data/"

# feature tables: column dtypes are left to pandas
x_train, x_test, x_valid = (
    pd.read_csv(f"{folder}x_{split}_new.csv") for split in ("train", "test", "valid")
)

# label tables: every column is read as int
y_train, y_test, y_valid = (
    pd.read_csv(f"{folder}y_{split}_new.csv", dtype=int) for split in ("train", "test", "valid")
)

In [None]:
# oversample
# Fixed seed so the synthetic samples (and every score computed from them) are the same on
# each run; 7 matches the seed used for TensorFlow in the tuning cells.
# NOTE(review): only the training split is resampled, so scores measured on x_train
# (including the cross-validation scores) are presumably computed on a different class
# balance than the scores on x_valid / x_test - compare them with care.
oversample = SMOTE(random_state=7)
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [None]:
# parameters
# number of feature columns; used as the input width of the first Dense layer
input_dim = x_train.shape[1]
# scorers handed to GridSearchCV; each key becomes the suffix of the
# mean_test_<key> / std_test_<key> entries read from cv_results_
scoring = {
    label: make_scorer(metric_fn)
    for label, metric_fn in (
        ("precision", precision_score),
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
}

In [None]:
# f1_score
import keras.backend as K

def f1_metric(y_true, y_pred):
    """F1 score of the tensors passed in, built from Keras backend ops.

    y_true: ground-truth labels.
    y_pred: predicted scores; they are clipped to [0, 1] and rounded before counting.
    Returns 2 * precision * recall / (precision + recall), with K.epsilon() added to
    every denominator so an empty count does not divide by zero.

    NOTE(review): the f1 values printed by the model cells (about 0.73 on train) are lower
    than the F1 implied by the precision/recall printed next to them (about 0.91).
    Presumably Keras averages this function per batch - confirm before relying on it.
    """
    eps = K.epsilon()

    def _count_positive(values):
        # clip to [0, 1], round to 0/1, then count the ones
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count_positive(y_true * y_pred)
    actual = _count_positive(y_true)
    predicted = _count_positive(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)

# Tuning of Hyperparameters

In [None]:
# tuning number of neurons, batch_size and epochs
def create_model(neurons):
    """Build and compile a binary classifier with one hidden layer.

    neurons: width of the hidden ReLU layer (the value being tuned).
    The input width is read from the notebook-level `input_dim`.
    Returns the compiled Sequential model.
    """
    hidden = Dense(neurons, input_dim=input_dim, activation="relu")
    output = Dense(1, activation='sigmoid')
    net = Sequential([hidden, output])
    tracked = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    net.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked)
    return net

tf.random.set_seed(7)
# random_state is an estimator parameter, so it travels with every clone GridSearchCV makes
# and lets scikeras seed each fit; the global seed above is set only in this process, while
# n_jobs=-1 runs the fits in worker processes.
model = KerasClassifier(model=create_model, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16, 20, 24, 28, 30, 32]
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
# batch_size / epochs are routed to the wrapper's fit, model__neurons to create_model
param_grid = dict(batch_size=batch_size, epochs=epochs, model__neurons=neurons)
# every scorer in `scoring` is evaluated; the best candidate is chosen (and refit) by F1
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# accuracy
means_acc = grid_result.cv_results_['mean_test_accuracy']
stds_acc = grid_result.cv_results_['std_test_accuracy']
# precision
means_prec = grid_result.cv_results_['mean_test_precision']
stds_prec = grid_result.cv_results_['std_test_precision']
# recall
means_recall = grid_result.cv_results_['mean_test_recall']
stds_recall = grid_result.cv_results_['std_test_recall']
#f1
means_f1 = grid_result.cv_results_['mean_test_f1']
stds_f1 = grid_result.cv_results_['std_test_f1']
# parameters
params = grid_result.cv_results_['params']
# one line per parameter combination: mean (std) of each metric over the CV folds
for mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % 
          (mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param))

Best: 0.909220 using {'batch_size': 10, 'epochs': 100, 'model__neurons': 20}
Accuracy: 0.863638 (0.011353), Precision:  0.870449 (0.033724), Recall: 0.858548 (0.052278), F1: 0.862406 (0.015360) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 8}
Accuracy: 0.867340 (0.007387), Precision:  0.872228 (0.017929), Recall: 0.861859 (0.026894), F1: 0.866485 (0.008854) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 12}
Accuracy: 0.857125 (0.023558), Precision:  0.893789 (0.007816), Recall: 0.811269 (0.061828), F1: 0.849015 (0.031041) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 16}
Accuracy: 0.841292 (0.007423), Precision:  0.841225 (0.059865), Recall: 0.858301 (0.095223), F1: 0.842682 (0.015644) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 20}
Accuracy: 0.829551 (0.029177), Precision:  0.907711 (0.023192), Recall: 0.736992 (0.089014), F1: 0.809356 (0.042703) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 24}
Accuracy: 0.855464 (0.016227), Pre

From the results, neurons = 20, batch size = 10 and epochs = 100 give the best mean F1 score (0.909).

In [None]:
# tuning drop out rate
def create_model(dropout_rate):
    """Build and compile a one-hidden-layer classifier with dropout after the hidden layer.

    dropout_rate: rate passed to the Dropout layer (the value being tuned).
    The hidden layer is fixed at 20 ReLU units; the input width is read from the
    notebook-level `input_dim`.
    Returns the compiled Sequential model.
    """
    stack = [
        Dense(20, input_dim=input_dim, activation="relu"),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(stack)
    tracked = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    net.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked)
    return net

tf.random.set_seed(7)
# random_state is an estimator parameter, so it travels with every clone GridSearchCV makes
# and lets scikeras seed each fit; the global seed above is set only in this process, while
# n_jobs=-1 runs the fits in worker processes.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
dropout_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(model__dropout_rate=dropout_rate)
# every scorer in `scoring` is evaluated; the best candidate is chosen (and refit) by F1
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# accuracy
means_acc = grid_result.cv_results_['mean_test_accuracy']
stds_acc = grid_result.cv_results_['std_test_accuracy']
# precision
means_prec = grid_result.cv_results_['mean_test_precision']
stds_prec = grid_result.cv_results_['std_test_precision']
# recall
means_recall = grid_result.cv_results_['mean_test_recall']
stds_recall = grid_result.cv_results_['std_test_recall']
#f1
means_f1 = grid_result.cv_results_['mean_test_f1']
stds_f1 = grid_result.cv_results_['std_test_f1']
# parameters
params = grid_result.cv_results_['params']
# one line per dropout rate: mean (std) of each metric over the CV folds
for mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % 
          (mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param))

Best: 0.897809 using {'model__dropout_rate': 0.2}
Accuracy: 0.893261 (0.012040), Precision:  0.870293 (0.028084), Recall: 0.927753 (0.056264), F1: 0.896225 (0.015517) with: {'model__dropout_rate': 0.1}
Accuracy: 0.894919 (0.008145), Precision:  0.873096 (0.000466), Recall: 0.924155 (0.018809), F1: 0.897809 (0.009051) with: {'model__dropout_rate': 0.2}
Accuracy: 0.891728 (0.011269), Precision:  0.862669 (0.007881), Recall: 0.931824 (0.020734), F1: 0.895811 (0.011571) with: {'model__dropout_rate': 0.3}
Accuracy: 0.890195 (0.003712), Precision:  0.861262 (0.010240), Recall: 0.930804 (0.023635), F1: 0.894376 (0.005489) with: {'model__dropout_rate': 0.4}
Accuracy: 0.883300 (0.003616), Precision:  0.845932 (0.013713), Recall: 0.938207 (0.024732), F1: 0.889297 (0.004769) with: {'model__dropout_rate': 0.5}
Accuracy: 0.874235 (0.007086), Precision:  0.847471 (0.024650), Recall: 0.915233 (0.037199), F1: 0.879024 (0.008173) with: {'model__dropout_rate': 0.6}
Accuracy: 0.878831 (0.009180), Precisi

From the results, the best dropout rate is 0.2 (mean F1 = 0.898).

In [None]:
# tuning number of neurons for second hidden layer (without dropout layer)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier without dropout.

    neurons: width of the second hidden ReLU layer (the value being tuned).
    The first hidden layer is fixed at 20 ReLU units; the input width is read from the
    notebook-level `input_dim`.
    Returns the compiled Sequential model.
    """
    stack = [
        Dense(20, input_dim=input_dim, activation="relu"),
        Dense(neurons, activation="relu"),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(stack)
    tracked = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    net.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked)
    return net

tf.random.set_seed(7)
# random_state is an estimator parameter, so it travels with every clone GridSearchCV makes
# and lets scikeras seed each fit; the global seed above is set only in this process, while
# n_jobs=-1 runs the fits in worker processes.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16]
param_grid = dict(model__neurons=neurons)
# every scorer in `scoring` is evaluated; the best candidate is chosen (and refit) by F1
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# accuracy
means_acc = grid_result.cv_results_['mean_test_accuracy']
stds_acc = grid_result.cv_results_['std_test_accuracy']
# precision
means_prec = grid_result.cv_results_['mean_test_precision']
stds_prec = grid_result.cv_results_['std_test_precision']
# recall
means_recall = grid_result.cv_results_['mean_test_recall']
stds_recall = grid_result.cv_results_['std_test_recall']
#f1
means_f1 = grid_result.cv_results_['mean_test_f1']
stds_f1 = grid_result.cv_results_['std_test_f1']
# parameters
params = grid_result.cv_results_['params']
# one line per layer width: mean (std) of each metric over the CV folds
for mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % 
          (mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param))

Best: 0.913706 using {'model__neurons': 12}
Accuracy: 0.904241 (0.014787), Precision:  0.883779 (0.013353), Recall: 0.931808 (0.044482), F1: 0.906344 (0.017237) with: {'model__neurons': 8}
Accuracy: 0.909220 (0.008404), Precision:  0.870142 (0.002923), Recall: 0.961950 (0.015636), F1: 0.913706 (0.008617) with: {'model__neurons': 12}
Accuracy: 0.896451 (0.004540), Precision:  0.872182 (0.013299), Recall: 0.929521 (0.009017), F1: 0.899802 (0.003248) with: {'model__neurons': 16}


From the results, the best number of neurons for the second hidden layer in the model without dropout is 12 (mean F1 = 0.914).

In [None]:
# tuning number of neurons for second hidden layer (with dropout layer=0.2)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier with dropout after each hidden layer.

    neurons: width of the second hidden ReLU layer (the value being tuned).
    The first hidden layer is fixed at 20 ReLU units and both Dropout layers use rate 0.2;
    the input width is read from the notebook-level `input_dim`.
    Returns the compiled Sequential model.
    """
    stack = [
        Dense(20, input_dim=input_dim, activation="relu"),
        Dropout(0.2),
        Dense(neurons, activation="relu"),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(stack)
    tracked = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    net.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked)
    return net

tf.random.set_seed(7)
# random_state is an estimator parameter, so it travels with every clone GridSearchCV makes
# and lets scikeras seed each fit; the global seed above is set only in this process, while
# n_jobs=-1 runs the fits in worker processes.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16]
param_grid = dict(model__neurons=neurons)
# every scorer in `scoring` is evaluated; the best candidate is chosen (and refit) by F1
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# accuracy
means_acc = grid_result.cv_results_['mean_test_accuracy']
stds_acc = grid_result.cv_results_['std_test_accuracy']
# precision
means_prec = grid_result.cv_results_['mean_test_precision']
stds_prec = grid_result.cv_results_['std_test_precision']
# recall
means_recall = grid_result.cv_results_['mean_test_recall']
stds_recall = grid_result.cv_results_['std_test_recall']
#f1
means_f1 = grid_result.cv_results_['mean_test_f1']
stds_f1 = grid_result.cv_results_['std_test_f1']
# parameters
params = grid_result.cv_results_['params']
# one line per layer width: mean (std) of each metric over the CV folds
for mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % 
          (mean_acc, stdev_acc, mean_prec, stdev_prec, mean_recall, stdev_recall, mean_f1, stdev_f1, param))

Best: 0.900412 using {'model__neurons': 16}
Accuracy: 0.892876 (0.002349), Precision:  0.843382 (0.004004), Recall: 0.965018 (0.008291), F1: 0.900074 (0.002515) with: {'model__neurons': 8}
Accuracy: 0.893004 (0.006589), Precision:  0.854967 (0.013982), Recall: 0.947404 (0.025122), F1: 0.898440 (0.007199) with: {'model__neurons': 12}
Accuracy: 0.895556 (0.006803), Precision:  0.860656 (0.017184), Recall: 0.945096 (0.027148), F1: 0.900412 (0.007220) with: {'model__neurons': 16}


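From the results, the grid search ranks 16 neurons highest (mean F1 = 0.9004), but all three settings score within one standard deviation of each other (mean F1 ≈ 0.898–0.900). The model with dropout layers below uses 12 neurons for the second hidden layer.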

# Model Selection

In [None]:
# model 1 (single hidden layer without dropout)
# Re-seed TensorFlow (same seed as the tuning cells) so this cell starts from the same
# random state every time it is run, regardless of which cells ran before it.
tf.random.set_seed(7)
model1 = Sequential()
model1.add(Dense(20, input_dim=input_dim, activation='relu'))
model1.add(Dense(1, activation="sigmoid"))
# the metric order fixes the indices used in the prints below:
# [0] loss, [1] accuracy, [2] precision, [3] recall, [4] f1_metric, [5] AUC
model1.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model1.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
# x_train / y_train are the SMOTE-resampled training data; x_valid / y_valid are as loaded
train = model1.evaluate(x_train, y_train, verbose=0)
valid = model1.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.24807000160217285; accuracy = 0.9060265421867371; precision = 0.8730642795562744; recall = 0.9502043128013611, f1 = 0.7312540411949158, AUC = 0.9615662693977356
valid : loss = 0.3623411953449249; accuracy = 0.8724583983421326; precision = 0.34736841917037964; recall = 0.824999988079071, f1 = 0.4708216190338135, AUC = 0.8936127424240112


In [None]:
# model 2 (single hidden layer with dropout)
# Re-seed TensorFlow (same seed as the tuning cells) so this cell starts from the same
# random state every time it is run, regardless of which cells ran before it.
tf.random.set_seed(7)
model2 = Sequential()
model2.add(Dense(20, input_dim=input_dim, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1,activation="sigmoid"))
# the metric order fixes the indices used in the prints below:
# [0] loss, [1] accuracy, [2] precision, [3] recall, [4] f1_metric, [5] AUC
model2.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model2.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
# x_train / y_train are the SMOTE-resampled training data; x_valid / y_valid are as loaded
train = model2.evaluate(x_train, y_train, verbose=0)
valid = model2.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.24925780296325684; accuracy = 0.9033452272415161; precision = 0.8593856692314148; recall = 0.964504599571228, f1 = 0.7209519743919373, AUC = 0.9579201340675354
valid : loss = 0.32243645191192627; accuracy = 0.8484288454055786; precision = 0.3125; recall = 0.875, f1 = 0.4313725233078003, AUC = 0.9170159101486206


In [None]:
# model 3 (double hidden layer without dropout)
# Re-seed TensorFlow (same seed as the tuning cells) so this cell starts from the same
# random state every time it is run, regardless of which cells ran before it.
tf.random.set_seed(7)
model3 = Sequential()
model3.add(Dense(20, input_dim=input_dim, activation='relu'))
model3.add(Dense(12, activation='relu'))
model3.add(Dense(1, activation="sigmoid"))
# the metric order fixes the indices used in the prints below:
# [0] loss, [1] accuracy, [2] precision, [3] recall, [4] f1_metric, [5] AUC
model3.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model3.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
# x_train / y_train are the SMOTE-resampled training data; x_valid / y_valid are as loaded
train = model3.evaluate(x_train, y_train, verbose=0)
valid = model3.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.22258542478084564; accuracy = 0.9120275974273682; precision = 0.880811870098114; recall = 0.9530133008956909, f1 = 0.7409542798995972, AUC = 0.9638438820838928
valid : loss = 0.354107141494751; accuracy = 0.8613678216934204; precision = 0.3232323229312897; recall = 0.800000011920929, f1 = 0.43853333592414856, AUC = 0.89219069480896


In [None]:
# model 4 (double hidden layer with dropout)
# Re-seed TensorFlow (same seed as the tuning cells) so this cell starts from the same
# random state every time it is run, regardless of which cells ran before it.
tf.random.set_seed(7)
model4 = Sequential()
model4.add(Dense(20, input_dim=input_dim, activation='relu'))
model4.add(Dropout(0.2))
model4.add(Dense(12, activation='relu'))
model4.add(Dropout(0.2))
model4.add(Dense(1, activation="sigmoid"))
# the metric order fixes the indices used in the prints below:
# [0] loss, [1] accuracy, [2] precision, [3] recall, [4] f1_metric, [5] AUC
model4.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model4.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
# x_train / y_train are the SMOTE-resampled training data; x_valid / y_valid are as loaded
train = model4.evaluate(x_train, y_train, verbose=0)
valid = model4.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.2420443892478943; accuracy = 0.9027068614959717; precision = 0.860210120677948; recall = 0.9616956114768982, f1 = 0.7215970754623413, AUC = 0.9581348896026611
valid : loss = 0.3553621470928192; accuracy = 0.8447319865226746; precision = 0.3070175349712372; recall = 0.875, f1 = 0.4277702569961548, AUC = 0.9147704839706421


# Final Model

In [None]:
# Final model: same architecture and training settings as model 1 (one hidden layer of 20
# ReLU units, no dropout, 100 epochs, batch size 10).
# Re-seed TensorFlow (same seed as the tuning cells) so this cell starts from the same
# random state every time it is run, regardless of which cells ran before it.
tf.random.set_seed(7)
nn_model = Sequential()
nn_model.add(Dense(20, input_dim=input_dim, activation='relu'))
nn_model.add(Dense(1, activation="sigmoid"))
# the metric order fixes the indices used in the prints below:
# [0] loss, [1] accuracy, [2] precision, [3] recall, [4] f1_metric, [5] AUC
nn_model.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
nn_model.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
# x_train / y_train are the SMOTE-resampled training data; x_valid / y_valid are as loaded
train = nn_model.evaluate(x_train, y_train, verbose=0)
valid = nn_model.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.22860126197338104; accuracy = 0.9122829437255859; precision = 0.8764280676841736; recall = 0.9599080681800842, f1 = 0.7363041043281555, AUC = 0.9660319089889526
valid : loss = 0.3471260964870453; accuracy = 0.8613678216934204; precision = 0.32673266530036926; recall = 0.824999988079071, f1 = 0.4385584592819214, AUC = 0.891242504119873


In [None]:
# score the final model on the test split; evaluate() returns the loss followed by the
# compiled metrics, which fill the six placeholders in order
test = nn_model.evaluate(x_test, y_test, verbose=0)
print('test : loss = {}; accuracy = {}; precision = {}; recall = {}, f1 = {}, AUC = {}'.format(*test))

test : loss = 0.3176102042198181; accuracy = 0.8613678216934204; precision = 0.41025641560554504; recall = 0.8888888955116272, f1 = 0.4977661669254303, AUC = 0.9317818284034729
