In [None]:
# import libraries
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [None]:
# import files (ori)
# Every split lives in `folder` as x_<split>_ori.csv (features) and
# y_<split>_ori.csv (labels); the label files are parsed as integers.
folder = "../../data/"

x_train, x_test, x_valid = (
    pd.read_csv(f"{folder}x_{split}_ori.csv") for split in ("train", "test", "valid")
)

y_train, y_test, y_valid = (
    pd.read_csv(f"{folder}y_{split}_ori.csv", dtype=int) for split in ("train", "test", "valid")
)

In [None]:
# oversample
# SMOTE draws random neighbours when it synthesises samples, so a fixed
# random_state is needed for the resampled training set (and every score
# computed from it) to be the same on each run.
# NOTE(review): x_train / y_train are resampled *before* GridSearchCV splits
# them, so synthetic rows interpolated from a fold's validation samples can
# end up in that fold's training part and the cross-validated scores are likely
# optimistic. Consider resampling inside an imblearn Pipeline instead.
oversample = SMOTE(random_state=7)
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [None]:
# parameters
# Number of feature columns; passed as input_dim to the first Dense layer.
input_dim = x_train.shape[1]
# Scorers evaluated for every grid-search candidate, keyed by the name that
# appears in cv_results_ (mean_test_<name> / std_test_<name>).
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
}

In [None]:
# f1_score
import keras.backend as K

def f1_metric(y_true, y_pred):
    """Compute the F1 score of the given labels and predicted probabilities.

    Predictions are clipped to [0, 1] and rounded to 0/1 before counting, and
    K.epsilon() is added to every denominator so that a batch without
    positives yields 0 instead of a division by zero.

    NOTE(review): when this function is passed in ``metrics=[...]`` Keras
    evaluates it batch by batch and reports the mean of the per-batch values,
    so the number shown by fit()/evaluate() is generally not the F1 of the
    whole dataset. Derive that from the accumulated precision and recall.

    Args:
        y_true: ground-truth labels (0 or 1).
        y_pred: predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the F1 score of the values passed in.
    """
    # The label files are read with dtype=int; cast both inputs to the backend
    # float type so the element-wise product below never mixes int and float.
    y_true = K.cast(y_true, K.floatx())
    y_pred = K.cast(y_pred, K.floatx())
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

# Tuning of hyperparameters

In [None]:
# tuning number of neurons, batch_size and epochs
def create_model(neurons):
    """Build and compile a binary classifier with one hidden layer.

    Args:
        neurons: number of units in the single ReLU hidden layer.

    Returns:
        A compiled Sequential model with a one-unit sigmoid output, trained
        with binary cross-entropy and Adam, tracking accuracy, precision and
        recall.
    """
    network = Sequential([
        Dense(neurons, input_dim=input_dim, activation="relu"),
        Dense(1, activation='sigmoid'),
    ])
    tracked_metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked_metrics)
    return network

tf.random.set_seed(7)
# tf.random.set_seed only seeds this notebook process, while n_jobs=-1 runs
# the cross-validation fits in separate worker processes. random_state makes
# scikeras seed TensorFlow around every build/fit, so the search is repeatable.
model = KerasClassifier(model=create_model, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16, 20, 24]
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs, model__neurons=neurons)
# every candidate is scored with all entries of `scoring`; the candidate with
# the best mean F1 is refit on the full training data
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
# accuracy
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
# precision
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
# recall
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
#f1
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
# parameters
params = cv_results['params']
# one line per candidate: mean (std) over the 3 folds for each metric
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % row)

Best: 0.850338 using {'batch_size': 10, 'epochs': 100, 'model__neurons': 20}
Accuracy: 0.735062 (0.015275), Precision:  0.762524 (0.027549), Recall: 0.684881 (0.009665), F1: 0.721247 (0.010699) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 8}
Accuracy: 0.728044 (0.027322), Precision:  0.718754 (0.038350), Recall: 0.754082 (0.013876), F1: 0.735409 (0.020699) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 12}
Accuracy: 0.734294 (0.006150), Precision:  0.776961 (0.046256), Recall: 0.669062 (0.066587), F1: 0.714550 (0.016305) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 16}
Accuracy: 0.755487 (0.016481), Precision:  0.757374 (0.018634), Recall: 0.755871 (0.075665), F1: 0.753792 (0.030961) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 20}
Accuracy: 0.730590 (0.013654), Precision:  0.756405 (0.023575), Recall: 0.686679 (0.083837), F1: 0.715761 (0.033659) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 24}
Accuracy: 0.756894 (0.027771), Pre

From the results, neurons = 20, batch size = 10 and epochs = 100 give the best mean cross-validated F1 score (0.850).

In [None]:
# tuning drop out rate
def create_model(dropout_rate):
    """Build and compile a one-hidden-layer classifier with dropout.

    Args:
        dropout_rate: fraction of the 20 hidden units dropped during training.

    Returns:
        A compiled Sequential model (Dense(20, relu) -> Dropout -> sigmoid
        output) trained with binary cross-entropy and Adam, tracking accuracy,
        precision and recall.
    """
    network = Sequential([
        Dense(20, input_dim=input_dim, activation="relu"),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])
    tracked_metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked_metrics)
    return network

tf.random.set_seed(7)
# tf.random.set_seed only seeds this notebook process, while n_jobs=-1 runs
# the cross-validation fits in separate worker processes. random_state makes
# scikeras seed TensorFlow around every build/fit, so the search is repeatable.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
dropout_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(model__dropout_rate=dropout_rate)
# every candidate is scored with all entries of `scoring`; the candidate with
# the best mean F1 is refit on the full training data
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
# accuracy
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
# precision
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
# recall
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
#f1
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
# parameters
params = cv_results['params']
# one line per candidate: mean (std) over the 3 folds for each metric
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % row)

Best: 0.838978 using {'model__dropout_rate': 0.3}
Accuracy: 0.818564 (0.003805), Precision:  0.796639 (0.040936), Recall: 0.865697 (0.071157), F1: 0.826098 (0.009094) with: {'model__dropout_rate': 0.1}
Accuracy: 0.807712 (0.003201), Precision:  0.808810 (0.028325), Recall: 0.810254 (0.049997), F1: 0.807628 (0.012221) with: {'model__dropout_rate': 0.2}
Accuracy: 0.830186 (0.010847), Precision:  0.797770 (0.016226), Recall: 0.885864 (0.031520), F1: 0.838978 (0.011787) with: {'model__dropout_rate': 0.3}
Accuracy: 0.812818 (0.011105), Precision:  0.767472 (0.024722), Recall: 0.901432 (0.040117), F1: 0.827965 (0.009511) with: {'model__dropout_rate': 0.4}
Accuracy: 0.810138 (0.005671), Precision:  0.769783 (0.010918), Recall: 0.886096 (0.035068), F1: 0.823290 (0.009492) with: {'model__dropout_rate': 0.5}
Accuracy: 0.778727 (0.015367), Precision:  0.750343 (0.028651), Recall: 0.841670 (0.061544), F1: 0.791155 (0.019158) with: {'model__dropout_rate': 0.6}
Accuracy: 0.796988 (0.007959), Precisi

From the results, dropout rate = 0.3 seems to be performing best.

In [None]:
# tuning number of neurons for second hidden layer (without dropout layer)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier without dropout.

    Args:
        neurons: number of units in the second ReLU hidden layer (the first
            hidden layer is fixed at 20 units).

    Returns:
        A compiled Sequential model with a one-unit sigmoid output, trained
        with binary cross-entropy and Adam, tracking accuracy, precision and
        recall.
    """
    network = Sequential([
        Dense(20, input_dim=input_dim, activation="relu"),
        Dense(neurons, activation="relu"),
        Dense(1, activation='sigmoid'),
    ])
    tracked_metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked_metrics)
    return network

tf.random.set_seed(7)
# tf.random.set_seed only seeds this notebook process, while n_jobs=-1 runs
# the cross-validation fits in separate worker processes. random_state makes
# scikeras seed TensorFlow around every build/fit, so the search is repeatable.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16]
param_grid = dict(model__neurons=neurons)
# every candidate is scored with all entries of `scoring`; the candidate with
# the best mean F1 is refit on the full training data
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
# accuracy
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
# precision
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
# recall
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
#f1
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
# parameters
params = cv_results['params']
# one line per candidate: mean (std) over the 3 folds for each metric
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % row)

Best: 0.848344 using {'model__neurons': 16}
Accuracy: 0.823419 (0.011855), Precision:  0.775970 (0.027457), Recall: 0.913956 (0.042825), F1: 0.838011 (0.009598) with: {'model__neurons': 8}
Accuracy: 0.830182 (0.013901), Precision:  0.776814 (0.027849), Recall: 0.930037 (0.021395), F1: 0.845836 (0.008128) with: {'model__neurons': 12}
Accuracy: 0.833885 (0.010310), Precision:  0.782711 (0.026925), Recall: 0.928250 (0.030414), F1: 0.848344 (0.003865) with: {'model__neurons': 16}


From the results, the model without a dropout layer performs best when the second hidden layer has 16 neurons.

In [None]:
# tuning number of neurons for second hidden layer (with dropout layer=0.3)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier with dropout.

    A Dropout(0.3) layer follows each hidden layer.

    Args:
        neurons: number of units in the second ReLU hidden layer (the first
            hidden layer is fixed at 20 units).

    Returns:
        A compiled Sequential model with a one-unit sigmoid output, trained
        with binary cross-entropy and Adam, tracking accuracy, precision and
        recall.
    """
    network = Sequential([
        Dense(20, input_dim=input_dim, activation="relu"),
        Dropout(0.3),
        Dense(neurons, activation="relu"),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ])
    tracked_metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=tracked_metrics)
    return network

tf.random.set_seed(7)
# tf.random.set_seed only seeds this notebook process, while n_jobs=-1 runs
# the cross-validation fits in separate worker processes. random_state makes
# scikeras seed TensorFlow around every build/fit, so the search is repeatable.
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0, random_state=7)
# define the grid search parameters
neurons = [8, 12, 16]
param_grid = dict(model__neurons=neurons)
# every candidate is scored with all entries of `scoring`; the candidate with
# the best mean F1 is refit on the full training data
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
# accuracy
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
# precision
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
# recall
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
#f1
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
# parameters
params = cv_results['params']
# one line per candidate: mean (std) over the 3 folds for each metric
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print("Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r" % row)

Best: 0.838964 using {'model__neurons': 12}
Accuracy: 0.813842 (0.021671), Precision:  0.770535 (0.045355), Recall: 0.905015 (0.044840), F1: 0.829948 (0.009195) with: {'model__neurons': 8}
Accuracy: 0.818311 (0.017427), Precision:  0.754394 (0.020486), Recall: 0.945355 (0.009678), F1: 0.838964 (0.013108) with: {'model__neurons': 12}
Accuracy: 0.799662 (0.032836), Precision:  0.781702 (0.013398), Recall: 0.831457 (0.084411), F1: 0.803798 (0.042084) with: {'model__neurons': 16}


From the results, the model with dropout layers performs best when the second hidden layer has 12 neurons.

# Model Selection

In [None]:
# model 1 (single hidden layer without dropout)
# Seed TensorFlow's global RNG (same seed as the tuning cells) so this cell
# does not depend on which cells were run before it.
tf.random.set_seed(7)
model1 = Sequential()
model1.add(Dense(20, input_dim=input_dim, activation='relu'))
model1.add(Dense(1, activation="sigmoid"))
model1.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model1.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
train = model1.evaluate(x_train, y_train, verbose=0)
valid = model1.evaluate(x_valid, y_valid, verbose=0)
# evaluate() returns [loss, accuracy, precision, recall, f1_metric, AUC] in
# compile order. f1_metric is a plain function, so its value is a mean of
# per-batch F1 scores and can disagree with the precision/recall printed next
# to it. F1 is therefore derived from the precision and recall accumulated
# over the whole split.
for split_name, split_scores in (("train", train), ("valid", valid)):
    loss, accuracy, precision_val, recall_val, batch_f1, auc = split_scores
    pr_sum = precision_val + recall_val
    f1_val = 2 * precision_val * recall_val / pr_sum if pr_sum > 0 else 0.0
    print(f'{split_name} : loss = {loss}; accuracy = {accuracy}; precision = {precision_val}; recall = {recall_val}, f1 = {f1_val}, AUC = {auc}')

train : loss = 0.5275610089302063; accuracy = 0.7562564015388489; precision = 0.7050051093101501; recall = 0.8812564015388489, f1 = 0.5902056694030762, AUC = 0.8436737656593323
valid : loss = 0.7414788603782654; accuracy = 0.6543437838554382; precision = 0.16129031777381897; recall = 0.875, f1 = 0.26049262285232544, AUC = 0.84496009349823


In [None]:
# model 2 (single hidden layer with dropout)
# Seed TensorFlow's global RNG (same seed as the tuning cells) so this cell
# does not depend on which cells were run before it.
tf.random.set_seed(7)
model2 = Sequential()
model2.add(Dense(20, input_dim=input_dim, activation='relu'))
model2.add(Dropout(0.3))
model2.add(Dense(1, activation="sigmoid"))
model2.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model2.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
train = model2.evaluate(x_train, y_train, verbose=0)
valid = model2.evaluate(x_valid, y_valid, verbose=0)
# evaluate() returns [loss, accuracy, precision, recall, f1_metric, AUC] in
# compile order. f1_metric is a plain function, so its value is a mean of
# per-batch F1 scores and can disagree with the precision/recall printed next
# to it. F1 is therefore derived from the precision and recall accumulated
# over the whole split.
for split_name, split_scores in (("train", train), ("valid", valid)):
    loss, accuracy, precision_val, recall_val, batch_f1, auc = split_scores
    pr_sum = precision_val + recall_val
    f1_val = 2 * precision_val * recall_val / pr_sum if pr_sum > 0 else 0.0
    print(f'{split_name} : loss = {loss}; accuracy = {accuracy}; precision = {precision_val}; recall = {recall_val}, f1 = {f1_val}, AUC = {auc}')

train : loss = 0.43915849924087524; accuracy = 0.8059244155883789; precision = 0.7302075624465942; recall = 0.9703779220581055, f1 = 0.6276625990867615, AUC = 0.8833599090576172
valid : loss = 0.5497429370880127; accuracy = 0.6672828197479248; precision = 0.16019417345523834; recall = 0.824999988079071, f1 = 0.25266751646995544, AUC = 0.8374001979827881


In [None]:
# model 3 (double hidden layer without dropout)
# Seed TensorFlow's global RNG (same seed as the tuning cells) so this cell
# does not depend on which cells were run before it.
tf.random.set_seed(7)
model3 = Sequential()
model3.add(Dense(20, input_dim=input_dim, activation='relu'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(1, activation="sigmoid"))
model3.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model3.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
train = model3.evaluate(x_train, y_train, verbose=0)
valid = model3.evaluate(x_valid, y_valid, verbose=0)
# evaluate() returns [loss, accuracy, precision, recall, f1_metric, AUC] in
# compile order. f1_metric is a plain function, so its value is a mean of
# per-batch F1 scores and can disagree with the precision/recall printed next
# to it. F1 is therefore derived from the precision and recall accumulated
# over the whole split.
for split_name, split_scores in (("train", train), ("valid", valid)):
    loss, accuracy, precision_val, recall_val, batch_f1, auc = split_scores
    pr_sum = precision_val + recall_val
    f1_val = 2 * precision_val * recall_val / pr_sum if pr_sum > 0 else 0.0
    print(f'{split_name} : loss = {loss}; accuracy = {accuracy}; precision = {precision_val}; recall = {recall_val}, f1 = {f1_val}, AUC = {auc}')

train : loss = 0.37698426842689514; accuracy = 0.832482099533081; precision = 0.7960436344146729; recall = 0.8940244913101196, f1 = 0.6425326466560364, AUC = 0.9043956995010376
valid : loss = 0.47812938690185547; accuracy = 0.7707948088645935; precision = 0.21621622145175934; recall = 0.800000011920929, f1 = 0.32716643810272217, AUC = 0.8565369248390198


In [None]:
# model 4 (double hidden layer with dropout)
# Seed TensorFlow's global RNG (same seed as the tuning cells) so this cell
# does not depend on which cells were run before it.
tf.random.set_seed(7)
model4 = Sequential()
model4.add(Dense(20, input_dim=input_dim, activation='relu'))
model4.add(Dropout(0.3))
model4.add(Dense(12, activation='relu'))
model4.add(Dropout(0.3))
model4.add(Dense(1, activation="sigmoid"))
model4.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
model4.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
train = model4.evaluate(x_train, y_train, verbose=0)
valid = model4.evaluate(x_valid, y_valid, verbose=0)
# evaluate() returns [loss, accuracy, precision, recall, f1_metric, AUC] in
# compile order. f1_metric is a plain function, so its value is a mean of
# per-batch F1 scores and can disagree with the precision/recall printed next
# to it. F1 is therefore derived from the precision and recall accumulated
# over the whole split.
for split_name, split_scores in (("train", train), ("valid", valid)):
    loss, accuracy, precision_val, recall_val, batch_f1, auc = split_scores
    pr_sum = precision_val + recall_val
    f1_val = 2 * precision_val * recall_val / pr_sum if pr_sum > 0 else 0.0
    print(f'{split_name} : loss = {loss}; accuracy = {accuracy}; precision = {precision_val}; recall = {recall_val}, f1 = {f1_val}, AUC = {auc}')

train : loss = 0.414261132478714; accuracy = 0.8079673051834106; precision = 0.7287557125091553; recall = 0.9811031818389893, f1 = 0.632788896560669, AUC = 0.8968215584754944
valid : loss = 0.5556502938270569; accuracy = 0.6580406427383423; precision = 0.1627907007932663; recall = 0.875, f1 = 0.2579997479915619, AUC = 0.8460828065872192


# Final Model

In [None]:
# Final model: two hidden layers (20 and 16 units), no dropout.
# Seed TensorFlow's global RNG (same seed as the tuning cells) so this cell
# does not depend on which cells were run before it.
tf.random.set_seed(7)
nn_model = Sequential()
nn_model.add(Dense(20, input_dim=input_dim, activation='relu'))
nn_model.add(Dense(16, activation="relu"))
nn_model.add(Dense(1, activation="sigmoid"))
nn_model.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
nn_model.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0)
train = nn_model.evaluate(x_train, y_train, verbose=0)
valid = nn_model.evaluate(x_valid, y_valid, verbose=0)
# evaluate() returns [loss, accuracy, precision, recall, f1_metric, AUC] in
# compile order. f1_metric is a plain function, so its value is a mean of
# per-batch F1 scores and can disagree with the precision/recall printed next
# to it. F1 is therefore derived from the precision and recall accumulated
# over the whole split.
for split_name, split_scores in (("train", train), ("valid", valid)):
    loss, accuracy, precision_val, recall_val, batch_f1, auc = split_scores
    pr_sum = precision_val + recall_val
    f1_val = 2 * precision_val * recall_val / pr_sum if pr_sum > 0 else 0.0
    print(f'{split_name} : loss = {loss}; accuracy = {accuracy}; precision = {precision_val}; recall = {recall_val}, f1 = {f1_val}, AUC = {auc}')

train : loss = 0.365684449672699; accuracy = 0.8309499621391296; precision = 0.798068106174469; recall = 0.8861082792282104, f1 = 0.6449406743049622, AUC = 0.9128667712211609
valid : loss = 0.49847784638404846; accuracy = 0.7670979499816895; precision = 0.2133333384990692; recall = 0.800000011920929, f1 = 0.3074982464313507, AUC = 0.8507236242294312


In [None]:
# Score the final model on the held-out test split.
test = nn_model.evaluate(x_test, y_test, verbose=0)
# test holds [loss, accuracy, precision, recall, f1_metric, AUC] in compile
# order. The f1_metric entry (index 4) is a mean of per-batch F1 scores, so F1
# is derived from the precision and recall accumulated over the whole split.
test_precision, test_recall = test[2], test[3]
test_pr_sum = test_precision + test_recall
test_f1 = 2 * test_precision * test_recall / test_pr_sum if test_pr_sum > 0 else 0.0
print(f'test : loss = {test[0]}; accuracy = {test[1]}; precision = {test_precision}; recall = {test_recall}, f1 = {test_f1}, AUC = {test[5]}')

test : loss = 0.5184089541435242; accuracy = 0.7670979499816895; precision = 0.26923078298568726; recall = 0.7777777910232544, f1 = 0.358478307723999, AUC = 0.837135910987854
