In [None]:
# import libraries
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [None]:
# Load the "extra" train/test/validation splits from the shared data folder.
folder = "../../data/"

# Feature matrices, read in the order train -> test -> valid.
x_train, x_test, x_valid = (
    pd.read_csv(folder + name)
    for name in ("x_train_extra.csv", "x_test_extra.csv", "x_valid_extra.csv")
)

# Label files, forced to integer dtype, read in the same order.
y_train, y_test, y_valid = (
    pd.read_csv(folder + name, dtype=int)
    for name in ("y_train_extra.csv", "y_test_extra.csv", "y_valid_extra.csv")
)

# drop provider
# x_train = x_train.drop(columns=["Provider"])
# x_test = x_test.drop(columns=["Provider"])
# x_valid = x_valid.drop(columns=["Provider"])

In [None]:
# oversample
# Resample the training split with SMOTE; the test and validation splits are
# left untouched. A fixed random_state makes the synthetic samples identical on
# every run (7 matches the TensorFlow seed used in the tuning cells), so the
# downstream tuning and model-selection numbers can be reproduced.
# NOTE: this overwrites x_train / y_train, so every later cell that uses them
# sees the resampled data, not the original training split.
oversample = SMOTE(random_state=7)
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [None]:
# parameters
# Number of input features = number of columns in the training feature matrix.
input_dim = x_train.shape[1]

# Scorers passed to GridSearchCV. Each key becomes the suffix of the
# mean_test_<key> / std_test_<key> entries read back from cv_results_.
scoring = dict(
    precision=make_scorer(precision_score),
    accuracy=make_scorer(accuracy_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
)

In [None]:
# f1_score
import keras.backend as K

def f1_metric(y_true, y_pred):
    """Return the F1 score of the given labels and predicted scores.

    Predictions are clipped to [0, 1] and rounded, so 0.5 acts as the decision
    threshold. The backend epsilon is added to every denominator to avoid
    division by zero when there are no positives.

    Args:
        y_true: ground-truth labels (expected to be 0/1 valued).
        y_pred: predicted scores for the positive class.

    Returns:
        2 * precision * recall / (precision + recall), computed over exactly
        the values passed in.

    NOTE(review): when used via ``compile(metrics=[...])`` Keras presumably
    calls this per batch and averages the results, which would explain why
    the printed f1 differs from the F1 implied by the printed precision and
    recall -- confirm before relying on this number.
    """
    eps = K.epsilon()
    # Counts of true positives, actual positives and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    prec = hits / (flagged_pos + eps)
    rec = hits / (actual_pos + eps)
    return 2 * (prec * rec) / (prec + rec + eps)

# Tuning of hyperparameters

In [None]:
# tuning number of neurons, batch_size and epochs
def create_model(neurons):
    """Build and compile a single-hidden-layer binary classifier.

    Args:
        neurons: number of units in the hidden ReLU layer.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer) that reads ``input_dim`` features (notebook-level variable)
        and has one sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, activation="relu"))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="adam", 
                  metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
    return model

# tf.random.set_seed only seeds this kernel process. The cross-validation fits
# dispatched by n_jobs=-1 run in separate worker processes, so random_state is
# also given to the wrapper so that scikeras seeds every individual fit.
tf.random.set_seed(7)
model = KerasClassifier(model=create_model, random_state=7, verbose=0)
# define the grid search parameters
neurons = [16, 24, 32, 38, 42, 48]
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs, model__neurons=neurons)
# NOTE: x_train / y_train are the SMOTE-resampled data, so the validation part
# of each CV fold also contains synthetic samples; the scores below are likely
# optimistic compared with the untouched x_valid / x_test splits.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results (best_score_ is the mean CV score of the refit metric, "f1")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
params = cv_results['params']
# One line per parameter combination: mean (std) of each metric over the folds.
report_fmt = "Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r"
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print(report_fmt % row)

Best: 0.916257 using {'batch_size': 20, 'epochs': 100, 'model__neurons': 38}
Accuracy: 0.866059 (0.021142), Precision:  0.878388 (0.010091), Recall: 0.850848 (0.062170), F1: 0.862864 (0.027064) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 16}
Accuracy: 0.860821 (0.033727), Precision:  0.864572 (0.020956), Recall: 0.859531 (0.103371), F1: 0.857456 (0.045855) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 24}
Accuracy: 0.865679 (0.006569), Precision:  0.809734 (0.007460), Recall: 0.956077 (0.004864), F1: 0.876828 (0.005738) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 32}
Accuracy: 0.872448 (0.014350), Precision:  0.877905 (0.014360), Recall: 0.866462 (0.048088), F1: 0.871057 (0.018999) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 38}
Accuracy: 0.879213 (0.010703), Precision:  0.845261 (0.028098), Recall: 0.931564 (0.040403), F1: 0.885125 (0.010489) with: {'batch_size': 10, 'epochs': 10, 'model__neurons': 42}
Accuracy: 0.803759 (0.035247), Pr

From the results, neurons = 38, batch size = 20 and epochs = 100 give the best mean cross-validated F1 score (0.916).

In [None]:
# tuning drop out rate
def create_model(dropout_rate):
    """Build and compile a 38-unit single-hidden-layer classifier with dropout.

    Args:
        dropout_rate: rate of the Dropout layer placed between the hidden
            layer and the sigmoid output.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer) that reads ``input_dim`` features (notebook-level variable).
    """
    model = Sequential()
    model.add(Dense(38, input_dim=input_dim, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="adam", 
                  metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
    return model

# tf.random.set_seed only seeds this kernel process. The cross-validation fits
# dispatched by n_jobs=-1 run in separate worker processes, so random_state is
# also given to the wrapper so that scikeras seeds every individual fit.
tf.random.set_seed(7)
model = KerasClassifier(model=create_model, epochs=100, batch_size=20, random_state=7, verbose=0)
# define the grid search parameters
dropout_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(model__dropout_rate=dropout_rate)
# NOTE: x_train / y_train are the SMOTE-resampled data, so the CV validation
# folds also contain synthetic samples and the scores are likely optimistic.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results (best_score_ is the mean CV score of the refit metric, "f1")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
params = cv_results['params']
# One line per dropout rate: mean (std) of each metric over the folds.
report_fmt = "Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r"
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print(report_fmt % row)

Best: 0.905170 using {'model__dropout_rate': 0.1}
Accuracy: 0.902171 (0.012125), Precision:  0.876324 (0.002686), Recall: 0.936654 (0.031960), F1: 0.905170 (0.013683) with: {'model__dropout_rate': 0.1}
Accuracy: 0.891060 (0.005057), Precision:  0.847214 (0.016237), Recall: 0.955300 (0.024235), F1: 0.897593 (0.005042) with: {'model__dropout_rate': 0.2}
Accuracy: 0.898723 (0.006813), Precision:  0.865560 (0.006008), Recall: 0.944317 (0.022631), F1: 0.903026 (0.007817) with: {'model__dropout_rate': 0.3}
Accuracy: 0.897829 (0.003286), Precision:  0.861340 (0.003531), Recall: 0.948404 (0.012298), F1: 0.902715 (0.003906) with: {'model__dropout_rate': 0.4}
Accuracy: 0.891188 (0.004856), Precision:  0.849300 (0.004345), Recall: 0.951213 (0.012282), F1: 0.897319 (0.005124) with: {'model__dropout_rate': 0.5}
Accuracy: 0.890421 (0.004544), Precision:  0.854405 (0.007384), Recall: 0.941507 (0.017634), F1: 0.895690 (0.005386) with: {'model__dropout_rate': 0.6}
Accuracy: 0.880077 (0.006388), Precisi

From the results, a dropout rate of 0.1 gives the best F1 score (0.905) among the single-hidden-layer models with a dropout layer.

In [None]:
# tuning number of neurons for second hidden layer (without dropout layer)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier without dropout.

    Args:
        neurons: number of units in the second hidden ReLU layer (the first
            hidden layer is fixed at 38 units).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer) that reads ``input_dim`` features (notebook-level variable)
        and has one sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(38, input_dim=input_dim, activation="relu"))
    model.add(Dense(neurons, activation="relu"))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="adam", 
                  metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
    return model

# tf.random.set_seed only seeds this kernel process. The cross-validation fits
# dispatched by n_jobs=-1 run in separate worker processes, so random_state is
# also given to the wrapper so that scikeras seeds every individual fit.
tf.random.set_seed(7)
model = KerasClassifier(model=create_model, epochs=100, batch_size=20, random_state=7, verbose=0)
# define the grid search parameters
neurons = [8, 12, 16, 20, 24, 28, 32]
param_grid = dict(model__neurons=neurons)
# NOTE: x_train / y_train are the SMOTE-resampled data, so the CV validation
# folds also contain synthetic samples and the scores are likely optimistic.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results (best_score_ is the mean CV score of the refit metric, "f1")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
params = cv_results['params']
# One line per layer width: mean (std) of each metric over the folds.
report_fmt = "Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r"
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print(report_fmt % row)

Best: 0.926282 using {'model__neurons': 32}
Accuracy: 0.904598 (0.005535), Precision:  0.858483 (0.011899), Recall: 0.969349 (0.011888), F1: 0.910419 (0.004686) with: {'model__neurons': 8}
Accuracy: 0.903065 (0.004067), Precision:  0.877999 (0.033161), Recall: 0.940485 (0.051898), F1: 0.906199 (0.008124) with: {'model__neurons': 12}
Accuracy: 0.909579 (0.009072), Precision:  0.894005 (0.018488), Recall: 0.930524 (0.032253), F1: 0.911271 (0.010276) with: {'model__neurons': 16}
Accuracy: 0.912388 (0.004989), Precision:  0.874559 (0.012591), Recall: 0.963474 (0.020603), F1: 0.916600 (0.005272) with: {'model__neurons': 20}
Accuracy: 0.917241 (0.000828), Precision:  0.882210 (0.010782), Recall: 0.963474 (0.013598), F1: 0.920893 (0.000514) with: {'model__neurons': 24}
Accuracy: 0.920307 (0.006510), Precision:  0.886453 (0.006225), Recall: 0.964240 (0.016897), F1: 0.923603 (0.006879) with: {'model__neurons': 28}
Accuracy: 0.922733 (0.007754), Precision:  0.885831 (0.009067), Recall: 0.970626 

From the results, the two-hidden-layer model without dropout layers performs best (F1 = 0.926) when the second hidden layer has 32 neurons.

In [None]:
# tuning number of neurons for second hidden layer (with dropout layer=0.1)
def create_model(neurons):
    """Build and compile a two-hidden-layer classifier with dropout.

    A Dropout(0.1) layer follows each of the two hidden layers.

    Args:
        neurons: number of units in the second hidden ReLU layer (the first
            hidden layer is fixed at 38 units).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer) that reads ``input_dim`` features (notebook-level variable)
        and has one sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(38, input_dim=input_dim, activation="relu"))
    model.add(Dropout(0.1))
    model.add(Dense(neurons, activation="relu"))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="adam", 
                  metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
    return model

# tf.random.set_seed only seeds this kernel process. The cross-validation fits
# dispatched by n_jobs=-1 run in separate worker processes, so random_state is
# also given to the wrapper so that scikeras seeds every individual fit.
tf.random.set_seed(7)
model = KerasClassifier(model=create_model, epochs=100, batch_size=20, random_state=7, verbose=0)
# define the grid search parameters
neurons = [8, 12, 16, 20, 24, 28, 32]
param_grid = dict(model__neurons=neurons)
# NOTE: x_train / y_train are the SMOTE-resampled data, so the CV validation
# folds also contain synthetic samples and the scores are likely optimistic.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring=scoring, refit="f1")
grid_result = grid.fit(x_train, y_train)
# summarize results (best_score_ is the mean CV score of the refit metric, "f1")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means_acc, stds_acc = cv_results['mean_test_accuracy'], cv_results['std_test_accuracy']
means_prec, stds_prec = cv_results['mean_test_precision'], cv_results['std_test_precision']
means_recall, stds_recall = cv_results['mean_test_recall'], cv_results['std_test_recall']
means_f1, stds_f1 = cv_results['mean_test_f1'], cv_results['std_test_f1']
params = cv_results['params']
# One line per layer width: mean (std) of each metric over the folds.
report_fmt = "Accuracy: %f (%f), Precision:  %f (%f), Recall: %f (%f), F1: %f (%f) with: %r"
for row in zip(means_acc, stds_acc, means_prec, stds_prec, means_recall, stds_recall, means_f1, stds_f1, params):
    print(report_fmt % row)

Best: 0.914137 using {'model__neurons': 32}
Accuracy: 0.897957 (0.003006), Precision:  0.865547 (0.006853), Recall: 0.942529 (0.015865), F1: 0.902265 (0.003923) with: {'model__neurons': 8}
Accuracy: 0.898467 (0.007396), Precision:  0.855414 (0.014312), Recall: 0.959898 (0.026555), F1: 0.904251 (0.007912) with: {'model__neurons': 12}
Accuracy: 0.904087 (0.004737), Precision:  0.850830 (0.006734), Recall: 0.980077 (0.003484), F1: 0.910874 (0.004115) with: {'model__neurons': 16}
Accuracy: 0.901405 (0.004989), Precision:  0.873647 (0.007509), Recall: 0.938697 (0.010855), F1: 0.904938 (0.004969) with: {'model__neurons': 20}
Accuracy: 0.904853 (0.000651), Precision:  0.856836 (0.002630), Recall: 0.972158 (0.002605), F1: 0.910854 (0.000339) with: {'model__neurons': 24}
Accuracy: 0.906130 (0.003525), Precision:  0.873622 (0.009769), Recall: 0.949936 (0.011975), F1: 0.910066 (0.003299) with: {'model__neurons': 28}
Accuracy: 0.909323 (0.002839), Precision:  0.868321 (0.009266), Recall: 0.965262 

From the results, the two-hidden-layer model with dropout layers performs best (F1 = 0.914) when the second hidden layer has 32 neurons.

# Model Selection

In [None]:
# model 1 (single hidden layer without dropout)
# Layers: Dense(38, relu) -> Dense(1, sigmoid).
model1 = Sequential([
    Dense(38, input_dim=input_dim, activation='relu'),
    Dense(1, activation="sigmoid"),
])
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()],
)
model1.fit(x_train, y_train, epochs=100, batch_size=20, verbose=0)

# Score on the training data and on the validation split; entries 0-5 of each
# result are reported as loss, accuracy, precision, recall, f1 and AUC.
train = model1.evaluate(x_train, y_train, verbose=0)
valid = model1.evaluate(x_valid, y_valid, verbose=0)
for split, scores in (("train", train), ("valid", valid)):
    print(f'{split} : loss = {scores[0]}; accuracy = {scores[1]}; precision = {scores[2]}; recall = {scores[3]}, f1 = {scores[4]}, AUC = {scores[5]}')

train : loss = 0.22085408866405487; accuracy = 0.9190500378608704; precision = 0.8758589029312134; recall = 0.9765066504478455, f1 = 0.746049702167511, AUC = 0.9686145186424255
valid : loss = 0.44531404972076416; accuracy = 0.8613678216934204; precision = 0.32673266530036926; recall = 0.824999988079071, f1 = 0.44975343346595764, AUC = 0.8980787992477417


In [None]:
# model 2 (single hidden layer with dropout)
# Layers: Dense(38, relu) -> Dropout(0.1) -> Dense(1, sigmoid).
model2 = Sequential([
    Dense(38, input_dim=input_dim, activation='relu'),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
])
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()],
)
model2.fit(x_train, y_train, epochs=100, batch_size=20, verbose=0)

# Score on the training data and on the validation split; entries 0-5 of each
# result are reported as loss, accuracy, precision, recall, f1 and AUC.
train = model2.evaluate(x_train, y_train, verbose=0)
valid = model2.evaluate(x_valid, y_valid, verbose=0)
for split, scores in (("train", train), ("valid", valid)):
    print(f'{split} : loss = {scores[0]}; accuracy = {scores[1]}; precision = {scores[2]}; recall = {scores[3]}, f1 = {scores[4]}, AUC = {scores[5]}')

train : loss = 0.20112751424312592; accuracy = 0.9244126677513123; precision = 0.8834794759750366; recall = 0.9777834415435791, f1 = 0.7537176609039307, AUC = 0.9682551622390747
valid : loss = 0.36390575766563416; accuracy = 0.8539741039276123; precision = 0.3106796145439148; recall = 0.800000011920929, f1 = 0.40335965156555176, AUC = 0.8893712162971497


In [None]:
# model 3 (double hidden layer without dropout)
# Layers: Dense(38, relu) -> Dense(32, relu) -> Dense(1, sigmoid).
model3 = Sequential([
    Dense(38, input_dim=input_dim, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation="sigmoid"),
])
model3.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()],
)
model3.fit(x_train, y_train, epochs=100, batch_size=20, verbose=0)

# Score on the training data and on the validation split; entries 0-5 of each
# result are reported as loss, accuracy, precision, recall, f1 and AUC.
train = model3.evaluate(x_train, y_train, verbose=0)
valid = model3.evaluate(x_valid, y_valid, verbose=0)
for split, scores in (("train", train), ("valid", valid)):
    print(f'{split} : loss = {scores[0]}; accuracy = {scores[1]}; precision = {scores[2]}; recall = {scores[3]}, f1 = {scores[4]}, AUC = {scores[5]}')

train : loss = 0.14671862125396729; accuracy = 0.9463738799095154; precision = 0.9138257503509521; recall = 0.9856997132301331, f1 = 0.8028750419616699, AUC = 0.9838058948516846
valid : loss = 0.4194095730781555; accuracy = 0.8983364105224609; precision = 0.39726027846336365; recall = 0.7250000238418579, f1 = 0.4613865911960602, AUC = 0.8644710779190063


In [None]:
# model 4 (double hidden layer with dropout)
# Layers: Dense(38, relu) -> Dropout(0.1) -> Dense(32, relu) -> Dropout(0.1)
#         -> Dense(1, sigmoid).
model4 = Sequential([
    Dense(38, input_dim=input_dim, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
])
model4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()],
)
model4.fit(x_train, y_train, epochs=100, batch_size=20, verbose=0)

# Score on the training data and on the validation split; entries 0-5 of each
# result are reported as loss, accuracy, precision, recall, f1 and AUC.
train = model4.evaluate(x_train, y_train, verbose=0)
valid = model4.evaluate(x_valid, y_valid, verbose=0)
for split, scores in (("train", train), ("valid", valid)):
    print(f'{split} : loss = {scores[0]}; accuracy = {scores[1]}; precision = {scores[2]}; recall = {scores[3]}, f1 = {scores[4]}, AUC = {scores[5]}')

train : loss = 0.18956579267978668; accuracy = 0.9263278841972351; precision = 0.8919934034347534; recall = 0.9701225757598877, f1 = 0.766128659248352, AUC = 0.9714576005935669
valid : loss = 0.37886080145835876; accuracy = 0.8853974342346191; precision = 0.375; recall = 0.824999988079071, f1 = 0.507236123085022, AUC = 0.886876106262207


# Final Model

In [None]:
# Final model: one 38-unit hidden layer followed by Dropout(0.1) and a sigmoid
# output, i.e. the same layer stack as model 2.
nn_model = Sequential()
nn_model.add(Dense(38, input_dim=input_dim, activation='relu'))
nn_model.add(Dropout(0.1))
# The sigmoid layer must be the last layer. A Dropout placed after it would
# randomly zero / rescale the predicted probabilities during training, so the
# loss would be computed on corrupted outputs.
nn_model.add(Dense(1, activation="sigmoid"))
nn_model.compile(loss='binary_crossentropy', optimizer='adam', 
               metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), f1_metric, keras.metrics.AUC()])
nn_model.fit(x_train, y_train, epochs=100, batch_size=20, verbose=0)
# Score on the training data and on the validation split; entries 0-5 of each
# result are reported as loss, accuracy, precision, recall, f1 and AUC.
train = nn_model.evaluate(x_train, y_train, verbose=0)
valid = nn_model.evaluate(x_valid, y_valid, verbose=0)
print(f'train : loss = {train[0]}; accuracy = {train[1]}; precision = {train[2]}; recall = {train[3]}, f1 = {train[4]}, AUC = {train[5]}')
print(f'valid : loss = {valid[0]}; accuracy = {valid[1]}; precision = {valid[2]}; recall = {valid[3]}, f1 = {valid[4]}, AUC = {valid[5]}')

train : loss = 0.23789189755916595; accuracy = 0.9122829437255859; precision = 0.8781915903091431; recall = 0.9573544263839722, f1 = 0.7396813631057739, AUC = 0.9583377242088318
valid : loss = 0.2927311360836029; accuracy = 0.8613678216934204; precision = 0.3333333432674408; recall = 0.875, f1 = 0.4599184989929199, AUC = 0.9181138277053833


In [None]:
# Score the final network on the held-out test split and report the same six
# values (loss, accuracy, precision, recall, f1, AUC) as for train/valid.
test = nn_model.evaluate(x_test, y_test, verbose=0)
print(
    f'test : loss = {test[0]}; accuracy = {test[1]}; precision = {test[2]}; '
    f'recall = {test[3]}, f1 = {test[4]}, AUC = {test[5]}'
)

test : loss = 0.3066774606704712; accuracy = 0.8576709628105164; precision = 0.4017094075679779; recall = 0.8703703880310059, f1 = 0.4736981689929962, AUC = 0.908985435962677
