In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import os

In [None]:
def scale_dataset(dataframe: pd.DataFrame, oversample: bool = False, scaler=None):
    """Standardise the feature columns of a frame and optionally oversample it.

    The last column of ``dataframe`` is treated as the target; every other
    column is treated as a feature.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose last column holds the target values.
    oversample : bool, default False
        When True, the scaled features and targets are resampled with
        ``RandomOverSampler`` before being returned.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler used to transform the features. Pass the
        scaler fitted on the training split when scaling validation/test
        splits so all splits share the same statistics. When omitted, a new
        ``StandardScaler`` is fitted on ``dataframe`` itself (the original
        behaviour).

    Returns
    -------
    data : np.ndarray
        Scaled features with the target appended as the last column.
    X : np.ndarray
        Scaled (and possibly oversampled) features.
    y : np.ndarray
        Targets aligned with ``X``.
    """
    # Positional selection: the target is the last column, whatever it is called.
    X = dataframe.iloc[:, :-1].to_numpy()
    y = dataframe.iloc[:, -1].to_numpy()

    if scaler is None:
        # No scaler supplied: fit one on this frame only.
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse the caller's fitted scaler; do not refit on this frame.
        X = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)

    # Stack the target back on as a trailing column.
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

In [None]:
def plot_history(history, save_path=None):
    """Plot loss and accuracy curves for one training run.

    Parameters
    ----------
    history
        Object exposing a ``history`` mapping from metric name to a sequence
        of per-epoch values (the object returned by ``model.fit``). The keys
        "loss", "val_loss", "accuracy" and "val_accuracy" are plotted; any
        key that is absent is skipped.
    save_path : path-like, optional
        When given, the figure is also written to this path with
        ``fig.savefig`` before it is shown.
    """
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))

    panels = (
        (ax_loss, ("loss", "val_loss"), "Binary crossentropy"),
        (ax_acc, ("accuracy", "val_accuracy"), "Accuracy"),
    )
    for ax, keys, ylabel in panels:
        for key in keys:
            if key in history.history:
                ax.plot(history.history[key], label=key)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(ylabel)
        ax.grid(True)
        # The labels above are only rendered once a legend is requested.
        if ax.get_legend_handles_labels()[0]:
            ax.legend()

    # Save before showing so the file is written while the figure is open.
    if save_path is not None:
        fig.savefig(save_path)

    plt.show()
    # Release the figure: this function is called once per configuration in
    # the grid search, and unclosed figures accumulate in memory.
    plt.close(fig)

In [None]:
def train_model(X_train, y_train, X_valid, y_valid, num_nodes, dropout_prob, lr, batch_size, epochs):
    """Build, compile and fit a small feed-forward binary classifier.

    The network has two ReLU hidden layers of ``num_nodes`` units, each
    followed by dropout, and a single sigmoid output unit. It is trained with
    the Adadelta optimizer and binary cross-entropy loss, tracking accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and targets; ``X_train.shape[1]`` sets the input width.
    X_valid, y_valid
        Data passed to ``fit`` as ``validation_data``.
    num_nodes : int
        Units in each hidden layer.
    dropout_prob : float
        Dropout rate applied after each hidden layer.
    lr : float
        Learning rate for the Adadelta optimizer.
    batch_size : int
        Mini-batch size used by ``fit``.
    epochs : int
        Number of training epochs.

    Returns
    -------
    (model, history)
        The fitted ``tf.keras.Sequential`` model and the object returned by ``fit``.
    """
    n_features = X_train.shape[1]

    nn_model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # input_dim to the first Dense layer is a legacy form that newer
        # Keras versions warn about.
        tf.keras.Input(shape=(n_features,)),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    nn_model.compile(
        optimizer=tf.keras.optimizers.Adadelta(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    history = nn_model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_valid, y_valid),
    )

    return nn_model, history

In [None]:
# Fixed seed so the shuffle, and therefore the train/valid/test membership,
# is the same on every run.
SEED = 42

print("Loading dataset...")
df = pd.read_excel("./MachineLearning-Dataset-V1.xlsx")
print("Done...")

# Train (60%), validation (20%) and test (20%) datasets.
# The shuffled frame is sliced with .iloc instead of np.split: np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
shuffled_df = df.sample(frac=1, random_state=SEED)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train_df = shuffled_df.iloc[:train_end]
valid_df = shuffled_df.iloc[train_end:valid_end]
test_df = shuffled_df.iloc[valid_end:]

# Only the training split is oversampled; validation and test keep their
# original class balance.
# NOTE(review): each call below standardises its split independently, so the
# three splits are scaled with different statistics.
train, X_train, y_train = scale_dataset(train_df, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid_df, oversample=False)
test, X_test, y_test = scale_dataset(test_df, oversample=False)

In [None]:
# Grid search over network width, dropout, learning rate and batch size.
# The configuration with the lowest validation loss is kept.
least_val_loss = float("inf")
least_loss_model = None
least_loss_history = None
least_loss_params = []

epochs = 100
for num_nodes in [16, 32, 64]:
    for dropout_prob in [0, 0.2]:
        for lr in [0.001, 0.005, 0.1]:
            for batch_size in [32, 64, 128]:
                print(f"nodes: {num_nodes} , dropout: {dropout_prob}, learning rate (lr): {lr}, batch size: {batch_size}")
                model, history = train_model(X_train, y_train, X_valid, y_valid, num_nodes, dropout_prob, lr, batch_size, epochs)
                plot_history(history)
                # evaluate() returns [loss, accuracy] in that order (the model
                # is compiled with metrics=["accuracy"]), so the loss is the
                # FIRST element. Selection uses the validation split; the test
                # split stays untouched for the final, unbiased evaluation.
                val_loss, _ = model.evaluate(X_valid, y_valid)
                if val_loss < least_val_loss:
                    least_val_loss = val_loss
                    least_loss_model = model
                    least_loss_history = history
                    # Order: [num_nodes, dropout_prob, lr, batch_size]
                    least_loss_params = [num_nodes, dropout_prob, lr, batch_size]

In [None]:
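# Least loss (recorded from a previous run of the grid search)
#   Optimizer: Adadelta
#       Loss:     0.12441471219062805
#       Params: [16, 0.2, 0.001, 128]   (order: num_nodes, dropout_prob, lr, batch_size)
# NOTE(review): confirm which quantity "Loss" refers to. If it was taken from the
# second value returned by model.evaluate(), then with metrics=["accuracy"] it is
# the accuracy metric rather than the loss.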