In [None]:
# imports
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, accuracy_score, \
                            roc_auc_score, precision_score, f1_score, recall_score, \
                            mean_squared_error, mean_absolute_error, log_loss
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.python.keras.utils.vis_utils import plot_model
from tensorflow.python.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from ast import literal_eval as make_tuple
from tensorflow.keras import Sequential
from sklearn.pipeline import Pipeline
from tensorflow.keras import metrics
from datetime import timedelta
import tensorflow as tf
from time import time
import pandas as pd
import numpy as np
import sys

In [None]:
# Where this notebook writes its output, and where the grid-search summary
# it consumes is stored (both relative to the notebook's directory).
results_dir = "./"
best_results_path = "../grid_search_results/neural_network_params.csv"

# model_utils is a project-local module in ../../utils/; that directory has to
# be on the import path before the import statement runs.
sys.path.insert(1, "../../utils/")
import model_utils

# Grid-search summary table, loaded from the CSV named above.
best_results = pd.read_csv(best_results_path)

In [None]:
# Pick the three best grid-search results that use *different* hyper-parameter
# sets. Rows are ranked by accuracy, with F1 and then ROC score breaking ties.
# De-duplication happens BEFORE truncating to three rows: truncating first
# would return fewer than three configurations whenever the top-ranked rows
# share the same `epoch_best_params`.
metric_priority = ["model_accuracy", "f1_score", "roc_score"]
top3_best_params = (
    best_results
    .sort_values(by=metric_priority, ascending=False)
    # keep only the best-ranked row of each distinct parameter set
    .drop_duplicates(subset=["epoch_best_params"])
    .head(3)
)

# list top 3 results in which epoch_best_params are different
top3_best_params

In [None]:
# Re-evaluate each selected hyper-parameter set with repeated stratified
# k-fold cross-validation and collect one metrics record per (run, fold).

# Class labels of the binary target. Column i of the softmax output is the
# predicted probability of class i.
# NOTE(review): assumes EVADIDO is encoded as 0/1 - confirm against the datasets.
CLASS_LABELS = [0, 1]
# Number of times the whole k-fold procedure is repeated per parameter set.
N_RUNS = 10


def build_classifier(init, optimizer, dropout):
    """Build and compile a fresh, untrained classifier.

    Architecture: two 8-unit ReLU hidden layers, each followed by dropout,
    and a 2-unit softmax output layer.

    Parameters
    ----------
    init : kernel initializer applied to both hidden layers.
    optimizer : optimizer passed to ``compile``.
    dropout : dropout rate used after each hidden layer.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    # Many models are created in the loops below; clear Keras' global state so
    # discarded models do not pile up in memory.
    tf.keras.backend.clear_session()

    model = Sequential()
    model.add(Dense(units=8, kernel_initializer=init, activation='relu'))
    model.add(Dropout(rate=dropout))
    model.add(Dense(units=8, kernel_initializer=init, activation='relu'))
    model.add(Dropout(rate=dropout))
    model.add(Dense(units=2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[
        "acc",
        "mse",
        metrics.Precision(),
        metrics.Recall(),
    ])
    return model


all_predictions = []
for _, best_row in top3_best_params.iterrows():
    # load correct database in which best results were saved
    dbname = best_row["database"]
    dataframe_path = f"../../pre_processing/datasets/preprocessed_data/{dbname}.csv"
    df = pd.read_csv(dataframe_path, sep=",")

    # Drop RA column if exists
    if "RA" in df.columns:
        print("Removing column RA")
        df = df.drop(columns="RA")

    # Plain NumPy arrays, so the fold indices produced by the splitter are
    # applied positionally rather than as pandas index labels.
    X = df.drop(columns="EVADIDO").values
    y = df["EVADIDO"].values

    n_splits = int(best_row["n_splits"])
    # Hyper-parameters are stored in the CSV as the string form of a dict.
    best_param = best_row["epoch_best_params"]
    best_param_dict = make_tuple(best_param)

    epochs = best_param_dict['clf__epochs']
    init = best_param_dict['clf__init']
    batch_size = best_param_dict['clf__batch_size']
    optimizer = best_param_dict['clf__optimizer']
    dropout = best_param_dict['clf__dropout']

    for run in range(1, N_RUNS + 1):
        # A new splitter per run, seeded with the run number: every run gets a
        # different partition, and the partitions are the same on re-execution.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=run)

        for index, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(f"{run}'st run for split {index}")
            X_train, X_test = X[train_index], X[test_index]
            # Only the training target needs one-hot encoding (for the
            # categorical cross-entropy loss); metrics use integer labels.
            y_train = to_categorical(y[train_index], num_classes=len(CLASS_LABELS))
            y_test = y[test_index].astype(int)

            # Fresh scaler and fresh model for every fold. Re-fitting a single
            # Keras model would continue from the weights learned on earlier
            # folds, whose training data contains this fold's test rows.
            estimator = Pipeline([("scl", StandardScaler()),
                                  ("clf", build_classifier(init, optimizer, dropout))])

            start_time = time()
            estimator.fit(X_train, y_train, clf__epochs=epochs, clf__batch_size=batch_size, clf__verbose=0)
            # The Keras model's predict returns the softmax class probabilities.
            probabilities = estimator.predict(X_test)
            end_time = time()
            total_time = timedelta(seconds=end_time - start_time)

            predictions = probabilities.argmax(axis=1)

            predict_data = {
                "database": dbname,
                "n_splits": n_splits,
                # Fixed labels keep the matrix 2x2 even if only one class is predicted.
                "confusion_matrix": confusion_matrix(y_test, predictions, labels=CLASS_LABELS),
                "classification_report": classification_report(y_test, predictions),
                "model_accuracy": accuracy_score(y_test, predictions),
                "f1_score": f1_score(y_test, predictions),
                "precision_score": precision_score(y_test, predictions),
                # ROC AUC and log loss are computed from probabilities, not
                # from thresholded labels.
                "roc_score": roc_auc_score(y_test, probabilities[:, 1]),
                "recall_score": recall_score(y_test, predictions),
                "log_loss": log_loss(y_test, probabilities, labels=CLASS_LABELS),
                "epoch_params": best_param,
                "mean_squared_error": mean_squared_error(y_test, predictions),
                "mean_absolute_error": mean_absolute_error(y_test, predictions),
                "train_test_duration": total_time,
                "train_size": len(X_train),
                "test_size": len(X_test)
            }

            all_predictions.append(predict_data)

# CSV Output results
model_utils.generate_output_csv(all_predictions, results_dir, "neural_network_results")
print("FINISHED")