# Import

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras_tuner as kt
import math
import datetime, os
import keras
import tensorflow as tf
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from kernels.gaussian_kernel_regression import gaussian_kernel_regression
from functions.common_function import *
from functions.build_tuner_model import build_tuner_model
from functions.dataset_interpolation import dataset_interpolation_own
from scipy.stats import pearsonr


'''Enabling plotting of graphs just below the plotting commands'''
%matplotlib inline
'''Enabling the disply of all rows and columns within the dataframe'''
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Constant

In [None]:
# --- Dataset layout -------------------------------------------------------
# Number of feature columns once the categorical columns have been converted
# (this is the width used to slice the normalised frames into x / y).
num_feature = 8
# Positional indices of the categorical columns in the raw dataset.
cat_col = [4, 5]
# Number of feature columns in the raw (unconverted) dataset.
num_ori_feature = num_feature - len(cat_col)
# Number of target columns that follow the features.
num_target = 3

# --- Interpolation / training settings ------------------------------------
# Bandwidth handed to dataset_interpolation_own.
bandwidth = 100
# Epoch budget handed to the tuner search.
num_epochs = 10000
# Number of cross-validation folds (one tuned model is saved per fold).
num_folds = 4
# Used both as the KerasTuner project name and as the model sub-directory.
directory_name = "Tuning_Counter_Choudhury_Method_Own_Gaussian_Kernel"

# One row per raw dataset column (6 features followed by 3 targets).
# NOTE(review): presumably lower/upper bounds and a reference value per column,
# with NaN meaning "no reference" for the targets -- confirm against
# dataset_interpolation_own, which is the only consumer visible here.
limit = pd.DataFrame(
    {
        'lower': [303, 20, 0, 2, 6, 1.5, 122, 1236, 14],
        'higher': [840, 44, 17, 5, 8, 2, 408, 3240, 101],
        'ref': [530, 40, 14, 3.2, 6, 1.8, np.nan, np.nan, np.nan],
    }
)

# Load the raw dataset (path is relative to the notebook's working directory).
dataset = pd.read_csv("Dataset/Choudhury_Dataset.csv")

# Implementation

In [None]:
# Split BEFORE interpolating so the interpolation only ever sees training rows
# (avoids leaking test information into the training data).
# NOTE(review): 0.2 is presumably the test fraction -- confirm in tt_split.
x_train, x_test, y_train, y_test = tt_split(dataset, 0.2, num_feature, num_target)

# Re-join features and targets so each split is a single table again.
train_dataset = merge_x_y(x_train, y_train)
test_dataset = merge_x_y(x_test, y_test)

# Interpolate the training split only.
interpolated_dataset = dataset_interpolation_own(
    train_dataset, num_ori_feature, num_target, limit, bandwidth
)

# Convert the categorical columns into their binary representation.
# NOTE(review): the training categories are taken from the interpolated frame
# while the test categories are taken from the full raw dataset. If the two
# differ in content or in order of first appearance, the encodings may not line
# up between train and test -- verify against convert_cat.
train_categories = [interpolated_dataset.iloc[:, col].unique() for col in cat_col]
test_categories = [dataset.iloc[:, col].unique() for col in cat_col]
converted_dataset = convert_cat(
    interpolated_dataset, cat_col, num_ori_feature, num_target, train_categories
)
converted_test_dataset = convert_cat(
    test_dataset, cat_col, num_ori_feature, num_target, test_categories
)

# Min-max normalisation: the scaler is fitted on the training data only and the
# same fitted scaler is then applied to the test data.
scaler = MinMaxScaler()
column_names = get_col_names(converted_dataset)
normalised_train_dataset = pd.DataFrame(
    scaler.fit_transform(converted_dataset), columns=column_names
)
normalised_test_dataset = pd.DataFrame(
    scaler.transform(converted_test_dataset), columns=column_names
)

# Features occupy the first num_feature columns; the targets follow directly.
feature_cols = slice(0, num_feature)
target_cols = slice(num_feature, num_feature + num_target)
x_train = normalised_train_dataset.iloc[:, feature_cols]
y_train = normalised_train_dataset.iloc[:, target_cols]
x_test = normalised_test_dataset.iloc[:, feature_cols]
y_test = normalised_test_dataset.iloc[:, target_cols]

# K-fold cross-validation over the normalised training set. For every fold a
# Bayesian hyper-parameter search is run with the held-out part of the fold as
# validation data, and the best resulting model is saved as model_<fold_no>.
# The folds are shuffled without a fixed seed, so splits differ between runs.
kfold = KFold(n_splits=num_folds, shuffle=True)

for fold_no, (train, test) in enumerate(kfold.split(x_train, y_train), start=1):
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # overwrite=True: every fold starts a fresh search instead of resuming the
    # tuner state left behind by the previous fold under the same project name.
    tuner = kt.BayesianOptimization(
        build_tuner_model,
        objective='val_loss',
        max_trials=10,
        directory='keras_tuner',
        project_name=directory_name,
        overwrite=True,
    )
    tuner.search_space_summary()
    tuner.search(
        x_train.iloc[train],
        y_train.iloc[train],
        epochs=num_epochs,
        validation_data=(x_train.iloc[test], y_train.iloc[test]),
    )
    tuner.results_summary()

    # get_best_models() returns models (not hyper-parameters); keep the top one.
    best_model = tuner.get_best_models()[0]
    # Backslashes are escaped explicitly: "\{" and "\m" are invalid escape
    # sequences. The resulting path text is unchanged.
    best_model.save(f"Model\\{directory_name}\\model_{fold_no}")
    print("Saved model to disk")

# Loading the Models and Evaluating Them

In [None]:
# Evaluate every per-fold model on the held-out test set and collect its loss
# (mean absolute error) in `results`, ordered by fold number.
results = []
# One model was saved per fold, so iterate over num_folds rather than a
# hard-coded count.
for model_no in range(1, num_folds + 1):
    # Load the best model found for this fold. Backslashes are escaped
    # explicitly ("\{" and "\m" are invalid escape sequences); the path text
    # itself is unchanged.
    loaded_model = keras.models.load_model(f"Model\\{directory_name}\\model_{model_no}")

    print("Loaded model from disk")

    # Re-compile so that evaluate() reports MAE as the loss (result[0]),
    # followed by MSE and RMSE as additional metrics.
    loaded_model.compile(
        loss='MeanAbsoluteError',
        optimizer='SGD',
        metrics=[
            tf.keras.metrics.MeanSquaredError(),
            tf.keras.metrics.RootMeanSquaredError(),
        ],
    )
    result = loaded_model.evaluate(x_test, y_test, batch_size=128)
    print(f"Mean Absolute Error for model {model_no}(Loss): ", result[0])
    results.append(result[0])

# Per-fold scores followed by their mean and standard deviation.
print('------------------------------------------------------------------------')
print('Score per fold')
for model_no, mae in enumerate(results, start=1):
    print('------------------------------------------------------------------------')
    print(f'> Iteration {model_no} - MAE: {mae}')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> MAE: {np.mean(results)} - Standard Deviation: {np.std(results)}')
print('------------------------------------------------------------------------')

# Visualisation of Predictions using Best Model from Cross Validation

In [None]:
# Pick the fold model with the lowest test MAE (model files are numbered from 1).
best_model_index = results.index(min(results)) + 1

# Load that model. Backslashes are escaped explicitly ("\{" and "\m" are
# invalid escape sequences); the path text itself is unchanged.
loaded_model = keras.models.load_model(f"Model\\{directory_name}\\model_{best_model_index}")

target_names = get_col_names(y_test)
prediction = pd.DataFrame(loaded_model.predict(x_test), columns=target_names)

# Min/max used to undo the normalisation of the targets. They must be the
# values the fitted MinMaxScaler actually used (learned from the interpolated
# training data), not the min/max of the raw dataset; otherwise the rescaled
# values are wrong whenever the two ranges differ.
target_columns = slice(num_feature, num_feature + num_target)
min_y = scaler.data_min_[target_columns].tolist()
max_y = scaler.data_max_[target_columns].tolist()

corr_list = []
# Tabulate expected vs. predicted target values for every test element.
for i in range(len(x_test)):
    # Rescale the normalised values back to their original units.
    expected = pd.DataFrame(inverse_transform(y_test.iloc[i].to_list(), max_y, min_y))
    predicted = pd.DataFrame(inverse_transform(prediction.iloc[i].to_list(), max_y, min_y))

    comparison_df = pd.concat([expected, predicted], axis=1)
    comparison_df.columns = ['Expected', 'Predicted']
    comparison_df.index = target_names
    display(comparison_df.style.set_caption(f"Element {i + 1}"))

    # Correlation between the expected and predicted targets of this element.
    # NOTE(review): each coefficient is computed from only num_target points.
    corr, _ = pearsonr(expected.iloc[:, 0].tolist(), predicted.iloc[:, 0].tolist())
    corr_list.append(corr)

# Per-element correlations followed by their mean and standard deviation.
print('------------------------------------------------------------------------')
print('Score per test element')
for element_no, corr in enumerate(corr_list, start=1):
    print('------------------------------------------------------------------------')
    print(f'> Element {element_no} - Pearson Correlation: {corr}')
print('------------------------------------------------------------------------')
print('Average scores for all test elements:')
print(f'> Average Pearsons Correlation: {np.mean(corr_list)} - Standard Deviation: {np.std(corr_list)}')
print('------------------------------------------------------------------------')