In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

from utility_functions import get_acc, record, add_row_to_csv
from BIC import BIC
from MLP import mlp_evaluate

In [2]:
# Name of the dataset sub-directory; every path below is derived from it.
dataset = 'detailed'

# CSV files that hold the training data for the selected dataset.
fold_path = f'training_data/{dataset}/folds.csv'
inputs_path = f'training_data/{dataset}/inputs.csv'
outputs_path = f'training_data/{dataset}/outputs.csv'
evaluation_path = f'training_data/{dataset}/evaluation.csv'

# CSV file that the accuracy rows are appended to.
acc_rate_path = f'acc_rate/{dataset}.csv'

# Directory prefix for data frames written out as CSV.
output_df_path = f'record_dataframe/{dataset}/'

# Raw data frames, loaded once and shared by the cells below.
fold_df = pd.read_csv(fold_path)
inputs_df = pd.read_csv(inputs_path)
outputs_df = pd.read_csv(outputs_path)
evaluation_df = pd.read_csv(evaluation_path)

# Element-wise feature transformations.
# `log` is finite only for x > 0 and `loglog` only for x > 1
# (np.log(np.log(1)) is -inf, and smaller inputs give nan).
identity = lambda x: x
log      = lambda x: np.log(x)
loglog   = lambda x: np.log(np.log(x))

In [3]:
def get_fold_dfs(fold):
    """Split the module-level data frames by membership of ``fold``.

    Sequence IDs are looked up in ``fold_df``: those whose ``fold`` value
    equals ``fold`` select the "train" frames, all remaining IDs select the
    "test" frames. Rows are matched through the ``sequenceID`` column.

    NOTE(review): the single matching fold is returned as *train* and every
    other fold as *test*, which is the reverse of the usual k-fold
    convention -- confirm this is intended.

    Parameters
    ----------
    fold
        Value compared against the ``fold`` column of ``fold_df``.

    Returns
    -------
    tuple
        ``(train_inputs_df, train_outputs_df, train_eval_df,
        test_inputs_df, test_eval_df)``; no test outputs frame is returned.
    """
    fold_column = fold_df['fold']
    ids_in_fold = fold_df.loc[fold_column == fold, 'sequenceID']
    ids_elsewhere = fold_df.loc[fold_column != fold, 'sequenceID']

    def _select(frame, sequence_ids):
        # Keep the rows of `frame` whose sequenceID appears in `sequence_ids`.
        return frame[frame['sequenceID'].isin(sequence_ids)]

    return (
        _select(inputs_df, ids_in_fold),
        _select(outputs_df, ids_in_fold),
        _select(evaluation_df, ids_in_fold),
        _select(inputs_df, ids_elsewhere),
        _select(evaluation_df, ids_elsewhere),
    )
    

In [5]:
def process_combination(fold, n_layer, layer_size, feature_dict, normalize):
    """Fit and score one hyper-parameter combination for a single fold.

    Parameters
    ----------
    fold
        Fold identifier handed to ``get_fold_dfs``.
    n_layer
        Passed to ``mlp_evaluate`` as ``hidden_layers``.
    layer_size
        Passed to ``mlp_evaluate`` as ``hidden_size``.
    feature_dict : dict
        Must provide ``'chosen_feature'`` (list of feature names) and
        ``'f_engineer'`` (list of transformation callables).
    normalize
        Passed to ``mlp_evaluate`` unchanged.

    Returns
    -------
    list
        ``[fold, n_layer, layer_size, chosen_feature, is_f_engineer,
        normalize, acc]`` where ``is_f_engineer`` is 1 when at least one
        transformation differs from ``identity`` and 0 otherwise.
    """
    # Only the first three frames are used; the two "test" frames are discarded.
    train_inputs_df, train_outputs_df, train_eval_df, _, _ = get_fold_dfs(fold)
    chosen_feature = feature_dict['chosen_feature']
    f_engineer = feature_dict['f_engineer']

    # Look at every transformation, not just the first one, so that a mixed
    # list such as [identity, log] is still reported as engineered and an
    # empty list yields 0 instead of raising IndexError.
    is_f_engineer = int(any(transform is not identity for transform in f_engineer))

    # NOTE(review): predictions are produced for the same inputs the model is
    # fitted on and scored against train_eval_df, so `acc` measures fit on the
    # training split -- confirm that held-out evaluation is not intended.
    lldas_test_df = mlp_evaluate(
        input_train_df=train_inputs_df,
        output_train_df=train_outputs_df,
        inputs_val_df=train_inputs_df,
        hidden_layers=n_layer,
        hidden_size=layer_size,
        chosen_feature=chosen_feature,
        f_engineer=f_engineer,
        normalize=normalize
    )
    acc = get_acc(train_eval_df, lldas_test_df)
    return [fold, n_layer, layer_size, chosen_feature, is_f_engineer, normalize, acc]

In [6]:
# Search grid: folds 1..6, 1..3 hidden layers, layer widths 2..128 (powers of two).
fold_list = list(range(1, 7))
n_layer_list = list(range(1, 4))
layer_size_list = [2 ** exponent for exponent in range(1, 8)]

# Each feature set is listed twice: once with log / loglog transformations
# and once with the untransformed (identity) features.
feature_dict_list = [
    dict(chosen_feature=['length'], f_engineer=[loglog]),
    dict(chosen_feature=['length'], f_engineer=[identity]),
    dict(chosen_feature=['length', 'sd'], f_engineer=[loglog, log]),
    dict(chosen_feature=['length', 'sd'], f_engineer=[identity, identity]),
    dict(
        chosen_feature=['sd', 'range_value', 'length', 'sum_diff'],
        f_engineer=[log, log, loglog, log],
    ),
    dict(
        chosen_feature=['sd', 'range_value', 'length', 'sum_diff'],
        f_engineer=[identity, identity, identity, identity],
    ),
]

In [7]:
# linear
# Linear baseline: process_combination is called with n_layer=0, layer_size=1
# and normalize=0 for every fold / feature-set pair.
# The folds come from `fold_list` (rather than a hard-coded range) so this
# sweep always covers the same folds as the non-linear sweep.
linear_results = Parallel(n_jobs=-1)(
    delayed(process_combination)(
        fold, 0, 1, feature_dict, 0
    ) for fold in fold_list
    for feature_dict in feature_dict_list
)

# Write results to CSV, one row per evaluated combination
for row in linear_results:
    add_row_to_csv(acc_rate_path, ['fold', 'n_layer', 'layer_size', 'features', 'f_engineer', 'normalize', 'acc'], row)

In [8]:
# non linear
# One task per (fold, depth, width, feature set); the trailing 1 is the
# `normalize` argument of process_combination.
mlp_tasks = [
    delayed(process_combination)(fold, n_layer, layer_size, feature_dict, 1)
    for fold in fold_list
    for n_layer in n_layer_list
    for layer_size in layer_size_list
    for feature_dict in feature_dict_list
]
mlp_results = Parallel(n_jobs=-1)(mlp_tasks)

# Append every result row to the accuracy CSV
for row in mlp_results:
    add_row_to_csv(
        acc_rate_path,
        ['fold', 'n_layer', 'layer_size', 'features', 'f_engineer', 'normalize', 'acc'],
        row,
    )

In [9]:
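# Serial (non-parallel) version of the grid search, kept commented out for reference.
# NOTE: `normalize_list` is not defined in any visible cell.
# for fold in range(1, 7):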
#     for n_layer in n_layer_list:
#         for layer_size in layer_size_list:
#             for feature_dict in feature_dict_list:
#                 for normalize in normalize_list:
#                     train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df = get_fold_dfs(fold)
#                     chosen_feature = feature_dict['chosen_feature']
#                     f_engineer = feature_dict['f_engineer']
#                     is_f_engineer = 1
#                     if f_engineer[0] == identity:
#                         is_f_engineer = 0
#                     lldas_test_df = mlp_evaluate(
#                         input_train_df = train_inputs_df,
#                         output_train_df = train_outputs_df,
#                         inputs_val_df = train_inputs_df,
#                         hidden_layers = n_layer,
#                         hidden_size = layer_size,
#                         chosen_feature = chosen_feature,
#                         f_engineer = f_engineer,
#                         normalize = normalize,
#                         n_ites=1
#                         )
#                     row = [fold, n_layer, layer_size, chosen_feature, is_f_engineer, normalize, get_acc(train_eval_df, lldas_test_df)]
#                     add_row_to_csv(acc_rate_path, ['fold', 'n_layer', 'layer_size', 'features', 'f_engineer', 'normalize', 'acc'], row)

In [10]:
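# # BIC baseline (disabled; `fold_inputs_df` / `fold_eval_df` are not defined in any visible cell)
# lldas_test_df_BIC = BIC(fold_inputs_df)
# print(get_acc(fold_eval_df, lldas_test_df_BIC))
# recorded output: 85.10928961748634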

In [11]:
# # HYPER
# n_layer = 0
# layer_size = 1
# chosen_feature = ['length']
# f_engineer = [loglog]
# normalize = 0

# lldas_test_df_linear = mlp_evaluate(
#     input_train_df = fold_inputs_df,
#     output_train_df = fold_outputs_df,
#     inputs_val_df = fold_inputs_df,
#     hidden_layers = n_layer,
#     hidden_size = layer_size,
#     chosen_feature = chosen_feature,
#     f_engineer = f_engineer,
#     normalize = normalize
# )
# print(get_acc(fold_eval_df, lldas_test_df_linear))
# 92.62295081967213

In [12]:
# # HYPER
# n_layer = 0
# layer_size = 1
# chosen_feature = ['length', 'sd']
# f_engineer = [log, log]
# normalize = 0

# lldas_test_df_linear = mlp_evaluate(
#     input_train_df = fold_inputs_df,
#     output_train_df = fold_outputs_df,
#     inputs_val_df = fold_inputs_df,
#     hidden_layers = n_layer,
#     hidden_size = layer_size,
#     chosen_feature = chosen_feature,
#     f_engineer = f_engineer,
#     normalize = normalize
# )
# print(get_acc(fold_eval_df, lldas_test_df_linear))
# 95.08196721311475

In [13]:
# # HYPER
# n_layer = 0
# layer_size = 1
# chosen_feature = ['length', 'sd']
# f_engineer = [loglog, log]
# normalize = 0

# lldas_test_df_linear = mlp_evaluate(
#     input_train_df = fold_inputs_df,
#     output_train_df = fold_outputs_df,
#     inputs_val_df = fold_inputs_df,
#     hidden_layers = n_layer,
#     hidden_size = layer_size,
#     chosen_feature = chosen_feature,
#     f_engineer = f_engineer,
#     normalize = normalize
# )
# print(get_acc(fold_eval_df, lldas_test_df_linear))
# 95.08196721311475

In [14]:
# # HYPER
# n_layer = 1
# layer_size = 8
# chosen_feature = ['sd', 'range_value', 'length', 'sum_diff']
# f_engineer = [log, log, loglog, log]
# normalize = 1

# lldas_test_df_linear = mlp_evaluate(
#     input_train_df = fold_inputs_df,
#     output_train_df = fold_outputs_df,
#     inputs_val_df = fold_inputs_df,
#     hidden_layers = n_layer,
#     hidden_size = layer_size,
#     chosen_feature = chosen_feature,
#     f_engineer = f_engineer,
#     normalize = normalize
# )
# print(get_acc(fold_eval_df, lldas_test_df_linear))
# 96.58469945355192