In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

from utility_functions import get_acc, add_row_to_csv
from MLP import mlp_evaluate

In [2]:
# Name of the dataset to process. It is spliced into the 'training_data/',
# 'acc_rate/' and 'record_dataframe/' paths built in the next cell.
dataset = 'cancer'

In [3]:
# Locations of the four input tables for the selected dataset.
fold_path = f'training_data/{dataset}/folds.csv'
inputs_path = f'training_data/{dataset}/inputs.csv'
outputs_path = f'training_data/{dataset}/outputs.csv'
evaluation_path = f'training_data/{dataset}/evaluation.csv'

# CSV file that accuracy rates are written to.
acc_rate_path = f'acc_rate/{dataset}.csv'

# Directory prefix under which result dataframes are written as CSV.
output_df_path = f'record_dataframe/{dataset}/'

# Load the raw tables, in the same order as the paths above.
fold_df, inputs_df, outputs_df, evaluation_df = (
    pd.read_csv(csv_path)
    for csv_path in (fold_path, inputs_path, outputs_path, evaluation_path)
)

# Number of distinct values in the 'fold' column.
n_folds = fold_df['fold'].nunique()

# Element-wise feature transformations. `identity` doubles as the sentinel
# that later cells compare against to detect "no feature engineering".
# Note: `loglog` is only finite for inputs strictly greater than 1.
identity = lambda x: x
log = lambda x: np.log(x)
loglog = lambda x: np.log(np.log(x))

In [4]:
def get_fold_dfs(fold):
    """Split the module-level tables into training and test parts for one fold.

    Sequences whose ``fold`` value in ``fold_df`` equals ``fold`` are held out
    as the test set; every other sequence belongs to the training set. Rows of
    ``inputs_df``, ``outputs_df`` and ``evaluation_df`` are selected by
    matching their ``sequenceID`` against those two ID sets.

    The training inputs/outputs were previously selected with ``== fold``,
    which made them identical to the test fold (data leakage) and
    inconsistent with ``train_eval_df``. They now use the complement.

    Parameters
    ----------
    fold
        Value compared against ``fold_df['fold']`` to pick the held-out fold.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_inputs_df, train_outputs_df, train_eval_df,
        test_inputs_df, test_eval_df)``.
    """
    # Compute the two disjoint ID sets once instead of once per table.
    in_test_fold = fold_df['fold'] == fold
    test_ids = fold_df.loc[in_test_fold, 'sequenceID']
    train_ids = fold_df.loc[~in_test_fold, 'sequenceID']

    train_inputs_df = inputs_df[inputs_df['sequenceID'].isin(train_ids)]
    train_outputs_df = outputs_df[outputs_df['sequenceID'].isin(train_ids)]
    train_eval_df = evaluation_df[evaluation_df['sequenceID'].isin(train_ids)]
    test_inputs_df = inputs_df[inputs_df['sequenceID'].isin(test_ids)]
    test_eval_df = evaluation_df[evaluation_df['sequenceID'].isin(test_ids)]
    return train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df

In [5]:
def process_combination(fold, n_layer, layer_size, feature_dict, normalize):
    """Run ``mlp_evaluate`` for one hyper-parameter combination and score it.

    Parameters
    ----------
    fold
        Fold identifier forwarded to ``get_fold_dfs``.
    n_layer
        Forwarded to ``mlp_evaluate`` as ``hidden_layers``.
    layer_size
        Forwarded to ``mlp_evaluate`` as ``hidden_size``.
    feature_dict : dict
        Must provide ``'chosen_feature'`` and ``'f_engineer'``; both are
        forwarded to ``mlp_evaluate`` unchanged.
    normalize
        Forwarded to ``mlp_evaluate`` as ``normalize``.

    Returns
    -------
    list
        ``[fold, n_layer, layer_size, chosen_feature, is_f_engineer,
        normalize, acc]``, where ``is_f_engineer`` is 1 when the first entry
        of ``f_engineer`` is not ``identity`` (0 otherwise) and ``acc`` is the
        value returned by ``get_acc``.
    """
    # The training evaluation frame (third element) is not needed here.
    train_x, train_y, _train_eval, test_x, test_eval = get_fold_dfs(fold)

    features = feature_dict['chosen_feature']
    transforms = feature_dict['f_engineer']

    # Only the first transform is inspected to decide the flag.
    engineered_flag = int(transforms[0] != identity)

    predictions = mlp_evaluate(
        input_train_df=train_x,
        output_train_df=train_y,
        inputs_val_df=test_x,
        hidden_layers=n_layer,
        hidden_size=layer_size,
        chosen_feature=features,
        f_engineer=transforms,
        normalize=normalize,
    )

    score = get_acc(test_eval, predictions)
    return [fold, n_layer, layer_size, features, engineered_flag, normalize, score]

In [6]:
# Candidate numbers of hidden layers: 1, 2, 3.
n_layer_list = list(range(1, 4))

# Candidate hidden-layer widths: powers of two from 2 to 128.
layer_size_list = [2 ** exponent for exponent in range(1, 8)]

# Each feature set appears twice, in this order: first with its engineered
# transforms, then with `identity` applied to every chosen feature.
feature_dict_list = [
    {'chosen_feature': list(names), 'f_engineer': transforms}
    for names, engineered in (
        (['length'], [loglog]),
        (['length', 'sd'], [loglog, log]),
        (['sd', 'range_value', 'length', 'sum_diff'], [log, log, loglog, log]),
    )
    for transforms in (engineered, [identity] * len(names))
]

In [7]:
# One hand-picked configuration, run outside of `process_combination` so the
# intermediate frames stay available as notebook variables.
fold, n_layer, layer_size, normalize = 1, 1, 4, 1
feature_dict = feature_dict_list[0]

# The third element (training evaluation frame) is discarded into `_`.
(train_inputs_df, train_outputs_df, _,
 test_inputs_df, test_eval_df) = get_fold_dfs(fold)

chosen_feature, f_engineer = (
    feature_dict[key] for key in ('chosen_feature', 'f_engineer')
)

# 1 when the first transform is anything other than `identity`, else 0.
is_f_engineer = int(f_engineer[0] != identity)

lldas_test_df = mlp_evaluate(
    input_train_df=train_inputs_df, output_train_df=train_outputs_df,
    inputs_val_df=test_inputs_df,
    hidden_layers=n_layer, hidden_size=layer_size,
    chosen_feature=chosen_feature, f_engineer=f_engineer,
    normalize=normalize,
)

In [8]:
# Score the `mlp_evaluate` output from the previous cell against the test
# fold's evaluation table. `get_acc` comes from utility_functions; its exact
# metric is not visible here (presumably an accuracy rate — confirm).
acc = get_acc(test_eval_df, lldas_test_df)