In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from mlp_utils import mlp_train_and_predict

In [2]:
# Label for this run; it is the first component of the prediction CSV file name written later
category = 'proposed'

In [3]:
# Sub-directory of ../../training_data that the CSV files are read from
dataset = 'detailed'

# Fold held out as the test set; every other fold is used for training
test_fold = 1

# Feature columns selected from features.csv and passed to the model
chosen_feature = [
    'loglog_count',
    'log_variance',
    'log_range_value',
    'loglog_sum_diff',
]

In [4]:
# Read the fold assignment, the selected feature columns and the targets for the chosen dataset
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
features_df = pd.read_csv(f'../../training_data/{dataset}/features.csv')
# Keep only the identifier plus the chosen feature columns, in that order
features_df = features_df[['sequenceID', *chosen_feature]]
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

In [5]:
# Sequences whose fold equals test_fold form the test set; all remaining folds are used for training
test_ids = folds_df.loc[folds_df['fold'] == test_fold, 'sequenceID']
train_ids = folds_df.loc[folds_df['fold'] != test_fold, 'sequenceID']

# Restrict features (train and test) and targets (train only) to the matching sequence IDs
features_df_test = features_df.loc[features_df['sequenceID'].isin(test_ids)]
features_df_train = features_df.loc[features_df['sequenceID'].isin(train_ids)]
target_df_train = target_df.loc[target_df['sequenceID'].isin(train_ids)]

In [6]:
def train_model(suffix, target_column):
    """Fit the MLP on the training folds for one target limit and predict the test fold.

    Reads the notebook-level frames ``features_df_train``, ``features_df_test``
    and ``target_df_train`` as well as the ``chosen_feature`` list.

    Parameters
    ----------
    suffix : str
        ``'lower'`` drops training rows whose target equals ``-inf``; any other
        value drops training rows whose target equals ``+inf``.
    target_column : str
        Column of ``target_df_train`` used as the regression target.

    Returns
    -------
    tuple
        ``(y_pred, best_params_)`` exactly as returned by ``mlp_train_and_predict``.

    Raises
    ------
    pandas.errors.MergeError
        If ``sequenceID`` is duplicated in the training features or targets.
    """
    # Pair every feature row with its target through sequenceID instead of by
    # row position: features.csv and target.csv are loaded independently, so a
    # positional pairing would silently mislabel rows if their orders differ.
    # An inner merge keeps the row order of the left (features) frame.
    train_df = features_df_train.merge(
        target_df_train[['sequenceID', target_column]],
        on='sequenceID',
        how='inner',
        validate='one_to_one',
    )

    X_train = train_df[chosen_feature]
    y_train = train_df[target_column]
    X_test = features_df_test[chosen_feature]

    # Drop the rows whose target is the infinite sentinel for this limit
    excluded_value = -np.inf if suffix == 'lower' else np.inf
    valid_rows = y_train != excluded_value
    X_train = X_train[valid_rows]
    y_train = y_train[valid_rows].to_numpy()

    y_pred, best_params_ = mlp_train_and_predict(X_train, X_test, y_train)
    return y_pred, best_params_

In [7]:
# Fit one model per target limit: min.log.lambda (lower) and max.log.lambda (upper)
lda_lower_pred, best_params_lower = train_model(suffix='lower', target_column='min.log.lambda')
lda_upper_pred, best_params_upper = train_model(suffix='upper', target_column='max.log.lambda')

# The final prediction is the element-wise midpoint of the two predicted limits
pred_lldas = (lda_lower_pred + lda_upper_pred) / 2

In [8]:
# One row per test sequence: its sequenceID next to the averaged prediction
lldas_df = pd.DataFrame(
    data=list(zip(features_df_test['sequenceID'], pred_lldas)),
    columns=['sequenceID', 'llda'],
)

In [9]:
# Save lldas_df (built in the previous cell) as
# predictions/<category>.<dataset>.<test_fold>.<number of features>.csv
output_path = Path('predictions') / f'{category}.{dataset}.{test_fold}.{len(chosen_feature)}.csv'
# to_csv does not create missing directories, so make sure the folder exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
lldas_df.to_csv(output_path, index=False)

In [10]:
# Parameters reported by mlp_train_and_predict for the lower-limit (min.log.lambda) model
best_params_lower

{'hidden_layer_sizes': (512,), 'max_iter': 1000}

In [11]:
# Parameters reported by mlp_train_and_predict for the upper-limit (max.log.lambda) model
best_params_upper

{'hidden_layer_sizes': (64,), 'max_iter': 1000}