In [4]:
import pandas as pd
import os
from utility_functions import get_acc, add_row_to_csv

In [5]:
# Root directory that holds one sub-directory per dataset.
folder_path = 'training_data'

# Collect the name of every sub-directory found directly under folder_path.
datasets = []
for entry in os.listdir(folder_path):
    if os.path.isdir(os.path.join(folder_path, entry)):
        datasets.append(entry)

# The scan result above is immediately overridden: only 'systematic' is processed.
datasets = ['systematic']

# How many of the lowest-val_loss models are averaged per test fold.
n_best_model = 8

In [6]:
# For every dataset and every test fold: average the 'llda' predictions of the
# (up to) n_best_model MLP configurations with the lowest val_loss, score the
# averaged prediction with get_acc and record the result with add_row_to_csv.
for dataset in datasets:
    # Load the per-dataset inputs: the MLP report, the evaluation table and the fold table.
    cv_df = pd.read_csv(f"model/mlp/report_{dataset}.csv")
    evaluation_df = pd.read_csv(f'{folder_path}/{dataset}/evaluation.csv')
    fold_df = pd.read_csv(f'{folder_path}/{dataset}/folds.csv')

    for test_fold in sorted(fold_df['fold'].unique()):
        # Report rows that belong to this test fold.
        df_fold = cv_df[cv_df['test_fold'] == test_fold]

        # Evaluation rows whose sequenceID is assigned to this fold in folds.csv.
        fold_sequence_ids = fold_df[fold_df['fold'] == test_fold]['sequenceID']
        eval_df = evaluation_df[evaluation_df['sequenceID'].isin(fold_sequence_ids)]

        # Up to n_best_model rows with the smallest val_loss (fewer if the fold has fewer rows).
        top_rows = df_fold.nsmallest(n_best_model, 'val_loss')

        # Without any candidate model the average below would divide by zero;
        # fail early with a message that names the dataset and fold instead.
        if top_rows.empty:
            raise ValueError(
                f"No rows for test fold {test_fold} in model/mlp/report_{dataset}.csv; "
                "cannot average predictions."
            )

        # 'llda' prediction column of each selected model.
        pred_list = []

        for _, row in top_rows.iterrows():
            # NOTE(review): iterrows() yields each row as a Series, so integer
            # columns can be upcast to float when the report mixes int and float
            # columns; the values would then be formatted as e.g. '2.0' in the
            # file name below. Confirm this matches how the prediction files are named.
            n_layer = row['num_layers']
            layer_size = row['layer_size']
            # Load this model's prediction CSV; NaN in any column is replaced by 0.
            pred_df = pd.read_csv(
                f'model/mlp/predictions_all/{dataset}.{n_layer}layers_{layer_size}neurons_{test_fold}.csv'
            ).fillna(0)
            pred_list.append(pred_df['llda'])

        # Element-wise mean of the 'llda' columns. The Series are aligned on their
        # row index, not on sequenceID.
        # NOTE(review): assumes every prediction file lists the same rows in the
        # same order - TODO confirm.
        final_pred = sum(pred_list) / len(pred_list)

        # The frame loaded last in the loop carries the remaining columns; only
        # its 'llda' column is replaced by the averaged prediction.
        pred_df['llda'] = final_pred

        # Score the averaged prediction (get_acc comes from utility_functions).
        acc = get_acc(eval_df, pred_df)

        # Record method, fold and score in the per-dataset results CSV.
        # NOTE(review): presumably appends a row, so re-running this cell would
        # add duplicate rows - verify against add_row_to_csv.
        add_row_to_csv('acc_rate_csvs/' + dataset + '.csv', ['method', 'fold', 'acc'], ['mlp', test_fold, acc])