In [1]:
import pandas as pd
import math
import os
import numpy as np
from collections import defaultdict

In [2]:
def chose_best_architect(cv_df, test_fold, n_features):
    cv_df_fold_features = cv_df[(cv_df['fold'] == test_fold) & (cv_df['n_features'] == n_features)]
    cv_df_fold_features = cv_df_fold_features.sort_values(by='val_acc', ascending=False)
    cv_df_fold_features = cv_df_fold_features[cv_df_fold_features['val_acc'] == cv_df_fold_features.iloc[0]['val_acc']]
    best_n_layer = math.ceil(cv_df_fold_features['n_layer'].mean())
    best_layer_size = math.ceil(cv_df_fold_features['layer_size'].mean())
    best_layer_size = 2**(int(np.log2(best_layer_size)))
    return best_n_layer, best_layer_size

In [3]:
def get_count_chosen_mlp(cv_df):
    # Initialize a defaultdict to count occurrences
    data = defaultdict(lambda: defaultdict(int))

    # Assuming you have a function chose_best_architect that returns best_n_layer and best_layer_size
    for test_fold in range(1, 7):
        for n_features in [1, 2, 4, 117]:
            best_n_layer, best_layer_size = chose_best_architect(cv_df, test_fold, n_features)
            data[(best_n_layer, best_layer_size)][n_features] += 1

    # Include all combinations of n_layer from 1 to 4 and layer_size from 2 to 512
    for n_layer in range(1, 5):
        for layer_size in [2, 4, 8, 16, 32, 64, 128, 256, 512]:
            if (n_layer, layer_size) not in data:
                data[(n_layer, layer_size)] = {1: 0, 2: 0, 4: 0, 117: 0}

    # Convert the defaultdict into a DataFrame
    df = pd.DataFrame(data).fillna(0).astype(int).T

    # Sort the DataFrame by n_layer
    df = df.sort_index(level=0)

    return df

In [4]:
# Initialize an empty list to hold DataFrames
dfs = []

# Iterate over datasets
for dataset in os.listdir("training_data"):
    # Read CSV file
    cv_df = pd.read_csv('acc_rate_csvs/' + dataset + '_cv.csv')
    # Process DataFrame
    df = get_count_chosen_mlp(cv_df)
    # Append processed DataFrame to the list
    dfs.append(df)

# Concatenate DataFrames along columns
concatenated_df = pd.concat(dfs, axis=1)

# Remove rows with all zeros
concatenated_df = concatenated_df.loc[(concatenated_df != 0).any(axis=1)]

In [5]:
concatenated_df

Unnamed: 0,Unnamed: 1,1,2,4,117,1.1,4.1,2.1,117.1,1.2,2.2,4.2,117.2
1,2,0,0,0,2,0,0,0,0,0,1,0,1
1,4,0,1,0,0,0,0,0,0,0,0,0,1
1,16,0,0,0,0,0,0,0,0,0,1,0,0
1,32,0,0,0,0,0,1,1,1,0,0,0,0
1,64,0,0,0,0,1,1,0,1,0,0,0,0
1,128,0,0,0,0,1,0,0,0,0,0,0,0
1,256,0,0,0,0,1,0,0,1,0,0,0,0
1,512,0,0,0,0,0,1,0,0,0,0,0,0
2,2,1,2,3,2,0,0,0,0,1,1,3,3
2,4,0,0,1,0,0,0,0,0,1,0,2,1
