In [5]:
import os
import pandas as pd
import shutil
import numpy as np

In [6]:
# Gather all stats into one DataFrame: the folder stats/{dataset} holds the CSV files, each with 6 columns: dataset, n_layer, n_neuron, fold, val_loss, time
def gather_stats(dataset, fold):
    """Collect the stats files of one fold into a single DataFrame.

    Reads every file in ``stats/{dataset}`` whose name ends with
    ``fold{fold}.csv``. Each file is parsed as a header-less CSV and its
    columns are labelled ``dataset, n_layer, n_neuron, fold, val_loss, time``.

    Files are visited in sorted name order so that the row order of the
    result is reproducible; ``os.listdir`` on its own returns the entries
    in arbitrary order.

    Args:
        dataset: Name of the sub-folder of ``stats`` to scan.
        fold: Fold identifier used to build the ``fold{fold}.csv`` suffix.

    Returns:
        A pandas DataFrame holding the rows of all matching files, with a
        fresh 0-based index.

    Raises:
        FileNotFoundError: If ``stats/{dataset}`` is not an existing folder.
        ValueError: If no file in the folder matches the suffix.
    """
    columns = ["dataset", "n_layer", "n_neuron", "fold", "val_loss", "time"]
    stats_path = f"stats/{dataset}"
    suffix = f"fold{fold}.csv"

    # isdir (rather than exists) also rejects a plain file sitting at that path.
    if not os.path.isdir(stats_path):
        raise FileNotFoundError(f"The folder '{stats_path}' does not exist.")

    frames = []
    for file_name in sorted(os.listdir(stats_path)):
        file_path = os.path.join(stats_path, file_name)
        if file_name.endswith(suffix) and os.path.isfile(file_path):
            frames.append(pd.read_csv(file_path, header=None, names=columns))

    if not frames:
        raise ValueError(f"No valid CSV files ending with 'fold{fold}.csv' found in '{stats_path}'.")

    return pd.concat(frames, ignore_index=True)

def copy_and_rename_file(source_path, destination_folder, new_name):
    """Copy ``source_path`` into ``destination_folder`` under the name ``new_name``.

    The destination folder (including missing parents) is created when it
    does not exist yet. The source file is left in place.

    Args:
        source_path: Path of the existing file to copy.
        destination_folder: Folder that receives the copy. An empty string
            means the current working directory.
        new_name: File name of the copy inside ``destination_folder``.

    Raises:
        FileNotFoundError: If ``source_path`` is not an existing file.
    """
    if not os.path.isfile(source_path):
        raise FileNotFoundError(f"The file '{source_path}' does not exist.")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    # The guard skips makedirs for '', which it cannot create; joining '' with
    # new_name below already resolves to the current directory.
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)

    shutil.copy(source_path, os.path.join(destination_folder, new_name))

In [7]:
# Root folder that holds one sub-folder per dataset.
folder_path = '../../data'

# os.listdir returns entries in arbitrary order; sort the names so the
# datasets are always processed in the same order.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)

In [8]:
# For every dataset and fold, find the configuration with the lowest
# validation loss and copy its cross-validation predictions to
# predictions/{dataset}.{fold}.csv.
for dataset in datasets:
    folds_df = pd.read_csv(os.path.join(folder_path, dataset, 'folds.csv'))
    # np.unique already returns the distinct fold ids in ascending order.
    for fold in np.unique(folds_df['fold']):
        stats_df = gather_stats(dataset, fold)
        # Stable sort: when several rows share the lowest val_loss, the one
        # that comes first in stats_df wins instead of an arbitrary one.
        best_row = stats_df.sort_values('val_loss', kind='mergesort').iloc[0]
        n_layer, n_neuron = best_row[['n_layer', 'n_neuron']].values

        copy_and_rename_file(
            f'predictions_cv/{dataset}/{n_layer}layers_{n_neuron}neurons_fold{fold}.csv',
            'predictions',
            f'{dataset}.{fold}.csv',
        )

auto93 1 2 64
auto93 2 2 64
auto93 3 2 32
auto93 4 1 1
auto93 5 1 1
autohorse 1 1 8
autohorse 2 2 8
autohorse 3 2 4
autohorse 4 2 8
autohorse 5 2 8
autompg 1 2 32
autompg 2 2 32
autompg 3 2 16
autompg 4 2 32
autompg 5 2 32
autoprice 1 1 16
autoprice 2 2 16
autoprice 3 2 4
autoprice 4 1 8
autoprice 5 1 8
baskball 1 1 16
baskball 2 1 16
baskball 3 1 8
baskball 4 1 16
baskball 5 1 8
bodyfat 1 2 64
bodyfat 2 2 64
bodyfat 3 1 16
bodyfat 4 1 32
bodyfat 5 1 16
breasttumor 1 2 4
breasttumor 2 2 8
breasttumor 3 2 8
breasttumor 4 2 4
breasttumor 5 2 4
cholesterol 1 2 16
cholesterol 2 2 16
cholesterol 3 2 64
cholesterol 4 2 16
cholesterol 5 1 1
cleveland 1 1 16
cleveland 2 2 8
cleveland 3 1 16
cleveland 4 2 16
cleveland 5 1 16
cloud 1 2 8
cloud 2 2 8
cloud 3 2 8
cloud 4 1 32
cloud 5 1 16
cpu 1 2 64
cpu 2 2 64
cpu 3 1 32
cpu 4 2 64
cpu 5 2 64
echomonths 1 1 32
echomonths 2 2 64
echomonths 3 2 64
echomonths 4 1 32
echomonths 5 1 2
elusage 1 1 32
elusage 2 2 64
elusage 3 2 64
elusage 4 1 16
elusage 