In [13]:
import os
import numpy as np
from typing import Dict, Union

def load_data_create_dict(input_dim: int, 
                          use_pseudorehearsal: bool , 
                          optimizer: str , 
                          trials: int , 
                          num_models: int) -> Dict[str, Union[np.ndarray, None]]:
    """
    Load data for specified trials and models from a structured directory and 
    return it as a dictionary.
    
    Parameters:
    ----------
    input_dim : int
        Input dimension.
    use_pseudorehearsal : bool
        Whether pseudo rehearsal is used.
    optimizer : str
        Type of optimizer ('sgd', etc.).
    trials : int
        Number of trials.
    num_models : int
        Number of models.

    Returns:
    -------
    dict
        A dictionary containing concatenated min and max distances and model 
        perturbations for each trial and model.
    """
    
    base_folder = f"results/input_dim_{input_dim}_{use_pseudorehearsal}_{optimizer}"

    # Initialize the dictionary with keys for min_distance, max_distance, and model perturbations
    data = {
        "min_distance": [],
        "max_distance": [],
        **{f"model_{j}_perturbations": [] for j in range(num_models)}
    }

    for i in range(trials):
        trial_folder = f"{base_folder}/trial_{i}"

        paths = {
            "min_distance": f"{trial_folder}/distances/min_distances.npy",
            "max_distance": f"{trial_folder}/distances/max_distances.npy",
            **{f"model_{j}_perturbations": f"{trial_folder}/perturbations/model_{j}/absolute_perturbation.npy" for j in range(num_models)}
        }

        for key, path in paths.items():
            if os.path.exists(path):
                data[key].append(np.load(path))
            else:
                print(f"Warning: {path} not found.")

    # Convert lists of numpy arrays to a single concatenated numpy array
    for key, value in data.items():
        data[key] = np.concatenate(value, axis=0).flatten() if value else None
                
    return data

def save_aggregated_data(input_dim: int, 
                         use_pseudorehearsal: bool, 
                         optimizer: str, 
                         trials: int, 
                         num_models: int) -> None:
    """
    Save aggregated data to a numpy file for specified parameters.

    The function aggregates data based on the provided parameters and 
    saves it in a structured directory named 'aggregated_results'.

    Parameters:
    ----------
    input_dim : int
        Input dimension.
    use_pseudorehearsal : bool
        Whether pseudo rehearsal is used.
    optimizer : str
        Type of optimizer ('sgd', etc.).
    trials : int
        Number of trials. Aggregation is done over 0 to this parameter the (max) number of trials
    num_models : int
        Number of models.

    Returns:
    -------
    None
    """
    
    base_folder = f"results/input_dim_{input_dim}_{use_pseudorehearsal}_{optimizer}"

    # Save the data
    save_folder = "aggregated_results"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_path = f"{save_folder}/input_dim_{input_dim}_{use_pseudorehearsal}_{optimizer}.npy"
    
    # Fetch the aggregated data
    data = load_data_create_dict(input_dim, use_pseudorehearsal, optimizer, trials, num_models)
    
    # Save the data to the specified path
    np.save(save_path, data)


In [16]:
def load_all_aggregated_data() -> dict:
    """Load all aggregated data for various configurations: input dimensions, pseudo rehearsal, and optimizer.
    
    Returns:
    - all_data (dict): A nested dictionary with keys as input dimensions, pseudorehearsal, and optimizer 
                       and values as the loaded data.
    """
    all_data = {}
    save_folder = "aggregated_results"
    
    # Get all files in the folder with the .npy extension
    files = [f for f in os.listdir(save_folder) if f.endswith(".npy")]

    # Extract the configuration details from the file name using regex
    pattern = re.compile(r"input_dim_(\d+)_(True|False)_(\w+).npy")

    for file in files:
        match = pattern.match(file)
        if match:
            dim = int(match.group(1))
            pseudo_rehearsal = True if match.group(2) == 'True' else False
            optimizer = match.group(3)
            
            if dim not in all_data:
                all_data[dim] = {}
            
            if pseudo_rehearsal not in all_data[dim]:
                all_data[dim][pseudo_rehearsal] = {}
            
            save_path = os.path.join(save_folder, file)
            data = np.load(save_path, allow_pickle=True).item()
            all_data[dim][pseudo_rehearsal][optimizer] = data

    return all_data 

# Example usage:
loaded_data_dict = load_all_aggregated_data()
#for dim, pseudo_rehearsal_data in loaded_data_dict.items():
#    for pseudo, optimizer_data in pseudo_rehearsal_data.items():
#        for optimizer, data in optimizer_data.items():
#            print(f"Data for dimension {dim}, pseudorehearsal: {pseudo}, optimizer: {optimizer} has keys: \n\n {data.keys()} \n\n")


NameError: name 're' is not defined

In [15]:
# here is an example of how the aggregate function runs:

max_num_trials = 3
trial_chunks = 1

optimizers = ['adam', 'sgd']
pseudorehearsals = [True, False]
input_dimensions = [x for x in range(1,3)]
trial_numbers = list(range(max_num_trials))

for optimizer in optimizers:
    for use_pseudorehearsal in pseudorehearsals:
        for input_dimension in input_dimensions:
            save_aggregated_data(input_dim=input_dimension,
                                   use_pseudorehearsal=use_pseudorehearsal,
                                   optimizer=optimizer,
                                   trials=max_num_trials,
                                   num_models=19)
            
            
# here is a flawed version of a procedure that loads the saved data. It does not correctly incorporate
# the True/False for pseudorehearsal or choice of optimizer. Can you please fix it?

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import re

def load_all_aggregated_data() -> dict:
    """Load all aggregated data for all available input dimensions.
    
    Returns:
    - all_data (dict): A dictionary with keys as input dimensions and values as the loaded data.
    """
    all_data = {}
    save_folder = "aggregated_results"
    
    # Get all files in the folder with the .npy extension
    files = [f for f in os.listdir(save_folder) if f.endswith(".npy")]

    # Extract the dimension from the file name using regex
    pattern = re.compile(r"data_input_dim_(\d+).npy")

    for file in files:
        match = pattern.match(file)
        if match:
            dim = int(match.group(1))
            save_path = os.path.join(save_folder, file)
            data = np.load(save_path, allow_pickle=True).item()
            all_data[dim] = data

    return all_data 

# Example usage:
loaded_data_dict = load_all_aggregated_data()
#for dim, data in loaded_data_dict.items():
#    print(f"Data for dimension {dim} has keys: \n\n {data.keys()} \n\n")


In [14]:
load_data_create_dict(input_dim=2, use_pseudorehearsal=True, optimizer='adam', trials=3, num_models=19)

{'min_distance': array([0.20778511, 0.08144092, 0.14693758, ..., 0.11785191, 0.00768984,
        0.31310558]),
 'max_distance': array([0.68519141, 0.24297593, 0.35712052, ..., 0.900284  , 0.41372856,
        0.49212958]),
 'model_0_perturbations': array([0.00190884, 0.22155362, 0.3145011 , ..., 0.08296591, 0.52713037,
        0.07892662], dtype=float32),
 'model_1_perturbations': array([0.05734289, 0.44299692, 0.23997849, ..., 0.07589984, 0.24677038,
        0.05254287], dtype=float32),
 'model_2_perturbations': array([0.00638894, 0.06607694, 0.28800416, ..., 0.01812072, 0.0257414 ,
        0.01433322], dtype=float32),
 'model_3_perturbations': array([0.41618338, 0.41618338, 0.41618338, ..., 0.08304958, 0.08304958,
        0.08304958], dtype=float32),
 'model_4_perturbations': array([0.0363976 , 0.34662816, 0.12458075, ..., 0.03423108, 0.05732498,
        0.02507427], dtype=float32),
 'model_5_perturbations': array([0.41618338, 0.41618338, 0.41618338, ..., 0.08304958, 0.08304958,
     