In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Sanity check: report how many result CSV files each dataset folder holds.
model_optimization_root = os.path.join(os.getcwd(), "..", "Model_Optimization")
folder_path_BanknoteAuthentication = os.path.join(model_optimization_root, "Banknote")
folder_path_BreastCancer = os.path.join(model_optimization_root, "Breast_Cancer")
folder_path_MNIST = os.path.join(model_optimization_root, "MNIST")
folder_path_Audio = os.path.join(model_optimization_root, "Audio_MNIST")

folder_paths = [
    folder_path_BanknoteAuthentication,
    folder_path_BreastCancer,
    folder_path_MNIST,
    folder_path_Audio,
]

# A complete folder should contain
# (8 train_val files * 3 lr * 5 seeds) + (8 test_files * 3 lr * 5 seeds) + 5 AE_files = 245 files.
# The count is only reported; a strict check (assert on 245) is intentionally not enforced here.
for dataset_folder in folder_paths:
    csv_count = sum(1 for entry in os.listdir(dataset_folder) if entry.endswith('.csv'))
    print(f"Number of files in {dataset_folder} is {csv_count}")


In [None]:
def plot_specific_combination(folder_path, specific_lr, model, plot_loss=False):
    '''
    Plots the validation accuracy per seed and its average over all seeds for one
    model / learning-rate combination.

        Parameters:
                folder_path (string): Path to the folder containing the CSV files
                specific_lr (float): Learning rate whose runs should be plotted
                model (string): Value of the "List Name" column that identifies the model
                plot_loss (bool): Additionally plot the validation loss averaged over all seeds (default=False)
        Raises:
                ValueError: If no CSV file in folder_path matches both model and specific_lr
    '''
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # One (seed, validation accuracy curve, validation loss curve or None) entry per matching run
    runs = []
    model_dataset_name = None

    for file in csv_files:
        data = pd.read_csv(os.path.join(folder_path, file))

        # Run metadata (model, learning rate, seed, dataset) is taken from the first row
        if data["List Name"].iloc[0] != model:
            continue
        if data["LEARNING_RATE"].iloc[0] != specific_lr:
            continue

        model_dataset_name = data["Dataset"].iloc[0]
        validation_loss = data["Validation_Loss"].values if plot_loss else None
        runs.append((data["SEED"].iloc[0], data["Validation_Accuracy"].values, validation_loss))

    # Fail early with a clear message instead of drawing an empty figure and
    # crashing later on an undefined dataset name
    if not runs:
        raise ValueError(
            f"No CSV file in {folder_path!r} matches model {model!r} with learning rate {specific_lr!r}."
        )

    # Plot the seeds in ascending order so the legend is stable across runs
    runs.sort(key=lambda run: run[0])

    fig, ax = plt.subplots()

    # One accuracy curve per seed
    for seed, val_acc, _ in runs:
        ax.plot(val_acc, label=f"Seed {seed}, Validation Accuracy")

    # Epoch-wise average over all seeds
    acc_averages = np.mean([val_acc for _, val_acc, _ in runs], axis=0)
    ax.plot(acc_averages, label="Average Accuracy")

    if plot_loss:
        loss_averages = np.mean([val_loss for _, _, val_loss in runs], axis=0)
        ax.plot(loss_averages, label="Average Loss")

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss and Accuracy')
    ax.set_title(f'{model_dataset_name} - {model}: {specific_lr}')
    ax.legend()

    plt.show()

In [None]:
def best_combination(folder_path, model, round_numbers):
    '''
    Returns the learning rate with the highest mean test accuracy for one model.

    For every candidate learning rate the test accuracies of all seeds are averaged and a
    95% confidence interval (Student t) is computed; the learning rate with the highest
    mean wins.

        Parameters:
                folder_path (string): Path to the folder containing the CSV files
                model (string): Name of the model (value of the "List Name" column)
                round_numbers (bool): Round the numbers to 4 decimal places
        Returns:
                best_learning_rate (float): Best learning rate
                best_mean_test_acc (float): Best mean test accuracy
        Raises:
                ValueError: If no CSV file in folder_path belongs to model with one of the
                            candidate learning rates
    '''
    # Candidate learning rates
    learning_rates = [0.1, 0.01, 0.001]

    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Read every CSV exactly once (instead of once per learning rate) and keep the
    # metadata of the runs that belong to the requested model
    runs = []  # (learning rate, test accuracy, seed)
    model_dataset_name = None
    for file in csv_files:
        data = pd.read_csv(os.path.join(folder_path, file))
        if data["List Name"].iloc[0] != model:
            continue
        runs.append((
            data["LEARNING_RATE"].iloc[0],
            data["Test_Accuracy"].iloc[0],
            data["SEED"].iloc[0],
        ))
        model_dataset_name = data["Dataset"].iloc[0]

    # One (lr, mean acc, CI half width, CI) tuple per learning rate that has runs
    results = []
    for lr in learning_rates:
        test_accs = [acc for run_lr, acc, _ in runs if run_lr == lr]
        seeds = [seed for run_lr, _, seed in runs if run_lr == lr]
        if len(test_accs) == 0:
            continue

        assert len(set(seeds)) == 5, "Number of unique seeds is not equal to 5 -> ERROR."
        assert len(test_accs) == 5, "Number of test losses is not equal to 5 -> ERROR."
        mean_test_acc = np.mean(test_accs, axis=0)
        se_test_acc = stats.sem(test_accs)

        # 95% confidence interval for the mean test accuracy. With identical accuracies
        # the standard error is 0, for which stats.t.interval yields NaN; the interval
        # then collapses to the mean itself.
        if se_test_acc > 0:
            confidence_interval_acc = stats.t.interval(0.95, len(test_accs) - 1, loc=mean_test_acc, scale=se_test_acc)
        else:
            confidence_interval_acc = (mean_test_acc, mean_test_acc)
        conf_interval_half_acc = abs(confidence_interval_acc[1] - confidence_interval_acc[0]) / 2

        # Tolerant comparison: two summation orders may differ in the last bits
        assert np.isclose(mean_test_acc, sum(test_accs) / len(test_accs)), "Mean test acc is not equal to sum of test accs divided by number of test losses -> ERROR."
        print(f"Learning Rate: {lr} || Mean Test Acc: {mean_test_acc}")
        results.append((lr, mean_test_acc, conf_interval_half_acc, confidence_interval_acc))

    if not results:
        raise ValueError(
            f"No results for model {model!r} with learning rates {learning_rates} found in {folder_path!r}."
        )

    # Sort ascending by mean test accuracy; the best combination is the last entry
    sorted_results = sorted(results, key=lambda x: x[1])
    print(f"len(sorted_results) = {len(sorted_results)}")
    print(f"len(csv_files) = {len(csv_files)}")
    print("Sorted List:")
    print(sorted_results)
    print(f"Best Combination for {model}[{model_dataset_name}]:")
    best_learning_rate, best_mean_test_acc, best_CI_half = sorted_results[-1][:3]

    if round_numbers:
        # round to 4 decimal places
        best_mean_test_acc = round(best_mean_test_acc, 4)
        best_CI_half = round(best_CI_half, 4)
    print(f"Learning Rate: {best_learning_rate}, Mean Test Acc: {best_mean_test_acc}, 95% Confidence Interval Half: {best_CI_half}")
    print(100*"*")

    return best_learning_rate, best_mean_test_acc


In [None]:
# Result folders of the four datasets
model_optimization_root = os.path.join(os.getcwd(), "..", "Model_Optimization")
folder_path_BanknoteAuthentication = os.path.join(model_optimization_root, "Banknote")
folder_path_BreastCancer = os.path.join(model_optimization_root, "Breast_Cancer")
folder_path_MNIST = os.path.join(model_optimization_root, "MNIST")
folder_path_Audio = os.path.join(model_optimization_root, "Audio_MNIST")

# Variable-name suffix -> result folder, in evaluation order
dataset_folders = {
    "BanknoteAuthentication": folder_path_BanknoteAuthentication,  # Banknote Authentication
    "BreastCancer": folder_path_BreastCancer,                      # Breast Cancer
    "MNIST": folder_path_MNIST,                                    # MNIST
    "AudioMNIST": folder_path_Audio,                               # Audio MNIST
}

# Variable-name tag -> model name passed to best_combination, in evaluation order
model_list_names = {
    "VQC_angle": "VQC_angle_list",
    "VQC_amplitude": "VQC_amplitude_list",
    "NN_compressed": "NN_with_compressed_input_list",
    "NN_original": "NN_with_original_input_list",
    "Sequent_classical": "sequent_classical_list",
    "Sequent_quantum": "sequent_quantum_list",
    "Dressed_classical": "dressed_classical_list",
    "Dressed_quantum": "dressed_quantum_list",
}

for dataset_suffix, dataset_folder in dataset_folders.items():
    for model_tag, list_name in model_list_names.items():
        best_lr, best_mean_test_acc = best_combination(dataset_folder, list_name, True)
        # Bind the results to the notebook-level names
        #   best_<model_tag>_lr_<dataset_suffix>
        #   best_<model_tag>_mean_test_acc_<dataset_suffix>
        # (e.g. best_VQC_angle_lr_MNIST), which the plotting cell reads.
        globals()[f"best_{model_tag}_lr_{dataset_suffix}"] = best_lr
        globals()[f"best_{model_tag}_mean_test_acc_{dataset_suffix}"] = best_mean_test_acc

In [None]:
def best_combination_and_store(folder_path, models, round_numbers=True, layers=6, output_dir=None):
    '''
    Returns the best combination of hyperparameters for multiple models. Stores the results in a CSV file.

    For every model and candidate learning rate the test accuracies of all seeds are averaged
    and a 95% confidence interval (Student t) is computed; per model the learning rate with
    the highest mean test accuracy is kept.

        Parameters:
                folder_path (string): Path to the folder containing the CSV files
                models (list): List of model names to find the best combination for
                round_numbers (bool): Whether to round the results to 4 decimal places (default=True)
                layers (int): Number of layers in the model, only copied into the result rows (default=6)
                output_dir (string): Folder the result CSV is written to
                                     (default=None, meaning <cwd>/../Model_Optimization)
        Returns:
                best_combinations (list): List of dictionaries containing the best combination for each model
    '''
    # Candidate learning rates
    learning_rates = [0.1, 0.01, 0.001]

    # Materialize once so that any iterable of model names can be traversed repeatedly
    models = list(models)

    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Read every CSV exactly once (instead of once per model and learning rate) and
    # group the metadata of the relevant runs by model
    runs_by_model = {model: [] for model in models}
    for file in csv_files:
        data = pd.read_csv(os.path.join(folder_path, file))
        model_list_name = data["List Name"].iloc[0]
        if model_list_name not in runs_by_model:
            continue
        runs_by_model[model_list_name].append({
            "learning_rate": data["LEARNING_RATE"].iloc[0],
            "test_accuracy": data["Test_Accuracy"].iloc[0],
            "seed": data["SEED"].iloc[0],
            "epochs": data["EPOCHS"].iloc[0],
            "batch_size": data["BATCH_SIZE"].iloc[0],
            "dataset": data["Dataset"].iloc[0],
        })

    best_combinations = []
    # Dataset name used for the output file; taken from the last model that produced a
    # result, so a model without any files cannot blank it out
    output_dataset_name = None

    for model in models:
        model_runs = runs_by_model[model]

        # One (lr, mean acc, CI half width, CI) tuple per learning rate that has runs
        model_combinations = []
        for lr in learning_rates:
            test_accs = [run["test_accuracy"] for run in model_runs if run["learning_rate"] == lr]
            seeds = [run["seed"] for run in model_runs if run["learning_rate"] == lr]
            if len(test_accs) == 0:
                continue

            assert len(set(seeds)) == 5, "Number of unique seeds is not equal to 5 -> ERROR."
            assert len(test_accs) == 5, "Number of test losses is not equal to 5 -> ERROR."
            mean_test_acc = np.mean(test_accs, axis=0)
            se_test_acc = stats.sem(test_accs)

            # 95% confidence interval for the mean test accuracy. With identical accuracies
            # the standard error is 0, for which stats.t.interval yields NaN; the interval
            # then collapses to the mean itself.
            if se_test_acc > 0:
                confidence_interval_acc = stats.t.interval(0.95, len(test_accs) - 1, loc=mean_test_acc, scale=se_test_acc)
            else:
                confidence_interval_acc = (mean_test_acc, mean_test_acc)
            conf_interval_half_acc = abs(confidence_interval_acc[1] - confidence_interval_acc[0]) / 2

            # Tolerant comparison: two summation orders may differ in the last bits
            assert np.isclose(mean_test_acc, sum(test_accs) / len(test_accs)), "Mean test acc is not equal to sum of test accs divided by number of test losses -> ERROR."
            print(f"Model: {model} || Learning Rate: {lr} || Mean Test Acc: {mean_test_acc}")
            model_combinations.append((lr, mean_test_acc, conf_interval_half_acc, confidence_interval_acc))

        if not model_combinations:
            # No run of this model uses one of the candidate learning rates
            continue

        # Sort ascending by mean test accuracy; the best combination is the last entry
        best_lr, best_mean_test_acc, best_CI_half, best_CI = sorted(model_combinations, key=lambda x: x[1])[-1]

        if round_numbers:
            # Round to 4 decimal places
            best_mean_test_acc = round(best_mean_test_acc, 4)
            best_CI_half = round(best_CI_half, 4)
            best_CI = [round(conf, 4) for conf in best_CI]

        # Dataset / epochs / batch size come from the last file read for this model
        last_run = model_runs[-1]
        output_dataset_name = last_run["dataset"]

        best_combinations.append({
            "Dataset": last_run["dataset"],
            "Model": model,
            "Epochs": last_run["epochs"],
            "Batch size": last_run["batch_size"],
            "Layers": layers,
            "Learning rate": best_lr,
            "Test acc": best_mean_test_acc,
            "Confidence Interval Half": best_CI_half,
            "Confidence Interval": best_CI,
        })

        print(f"Best Combination for {model}:")
        print(f"Learning Rate: {best_lr}, Mean Test Acc: {best_mean_test_acc}, 95% Confidence Interval Half: {best_CI_half}, 95% Confidence Interval: {best_CI}")
        print(100 * "*")

    if not best_combinations:
        # Nothing to store; avoid writing an empty, unnamed CSV file
        print(f"No results found in {folder_path} for models {models}; no CSV file written.")
        return best_combinations

    # Store the best combinations in a CSV file
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), "..", "Model_Optimization")
    output_csv = os.path.join(output_dir, f"{output_dataset_name}_best_combinations.csv")
    results_df = pd.DataFrame(best_combinations)
    results_df.to_csv(output_csv, index=False)

    return best_combinations

In [None]:
# Result folders of the four datasets
model_optimization_root = os.path.join(os.getcwd(), "..", "Model_Optimization")
folder_path_BanknoteAuthentication = os.path.join(model_optimization_root, "Banknote")
folder_path_BreastCancer = os.path.join(model_optimization_root, "Breast_Cancer")
folder_path_MNIST = os.path.join(model_optimization_root, "MNIST")
folder_path_Audio = os.path.join(model_optimization_root, "Audio_MNIST")

# Model names whose best learning rate is determined and stored
models_to_check = [
    "VQC_angle_list", "VQC_amplitude_list",
    "dressed_quantum_list", "sequent_quantum_list",
    "NN_with_compressed_input_list", "NN_with_original_input_list",
]

# Determine and store the best combinations, one dataset after the other
best_combinations_banknote = best_combination_and_store(
    folder_path_BanknoteAuthentication, models_to_check, round_numbers=True
)
best_combinations_breastcancer = best_combination_and_store(
    folder_path_BreastCancer, models_to_check, round_numbers=True
)
best_combinations_mnist = best_combination_and_store(
    folder_path_MNIST, models_to_check, round_numbers=True
)
best_combinations_audiomnist = best_combination_and_store(
    folder_path_Audio, models_to_check, round_numbers=True
)

In [None]:
# Visualize the validation accuracy of every model at its best learning rate.
# All four datasets use the learning rates computed by best_combination; the Audio MNIST
# section previously used hard-coded values that could drift from the computed ones.
best_lr_plots = [
    ("Banknote Authentication", folder_path_BanknoteAuthentication, [
        (best_VQC_angle_lr_BanknoteAuthentication, "VQC_angle_list"),
        (best_VQC_amplitude_lr_BanknoteAuthentication, "VQC_amplitude_list"),
        (best_NN_compressed_lr_BanknoteAuthentication, "NN_with_compressed_input_list"),
        (best_NN_original_lr_BanknoteAuthentication, "NN_with_original_input_list"),
        (best_Sequent_classical_lr_BanknoteAuthentication, "sequent_classical_list"),
        (best_Sequent_quantum_lr_BanknoteAuthentication, "sequent_quantum_list"),
        (best_Dressed_classical_lr_BanknoteAuthentication, "dressed_classical_list"),
        (best_Dressed_quantum_lr_BanknoteAuthentication, "dressed_quantum_list"),
    ]),
    ("Breast Cancer Detection", folder_path_BreastCancer, [
        (best_VQC_angle_lr_BreastCancer, "VQC_angle_list"),
        (best_VQC_amplitude_lr_BreastCancer, "VQC_amplitude_list"),
        (best_NN_compressed_lr_BreastCancer, "NN_with_compressed_input_list"),
        (best_NN_original_lr_BreastCancer, "NN_with_original_input_list"),
        (best_Sequent_classical_lr_BreastCancer, "sequent_classical_list"),
        (best_Sequent_quantum_lr_BreastCancer, "sequent_quantum_list"),
        (best_Dressed_classical_lr_BreastCancer, "dressed_classical_list"),
        (best_Dressed_quantum_lr_BreastCancer, "dressed_quantum_list"),
    ]),
    ("MNIST", folder_path_MNIST, [
        (best_VQC_angle_lr_MNIST, "VQC_angle_list"),
        (best_VQC_amplitude_lr_MNIST, "VQC_amplitude_list"),
        (best_NN_compressed_lr_MNIST, "NN_with_compressed_input_list"),
        (best_NN_original_lr_MNIST, "NN_with_original_input_list"),
        (best_Sequent_classical_lr_MNIST, "sequent_classical_list"),
        (best_Sequent_quantum_lr_MNIST, "sequent_quantum_list"),
        (best_Dressed_classical_lr_MNIST, "dressed_classical_list"),
        (best_Dressed_quantum_lr_MNIST, "dressed_quantum_list"),
    ]),
    ("Audio MNIST", folder_path_Audio, [
        (best_VQC_angle_lr_AudioMNIST, "VQC_angle_list"),
        (best_VQC_amplitude_lr_AudioMNIST, "VQC_amplitude_list"),
        (best_NN_compressed_lr_AudioMNIST, "NN_with_compressed_input_list"),
        (best_NN_original_lr_AudioMNIST, "NN_with_original_input_list"),
        (best_Sequent_classical_lr_AudioMNIST, "sequent_classical_list"),
        (best_Sequent_quantum_lr_AudioMNIST, "sequent_quantum_list"),
        (best_Dressed_classical_lr_AudioMNIST, "dressed_classical_list"),
        (best_Dressed_quantum_lr_AudioMNIST, "dressed_quantum_list"),
    ]),
]

for dataset_title, dataset_folder, lr_and_model_pairs in best_lr_plots:
    print(f"Plot for best combination for {dataset_title}")
    for best_lr, list_name in lr_and_model_pairs:
        plot_specific_combination(dataset_folder, best_lr, list_name)