In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import ast
import torch
import numpy as np

In [2]:
import sys
sys.path.append('../')
from utils.analyzation_tools import corrected_repeated_kFold_cv_test as cv_test
from finetuning.model.region_loss import Regional_Loss


2024-05-30 12:15:03.058244: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Helpers


In [3]:

def compare_loss(list_of_df, name, save_path):
    """
    Compare the metrics of several experiments pairwise and save one heatmap per metric.

    Every DataFrame in ``list_of_df`` is treated as one loss configuration and is
    labelled ``L1``, ``L2``, ... in input order. For each metric column, every pair
    of configurations is compared with ``cv_test`` (the corrected repeated k-fold
    cross-validation test imported at the top of the notebook). The outcome of each
    comparison is printed and stored in a matrix whose entry (row, column) is

    * ``1``  if the row configuration has the significantly higher mean,
    * ``-1`` if the column configuration has the significantly higher mean,
    * ``0``  if the difference is not significant (always the case on the diagonal).

    Note that "better" in the printed messages simply means "higher mean"; for
    metrics where lower values are preferable the interpretation must be inverted.

    :param list_of_df: List of dataframes, one per loss configuration. Columns whose
        name contains ``text`` are dropped and the remaining column names are reduced
        to their last two whitespace-separated words. The inputs are not modified.
    :param name: Name of the experiment (used as prefix of the file names)
    :param save_path: Directory in which the plots are saved (created if missing)
    :return: None
    """
    alpha = 0.05  # significance level used for every pairwise test
    experiment_col = "Experiment"

    # Derive the labels from the input instead of hard-coding four of them, so
    # that no experiment is silently ignored or compared with empty data.
    loss_config = [f"L{i + 1}" for i in range(len(list_of_df))]
    if not loss_config:
        # Nothing to compare; pd.concat would raise on an empty list.
        return

    prepared = []
    for label, df in zip(loss_config, list_of_df):
        text_cols = df.filter(like='text', axis=1).columns
        # drop() returns a new frame, so the caller's DataFrame stays untouched.
        df = df.drop(columns=text_cols)
        df.columns = df.columns.str.split().str[-2:].str.join(" ")
        prepared.append(df.assign(**{experiment_col: label}))

    condf = pd.concat(prepared)
    metrics = [col for col in condf.columns if col != experiment_col]
    meltdf = condf.melt(id_vars=[experiment_col], var_name="Metric", value_name="Value")
    # Some cells hold single-element lists; unwrap them before casting to float.
    meltdf["Value"] = (
        meltdf["Value"]
        .map(lambda x: float(x[0]) if isinstance(x, (list, tuple)) else x)
        .astype(float)
    )

    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for metric in metrics:
        print(f"Metric: {metric}")
        # Select the values of every experiment once per metric instead of
        # re-filtering the long frame inside the nested loop.
        metric_rows = meltdf[meltdf["Metric"] == metric]
        values = {
            exp: metric_rows.loc[metric_rows[experiment_col] == exp, "Value"]
            for exp in loss_config
        }

        matrix = []
        for exp1 in loss_config:
            row = []
            for exp2 in loss_config:
                if exp1 == exp2:
                    # Testing an experiment against itself carries no information
                    # (all differences are zero), so the test is skipped.
                    print(f"No significant difference between {exp1} and {exp2}")
                    row.append(0)
                    continue

                values1, values2 = values[exp1], values[exp2]
                # 9 and 1 reflect the 9:1 train/test ratio of 10-fold cross-validation.
                # NOTE(review): presumably these are the train/test sample counts
                # expected by cv_test - verify against its signature.
                _, _, _, p_value = cv_test(values1.to_list(), values2.to_list(), 9, 1, alpha)
                if p_value < alpha:
                    if values1.mean() > values2.mean():
                        print(f"{exp1} is significantly better than {exp2}")
                        row.append(1)
                    else:
                        print(f"{exp2} is significantly better than {exp1}")
                        row.append(-1)
                else:
                    print(f"No significant difference between {exp1} and {exp2}")
                    row.append(0)
            matrix.append(row)

        matrix = pd.DataFrame(matrix, index=loss_config, columns=loss_config)
        fig, ax = plt.subplots(figsize=(10, 7))
        # Fix the colour range so that -1 / 0 / 1 look the same in every plot.
        sns.heatmap(matrix, annot=True, cmap='coolwarm', cbar=False, vmin=-1, vmax=1, ax=ax)
        ax.set_title(f"{metric} comparison")
        fig.savefig(os.path.join(save_path, f"{name}_{metric}_comparison.png"))
        plt.close(fig)
      

In [65]:

def calculate_metrics(df, data_type,REPO_PATH):
    """
    Calculate country- and region-level metrics for one file of predictions.

    Args:
        df (pd.DataFrame): Predictions with the columns ``Output``, ``Label`` and
            ``Prediction``. Every ``Output`` cell holds one list-like of numbers;
            all rows are stacked into a single tensor. The frame is not modified.
        data_type (str): The type of data. Only ``'test'`` is implemented.
        REPO_PATH (str): Root of the repository; the country list is read from
            ``<REPO_PATH>/utils/country_list/country_list_region_and_continent.csv``.

    Returns:
        dict: Maps every metric name to a single-element list, so that
        ``pd.DataFrame(result)`` yields a one-row DataFrame.

    Raises:
        NotImplementedError: If ``data_type`` is anything other than ``'test'``.
    """
    # Fail early and explicitly. Previously unsupported types fell through to an
    # UnboundLocalError when the result dictionary was assembled.
    if data_type != 'test':
        raise NotImplementedError(
            f"Metrics for data_type={data_type!r} are not implemented; only 'test' is supported."
        )

    country_list_path = os.path.join(
        REPO_PATH, 'utils', 'country_list', 'country_list_region_and_continent.csv'
    )
    country_list = pd.read_csv(country_list_path)
    metrics_calculator = Regional_Loss(country_list=country_list)

    # Stack the per-row outputs into one tensor without overwriting the caller's
    # 'Output' column (as_tensor also accepts cells that already are tensors).
    outputs = torch.stack([torch.as_tensor(row) for row in df['Output']])
    labels = df['Label']

    c_ac = metrics_calculator.calculate_country_accuracy(outputs, labels)
    c_prec, c_rec, c_f1, _, _ = metrics_calculator.calculate_metrics_per_class(outputs, labels)
    # NOTE(review): 'claculate' is spelled as in the original call; presumably it
    # matches the method name defined on Regional_Loss - verify before renaming.
    r_ac = metrics_calculator.claculate_region_accuracy(outputs, labels)
    r_prec, r_rec, r_f1, _, _ = metrics_calculator.calculate_metrics_per_region(outputs, labels)
    # Difference between the number of distinct labels and distinct predictions.
    ignored_class = len(labels.unique()) - len(df['Prediction'].unique())

    metrics = {
        'country_accuracy': [c_ac],
        'country_precision': [c_prec.mean()],
        'country_recall': [c_rec.mean()],
        'country_f1': [c_f1.mean()],
        'region_accuracy': [r_ac],
        'region_precision': [r_prec.mean()],
        'region_recall': [r_rec.mean()],
        'region_f1': [r_f1.mean()],
        'ignored_classes': [ignored_class]
    }

    return metrics


In [68]:
def read_csv_from_dir(log_dir, REPO_PATH):
    """
    Compute the metrics of every CSV file found in the sub-folders of ``log_dir``.

    Each sub-folder of ``log_dir`` is processed in sorted order. Every ``*.csv``
    file inside it is read (the ``Output`` column is parsed with
    ``ast.literal_eval``), assigned to a split by the first of the keywords
    ``validation``, ``test`` and ``zero`` that occurs in its file name, and passed
    to ``calculate_metrics``. Files whose name contains none of the keywords are
    skipped.

    NOTE(review): ``calculate_metrics`` as visible in this notebook only handles
    the ``'test'`` split; validation and zero-shot files will make it raise.

    Args:
        log_dir (str): Directory that contains one sub-folder per configuration.
        REPO_PATH (str): Repository root, forwarded to ``calculate_metrics``.

    Returns:
        tuple[list, list, list]: ``(validation_dfs, test_dfs, zero_shot_dfs)``.
        Each list holds one DataFrame per sub-folder with one row per CSV file of
        that split; the DataFrame is empty if the sub-folder has no such file.
    """
    # Order matters: it is the precedence used when a name matches several keywords.
    split_keywords = ('validation', 'test', 'zero')
    per_split = {keyword: [] for keyword in split_keywords}

    for folder in sorted(os.listdir(log_dir)):
        folder_path = os.path.join(log_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        buffers = {keyword: [] for keyword in split_keywords}
        # Sort the files so that the row order is reproducible and identical
        # across folders that share the same file names.
        csv_pattern = os.path.join(glob.escape(folder_path), "*.csv")
        for file_path in sorted(glob.glob(csv_pattern)):
            # Match on the file name only; keywords occurring in a directory
            # name must not influence the split.
            file_name = os.path.basename(file_path)
            split = next((kw for kw in split_keywords if kw in file_name), None)
            if split is None:
                continue

            predictions = pd.read_csv(file_path, converters={"Output": ast.literal_eval})
            metrics = calculate_metrics(predictions, split, REPO_PATH=REPO_PATH)
            # calculate_metrics returns a dict of single-element lists; pd.concat
            # only accepts Series/DataFrames, so convert it to a one-row frame.
            buffers[split].append(pd.DataFrame(metrics))

        for keyword in split_keywords:
            frames = buffers[keyword]
            # pd.concat raises on an empty list, so fall back to an empty frame.
            per_split[keyword].append(
                pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
            )

    return per_split['validation'], per_split['test'], per_split['zero']


## Run eval

In [69]:
# Root of the repository checkout; forwarded to read_csv_from_dir.
REPO_PATH = '/home/leon/Documents/GPML/good_practices_ml/'

# Directory that holds one sub-folder per experiment.
experimient_dir= '/media/leon/Samsung_T5/Uni/good_practices_ml/runs/merged_seeds/'

# Nested result containers, filled by the loop below:
#   axis 0 -> dataset configuration (one entry per sub-folder of experimient_dir)
#   axis 1 -> loss configuration
#   axis 2 -> DataFrame holding the different seeds
validation_sets, test_sets, zeros_shot_datasets = [], [], []

for folder in sorted(os.listdir(experimient_dir)):
    log_dir = os.path.join(experimient_dir, folder)
    if not os.path.isdir(log_dir):
        continue
    save_path = log_dir + '/results/'
    # Compute the metrics of every loss configuration stored in this folder.
    val, test, zero = read_csv_from_dir(log_dir, REPO_PATH)
    validation_sets.append(val)
    test_sets.append(test)
    zeros_shot_datasets.append(zero)



In [None]:
# Human-readable names of the dataset configurations.
# NOTE(review): assumed to be listed in the same order as the sorted experiment
# folders that were used to fill validation_sets - confirm.
# NOTE(review): calculate_metrics in this notebook only handles the 'test' split;
# confirm that validation_sets (rather than test_sets) is the intended input.
dataset_config = ['Strongly Balanced', 'Unbalanced', 'Weakly Balanced', 'Mixed Strongly Balanced', 'Mixed Unbalanced', 'Mixed Weakly Balanced']
save_path = '/media/leon/Samsung_T5/Uni/good_practices_ml/runs/merged_seeds/figures/'
# Unpack enumerate's (index, value) pairs: compare_loss expects the list of
# DataFrames itself, not the whole tuple.
for idx, experiment in enumerate(validation_sets):
    name = dataset_config[idx]
    compare_loss(experiment, name, save_path)