In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import ast
import torch
import numpy as np
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.metrics import confusion_matrix


In [None]:
import sys
sys.path.append('../')
from utils.analyzation_tools import corrected_repeated_kFold_cv_test as cv_test
import utils.confusion_matrix as cm
from finetuning.model.region_loss import Regional_Loss


## Helpers


In [None]:

def compare_loss(list_of_df, name, save_path):
    """
    Compare the metrics of several loss configurations with pairwise t-tests.

    The dataframes are tagged ``L1``, ``L2``, ... in list order. For every metric
    column each pair of configurations is compared with an independent two-sample
    t-test (``scipy.stats.ttest_ind``). The verdicts are printed and drawn as an
    annotated heatmap that is written to ``save_path + f"{name}_{metric}_comparison.png"``.

    :param list_of_df: List of dataframes, one per loss configuration. Columns whose
        name contains ``text`` and the ``prediction``/``label`` columns are dropped;
        every remaining column is treated as a metric. An empty list is a no-op.
    :param name: Name of the experiment; used as prefix of the image file names.
    :param save_path: Path prefix for the images (string concatenation, so a
        directory must end with a path separator).
    :return: None
    """
    if not list_of_df:
        # Nothing to compare; pd.concat would raise on an empty list.
        return

    tagged_frames = []
    for idx, frame in enumerate(list_of_df):
        # assign/drop return new frames, so the caller's dataframes stay untouched.
        frame = frame.assign(Experiment=f"L{idx+1}")
        cols_to_drop = frame.filter(like='text', axis=1).columns.tolist()
        cols_to_drop.extend(['prediction', 'label'])
        frame = frame.drop(columns=cols_to_drop)
        # Keep only the last two whitespace-separated words of every column name.
        frame.columns = frame.columns.str.split().str[-2:].str.join(" ")
        tagged_frames.append(frame)

    condf = pd.concat(tagged_frames)
    # Select metrics by name instead of by position so the result does not depend
    # on where the Experiment column ends up after concatenation.
    metrics = [col for col in condf.columns if col != "Experiment"]
    meltdf = condf.melt(id_vars=["Experiment"], var_name="Metric", value_name="Value")
    # Metric cells may hold 0-d tensors or one-element lists; normalise them to float.
    meltdf["Value"] = meltdf["Value"].apply(lambda x: x.item() if isinstance(x, torch.Tensor) else x)
    meltdf["Value"] = meltdf["Value"].apply(lambda x: float(x[0]) if isinstance(x, list) else x)
    meltdf["Value"] = meltdf["Value"].astype(float)
    # One label per supplied dataframe (previously fixed to exactly four).
    loss_config = [f"L{idx+1}" for idx in range(len(list_of_df))]

    for metric in metrics:
        print(f"Metric: {metric}")
        metric_rows = meltdf[meltdf['Metric'] == metric]
        # Slice once per experiment instead of once per pair.
        values_by_experiment = {
            exp: metric_rows[metric_rows['Experiment'] == exp]['Value'] for exp in loss_config
        }

        matrix = []
        values_matrix = []
        for exp1 in loss_config:
            signi_buffer = []
            value_buffer = []
            values1 = values_by_experiment[exp1]
            for exp2 in loss_config:
                values2 = values_by_experiment[exp2]
                # Plain independent t-test at a significance level of 0.05.
                # NOTE(review): "better" means "higher mean"; for count metrics such
                # as ignored_classes / ignored_regions a lower value is presumably
                # the desirable one - confirm before quoting these verdicts.
                ttest_result = ttest_ind(values1.to_list(), values2.to_list())
                t_stat = ttest_result.statistic
                p_value = ttest_result.pvalue
                degrees_of_freedom = len(values1) + len(values2) - 2
                if p_value < 0.05:
                    if values1.mean() > values2.mean():
                        print(f"{exp1} is significantly better than {exp2}")
                        signi_buffer.append(1)
                    else:
                        print(f"{exp2} is significantly better than {exp1}")
                        signi_buffer.append(-1)
                else:
                    print(f"No significant difference between {exp1} and {exp2}")
                    signi_buffer.append(0)
                value_str = (f"M1={values1.mean():.3f}, S1={values1.std():.3f}\n"
                             f"M2={values2.mean():.3f}, S2={values2.std():.3f}\n"
                             f"t({degrees_of_freedom}) = {t_stat:.3f}, p={p_value:.3f}")
                value_buffer.append(value_str)

            matrix.append(signi_buffer)
            values_matrix.append(value_buffer)

        # Cell colour encodes the verdict (-1/0/1), the annotation carries the statistics.
        matrix = pd.DataFrame(matrix, index=loss_config, columns=loss_config)
        values_matrix = pd.DataFrame(values_matrix, index=loss_config, columns=loss_config)

        plt.figure(figsize=(10, 7))
        sns.heatmap(matrix, annot=values_matrix, fmt='', cmap='coolwarm', cbar=False)
        plt.title(f"{metric} comparison")
        plt.savefig(save_path + f"{name}_{metric}_comparison.png")
        plt.close()

      

In [None]:

def _metric_as_float(value):
    """Turn a stored metric cell (number, one-element list or 0-d tensor/array) into a float."""
    if isinstance(value, list):
        value = value[0]
    return float(value)


def compare_dataframe(df_data1, df_data2, dataset_names):
    """
    Compare every metric of two result dataframes with an independent t-test.

    Columns whose name contains ``text`` and the ``prediction``/``label`` columns
    are dropped from both frames; each remaining column is tested with
    ``scipy.stats.ttest_ind`` and the verdict plus summary statistics are printed.

    :param df_data1: Dataframe with the metrics of the first dataset.
    :param df_data2: Dataframe with the metrics of the second dataset; must contain
        the same columns as ``df_data1``.
    :param dataset_names: Two display names, for ``df_data1`` and ``df_data2``.
    :return: None
    """
    cols_to_drop = df_data1.filter(like='text', axis=1).columns.tolist()
    cols_to_drop.extend(['prediction', 'label'])
    df_data1 = df_data1.drop(columns=cols_to_drop)
    df_data2 = df_data2.drop(columns=cols_to_drop)
    # Keep only the last two whitespace-separated words of every column name.
    df_data1.columns = df_data1.columns.str.split().str[-2:].str.join(" ")
    df_data2.columns = df_data2.columns.str.split().str[-2:].str.join(" ")
    # Every remaining column is a metric. There is no trailing helper column here,
    # so slicing off the last column would silently skip a metric.
    metrics = df_data1.columns

    for metric in metrics:
        print(f"Metric: {metric}")
        # Cells may hold tensors or one-element lists; compare plain floats.
        data1 = df_data1[metric].map(_metric_as_float)
        data2 = df_data2[metric].map(_metric_as_float)
        t_stat, p_value = ttest_ind(data1.to_list(), data2.to_list())
        degrees_of_freedom = len(data1) + len(data2) - 2
        if p_value < 0.05:
            if data1.mean() > data2.mean():
                print(f"{dataset_names[0]} is significantly better than {dataset_names[1]}")
            else:
                print(f"{dataset_names[1]} is significantly better than {dataset_names[0]}")
        else:
            print(f"No significant difference between {dataset_names[0]} and {dataset_names[1]}")
        print(f"data1 = {dataset_names[0]}, data2={dataset_names[1]}")
        print(f"M1={data1.mean():.3f}, S1={data1.std():.3f}\n M2={data2.mean():.3f}, S2={data2.std():.3f}\n t({degrees_of_freedom}) = {t_stat:.3f}, p={p_value:.3f}")

In [None]:

def _metrics_for_frame(metrics_calculator, frame):
    """Compute a one-row DataFrame of country/region/mixed metrics for the rows of ``frame``."""
    labels = frame['Label']
    # Stack the per-row tensors into a single tensor.
    outputs = torch.stack(frame['Output'].tolist())
    c_ac = metrics_calculator.calculate_country_accuracy(outputs, labels)
    c_prec, c_rec, c_f1, _, _ = metrics_calculator.calculate_metrics_per_class(outputs, labels)
    r_ac = metrics_calculator.claculate_region_accuracy(outputs, labels)
    r_prec, r_rec, r_f1, _, _ = metrics_calculator.calculate_metrics_per_region(outputs, labels)
    # The two original code paths unpacked this call with different arities;
    # taking the first three values works for either return shape.
    m_prec, m_rec, m_f1 = metrics_calculator.calculate_mixed_metrics(outputs, labels)[:3]
    # Difference between the number of distinct labels and distinct predictions.
    ignored_class = len(frame['Label'].unique()) - len(frame['Prediction'].unique())
    # Regions whose precision and recall are both zero.
    ignored_region = sum(1 for x, y in zip(r_prec, r_rec) if x == 0 and y == 0)

    return pd.DataFrame({
        'country_accuracy': [c_ac],
        'country_precision': [c_prec.mean()],
        'country_recall': [c_rec.mean()],
        'country_f1': [c_f1.mean()],
        'region_accuracy': [r_ac],
        'region_precision': [r_prec.mean()],
        'region_recall': [r_rec.mean()],
        'region_f1': [r_f1.mean()],
        'mixed_precision': [m_prec.mean()],
        'mixed_recall': [m_rec.mean()],
        'mixed_f1': [m_f1.mean()],
        'ignored_classes': [ignored_class],
        'ignored_regions': [ignored_region],
        # The full prediction/label Series are kept in a single cell each.
        'prediction': [frame['Prediction']],
        'label': [frame['Label']]
    })


def calculate_metrics(df, data_type, REPO_PATH):
    """
    Calculate the metrics for region and country columns.

    Args:
        df (pd.DataFrame): Frame with the columns ``Output`` (one sequence of numbers
            per row), ``Label`` and ``Prediction``. The frame is not modified.
        data_type (str): ``'validation'`` splits the rows into 10 consecutive chunks
            and yields one metric row per chunk; any other value yields a single row
            computed over all rows.
        REPO_PATH (str): Repository root; the country list is read from
            ``{REPO_PATH}/utils/country_list/country_list_region_and_continent.csv``.

    Returns:
        pd.DataFrame: The DataFrame containing the calculated metrics.
    """
    country_list = f'{REPO_PATH}/utils/country_list/country_list_region_and_continent.csv'
    country_list = pd.read_csv(country_list)
    metrics_calculator = Regional_Loss(country_list=country_list)
    # Convert the 'Output' column to tensors on a copy so the caller's frame stays untouched.
    df = df.assign(Output=df['Output'].apply(lambda x: torch.tensor(x)))

    if data_type == 'validation':
        # Split row positions rather than the frame itself; the chunk sizes are the
        # same as np.array_split(df, 10) would produce.
        chunks = np.array_split(np.arange(len(df)), 10)
        per_chunk = [
            _metrics_for_frame(metrics_calculator, df.iloc[positions])
            for positions in chunks
            if len(positions)  # fewer than 10 rows leaves some chunks empty
        ]
        # Single concat instead of growing a frame inside the loop.
        return pd.concat(per_chunk)

    return _metrics_for_frame(metrics_calculator, df)


In [None]:
def read_csv_from_dir(log_dir, REPO_PATH):
    """
    Load the result CSVs below ``log_dir`` and turn them into metric dataframes.

    Every sub-folder of ``log_dir`` (visited in sorted order) is scanned for CSV
    files. Files whose name contains ``zero`` are evaluated as zero-shot results,
    files whose name contains ``test`` as test results; all other files are
    skipped. The metric rows of all files of one kind in a folder are concatenated
    into a single DataFrame.

    Args:
        log_dir (str): Directory containing one sub-folder per configuration.
        REPO_PATH (str): Repository root, forwarded to ``calculate_metrics``.

    Returns:
        tuple: ``(validation_dfs, test_dfs, zero_shot_dfs)``. ``validation_dfs`` is
        always empty because validation files are currently not evaluated. A folder
        contributes an entry to ``test_dfs`` / ``zero_shot_dfs`` only if it contains
        at least one matching CSV.
    """
    validation_dfs = []
    test_dfs = []
    zero_shot_dfs = []

    for folder in sorted(os.listdir(log_dir)):
        folder_path = os.path.join(log_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        test_buffer = []
        zero_shot_buffer = []
        # Sorted so the row order does not depend on the file system.
        for file_path in sorted(glob.glob(os.path.join(folder_path, "*"))):
            # Classify by file name only; matching against the whole path would
            # misfile everything below a directory called e.g. "test_runs".
            file_name = os.path.basename(file_path)
            if '.csv' not in file_name:
                continue
            if 'zero' in file_name:
                data_type, target = 'zero', zero_shot_buffer
            elif 'test' in file_name:
                data_type, target = 'test', test_buffer
            else:
                # Validation and other CSVs are not evaluated, so skip the costly parse.
                continue
            df = pd.read_csv(file_path, converters={"Output": ast.literal_eval})
            target.append(calculate_metrics(df, data_type, REPO_PATH=REPO_PATH))

        # pd.concat raises on an empty list, e.g. for a folder that only holds figures.
        if test_buffer:
            test_dfs.append(pd.concat(test_buffer))
        if zero_shot_buffer:
            zero_shot_dfs.append(pd.concat(zero_shot_buffer))
    return validation_dfs, test_dfs, zero_shot_dfs


In [None]:
def box_plot_experiments(list_of_df, name, save_path, loss_number=0, dataset_names=None, metric_names=None, legend_out_of_plot=False):
    """
    Generate a box plot that compares the selected metrics across dataset configurations.

    Parameters:
    list_of_df (list): One entry per dataset configuration; each entry is itself a
        sequence of dataframes indexed by loss configuration.
    name (str): The name of the experiment; used as plot title and file-name prefix.
    save_path (str): Path prefix for the image (string concatenation, so a
        directory must end with a path separator).
    loss_number (int): Index of the loss configuration that is plotted.
    dataset_names (list): Names of the dataset configurations to include. Each name
        must be a key of the internal name-to-index table. ``None`` plots every
        entry of ``list_of_df`` in order.
    metric_names (list): Columns to plot. Required.
    legend_out_of_plot (bool): Place the legend to the right of the axes.

    Returns:
    pd.DataFrame: A concatenated dataframe containing the plotted columns plus an
        ``Experiment`` column holding the dataset name.

    Raises:
    ValueError: If ``metric_names`` is not given.
    """
    if metric_names is None:
        raise ValueError("metric_names must list the metric columns to plot")

    dataset_to_indices = {'Strongly Balanced':0, 'Unbalanced':1, 'Weakly Balanced':2, 'Mixed Strongly Balanced':3, 'Mixed Weakly Balanced':4}
    known_names = list(dataset_to_indices.keys())
    if dataset_names is not None:
        indices = [dataset_to_indices[dataset_name] for dataset_name in dataset_names]
    else:
        # Without an explicit selection every dataset is plotted in list order.
        indices = list(range(len(list_of_df)))

    # Several source columns share one display name because each call plots a
    # single metric family (country, region or mixed).
    display_names = {'country_accuracy': 'Accuracy', 'country_precision': 'Precision', 'country_recall': 'Recall', 'country_f1': 'F1', 'region_accuracy': 'Accuracy', 'region_precision': 'Precision', 'region_recall': 'Recall', 'region_f1': 'F1', 'mixed_precision': 'Precision', 'mixed_recall': 'Recall', 'mixed_f1': 'F1'}
    cols_to_use = list(metric_names) + ['Experiment']

    frames = []
    for dataset_index in indices:
        if dataset_index < len(known_names):
            label = known_names[dataset_index]
        else:
            label = f"Dataset {dataset_index + 1}"
        frame = list_of_df[dataset_index][loss_number].assign(Experiment=label)
        frame = frame[cols_to_use]
        # Keep only the last two whitespace-separated words of every column name.
        frame.columns = frame.columns.str.split().str[-2:].str.join(" ")
        frames.append(frame.rename(columns=display_names))

    condf = pd.concat(frames)
    meltdf = condf.melt(id_vars=["Experiment"], var_name="Metric", value_name="Value")
    # Metric cells may hold one-element lists or 0-d tensors; plot plain floats.
    meltdf["Value"] = meltdf["Value"].apply(lambda x: float(x[0]) if isinstance(x,list) else x)
    meltdf["Value"] = meltdf["Value"].apply(lambda x: float(x.item()) if isinstance(x,torch.Tensor) else x)
    meltdf["Value"] = meltdf["Value"].astype(float)

    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots()
    sns.boxplot(
        x="Metric", y="Value", hue="Experiment", data=meltdf, showfliers=False, ax=ax
    )
    ax.set_title(name)
    if legend_out_of_plot:
        lgd = ax.legend(loc='upper left', fontsize='small', borderaxespad=0.0, bbox_to_anchor=(1, 1))
    else:
        lgd = ax.legend(loc='upper right', fontsize='small', borderaxespad=0.0)
    ax.tick_params(axis='x', labelsize=10)
    fig.savefig(
        save_path + f"{name}-boxplot.png",
        bbox_extra_artists=(lgd,),
        bbox_inches="tight",
    )
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return condf


## Run eval

In [None]:
# Repository root; calculate_metrics reads the country list relative to it.
REPO_PATH = '/share/temp/bjordan/good_practices_in_machine_learning/good_practices_ml/'

# Directory that holds one sub-folder per dataset configuration.
experimient_dir= '/share/temp/bjordan/good_practices_in_machine_learning/good_practices_ml/finetuning/runs/merged_seeds2/'

# Nested result containers:
#   first axis  -> dataset configuration
#   second axis -> loss configuration
#   third axis  -> DataFrame with the metrics of the different seeds
validation_sets, test_sets, zeros_shot_datasets = [], [], []

for folder in sorted(os.listdir(experimient_dir)):
    log_dir = os.path.join(experimient_dir, folder)
    if not os.path.isdir(log_dir):
        continue
    save_path = log_dir + '/results/'
    # Evaluate every result CSV found below this dataset folder.
    val, test, zero = read_csv_from_dir(log_dir, REPO_PATH)
    # Validation results are currently not collected.
    test_sets.append(test)
    zeros_shot_datasets.append(zero)



In [None]:
# Sanity check: entries per dataset configuration, number of dataset
# configurations, and the folder order the lists were built from.
print(list(map(len, test_sets)))
print(len(test_sets))
sorted(os.listdir(experimient_dir))

In [None]:
# Display names in the same order as the entries of test_sets.
dataset_names = [
    'Strongly Balanced',
    'Unbalanced',
    'Weakly Balanced',
    'Mixed Strongly Balanced',
    'Mixed Weakly Balanced',
]
save_path = '/share/temp/bjordan/good_practices_in_machine_learning/good_practices_ml/finetuning/runs/new_figures/'

# Pairwise loss comparison for every dataset configuration.
for i, experiment in enumerate(test_sets):
    name = dataset_names[i]
    compare_loss(list_of_df=experiment, name=name, save_path=save_path)

In [None]:
# Results of the first loss configuration (index 0) for every dataset configuration.
l1_test_sets = [per_dataset[0] for per_dataset in test_sets]
l1_geo_strongly_balanced = l1_test_sets[0]
l1_geo_weakly_balanced = l1_test_sets[2]
l1_geo_mixed_strongly_balanced = l1_test_sets[3]
l1_geo_mixed_weakly_balanced = l1_test_sets[4]

# (first frame, second frame, display names) for each pairwise comparison.
l1_comparisons = [
    (l1_geo_strongly_balanced, l1_geo_mixed_strongly_balanced,
     ['Strongly Balanced', 'Mixed Strongly Balanced']),
    (l1_geo_weakly_balanced, l1_geo_mixed_weakly_balanced,
     ['Weakly Balanced', 'Mixed Weakly Balanced']),
    (l1_geo_strongly_balanced, l1_geo_weakly_balanced,
     ['Strongly Balanced', 'Weakly Balanced']),
    (l1_geo_mixed_strongly_balanced, l1_geo_mixed_weakly_balanced,
     ['Mixed Strongly Balanced', 'Mixed Weakly Balanced']),
]
for first_frame, second_frame, pair_names in l1_comparisons:
    compare_dataframe(first_frame, second_frame, pair_names)

In [None]:
# Split the results into one view per metric family. filter(like=...) keeps every
# column whose name contains the keyword, so 'region' also keeps 'ignored_regions'.
# Validation results are not loaded, hence no validation views are built.
test_sets_with_region, test_sets_with_country, test_sets_mixed = (
    [[df.filter(like=keyword) for df in per_dataset] for per_dataset in test_sets]
    for keyword in ('region', 'country', 'mixed')
)

zero_shot_sets_with_region, zero_shot_sets_with_country, zero_shot_sets_mixed = (
    [[df.filter(like=keyword) for df in per_dataset] for per_dataset in zeros_shot_datasets]
    for keyword in ('region', 'country', 'mixed')
)

In [None]:
dataset_names = ['Strongly Balanced', 'Weakly Balanced', 'Mixed Strongly Balanced', 'Mixed Weakly Balanced']

region_metrics = ['region_accuracy', 'region_precision', 'region_recall', 'region_f1']
country_metrics = ['country_accuracy', 'country_precision', 'country_recall', 'country_f1']
# calculate_metrics produces no 'mixed_accuracy' column; asking for it makes the
# column selection in box_plot_experiments fail with a KeyError.
mixed_metrics = ['mixed_precision', 'mixed_recall', 'mixed_f1']

# Validation box plots are disabled because validation results are not loaded.

box_plot_experiments(test_sets_with_region, 'Regions on Test-Set', save_path, loss_number=0, dataset_names=dataset_names, metric_names=region_metrics)
box_plot_experiments(test_sets_with_country, 'Countries on Test-Set', save_path, loss_number=0, dataset_names=dataset_names, metric_names=country_metrics)
box_plot_experiments(test_sets_mixed, 'Mixed on Test-Set', save_path, loss_number=0, dataset_names=dataset_names, metric_names=mixed_metrics, legend_out_of_plot=True)

box_plot_experiments(zero_shot_sets_with_region, 'Zero Shot Regions', save_path, loss_number=0, dataset_names=dataset_names, metric_names=region_metrics)
box_plot_experiments(zero_shot_sets_with_country, 'Zero Shot Countries', save_path, loss_number=0, dataset_names=dataset_names, metric_names=country_metrics)
box_plot_experiments(zero_shot_sets_mixed, 'Zero Shot Mixed', save_path, loss_number=0, dataset_names=dataset_names, metric_names=mixed_metrics, legend_out_of_plot=True)


In [None]:
# Number of dataset configurations that were loaded into test_sets.
len(test_sets)

In [None]:
# Folder order, to map the tables printed below back to their dataset configuration.
print(sorted(os.listdir(experimient_dir)))

# Columns that do not belong into the country/region table.
columns_to_hide = ['prediction', 'label', 'ignored_classes', 'ignored_regions', 'mixed_accuracy', 'mixed_precision', 'mixed_recall', 'mixed_f1']

for experiment in test_sets:
    buffer = []
    for i, df in enumerate(experiment):
        # errors='ignore': not every listed column exists (calculate_metrics emits
        # no 'mixed_accuracy'), and a missing label would otherwise raise a KeyError.
        buffer.append(df.assign(Experiment=f'L{i+1}').drop(columns=columns_to_hide, errors='ignore'))
    buffer = pd.concat(buffer)
    # One LaTeX table per dataset: mean per loss configuration, best value in bold.
    print(buffer.groupby('Experiment').mean().round(decimals=3).style.highlight_max(props='textbf:--rwrap;').format(precision=3).to_latex())



In [None]:
# Folder order, to map the tables printed below back to their dataset configuration.
print(sorted(os.listdir(experimient_dir)))

for experiment in test_sets:
    buffer = []
    for i, df in enumerate(experiment):
        # Select the mixed metric columns by name pattern and add the grouping column.
        # Indexing with a bare tuple of names raises a KeyError, and the Experiment
        # column has to be kept for the groupby below.
        buffer.append(df.filter(like='mixed').assign(Experiment=f'L{i+1}'))
    buffer = pd.concat(buffer)
    # One LaTeX table per dataset: mean per loss configuration, best value in bold.
    print(buffer.groupby('Experiment').mean().round(decimals=3).style.highlight_max(props='textbf:--rwrap;').format(precision=3).to_latex())

In [None]:
# LaTeX table of the mean ignored classes/regions per loss configuration, one table per dataset.
for experiment in test_sets:
    buffer = pd.concat([
        df.assign(Experiment=f'L{i+1}')[['ignored_classes', 'ignored_regions', 'Experiment']]
        for i, df in enumerate(experiment)
    ])
    ignored_summary = buffer.groupby('Experiment').mean().round(decimals=3)
    print(ignored_summary.style.highlight_max(props='textbf:--rwrap;').format(precision=3).to_latex())

# Box plots of the same two counts across the selected dataset configurations.
box_plot_experiments(test_sets, 'Ignored Classes on Test Set', save_path, loss_number=0, metric_names=['ignored_classes'], dataset_names=dataset_names, legend_out_of_plot=True)
box_plot_experiments(test_sets, 'Ignored Regions on Test Set', save_path, loss_number=0, metric_names=['ignored_regions'], dataset_names=dataset_names, legend_out_of_plot=True)



In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ast

def _save_heatmap(frame, target_path, label_size):
    """Draw ``frame`` as a 120x90 inch heatmap, write it to ``target_path`` and close the figure."""
    fig, ax = plt.subplots(figsize=(120, 90))
    sns.heatmap(frame, cmap=sns.cubehelix_palette(as_cmap=True), xticklabels=1, yticklabels=1, ax=ax)
    ax.tick_params(axis='both', labelsize=label_size)
    ax.set(xlabel=None, ylabel=None)
    fig.savefig(target_path)
    # Closing (not just clearing) releases the figure; these canvases are huge.
    plt.close(fig)


def create_and_save_confusion_matrices(REPO_PATH, SAVE_FIGURES_PATH, true_countries, predicted_countries, normalize=False):
    """
    Create and save confusion matrices for countries and regions.

    Five heatmaps are written: the full country matrix, the country matrix ordered
    by descending diagonal value, the country matrix in a fixed regional order, the
    region matrix, and the region matrix ordered by descending diagonal value. The
    ordered variants only contain classes that occur in the labels or predictions.

    Args:
        REPO_PATH (str): path to repo folder.
        SAVE_FIGURES_PATH (str): path to save the confusion matrices. With
            ``normalize=True`` the images go into its ``normalized`` sub-folder.
        true_countries (list): list of true country labels (country names as listed
            in the ``Country`` column of the country list).
        predicted_countries (list): list of predicted country labels.
        normalize (bool): whether to normalize the confusion matrices or not.

    Returns:
        None
    """

    # Load country list and regional ordering index
    country_list = pd.read_csv(f'{REPO_PATH}/utils/country_list/country_list_region_and_continent.csv')
    regional_ordering_index = [8, 11, 144, 3, 4, 12, 16, 26, 28, 44, 46, 51, 52, 66, 74, 83, 95, 101, 105, 109, 121, 128, 153, 180, 191, 201, 202, 32, 43, 77, 81, 134, 140, 146, 179, 99, 106, 185, 187, 198, 58, 98, 122, 131, 133, 136, 159, 163, 166, 177, 178, 193, 195, 209, 210, 41, 80, 97, 102, 103, 126, 127, 192, 20, 31, 48, 84, 119, 152, 160, 162, 173, 194, 60, 137, 149, 165, 204, 78, 156, 7, 34, 35, 40, 64, 53, 56, 116, 117, 167, 188, 23, 33, 72, 196, 13, 50, 55, 59, 62, 65, 69,
                                    86, 88, 92, 94, 113, 115, 142, 168, 172, 38, 148, 189, 205, 9, 25, 27, 39, 42, 54, 61, 68, 76, 79, 147, 157, 197, 200, 24, 85, 100, 107, 125, 135, 150, 169, 184, 186, 203, 30, 138, 182, 208, 2, 17, 29, 89, 91, 111, 132, 143, 151, 0, 5, 15, 57, 71, 75, 82, 93, 120, 123, 130, 155, 161, 171, 175, 199, 206, 19, 22, 37, 45, 70, 73, 112, 124, 129, 139, 170, 174, 176, 183, 1, 6, 14, 21, 47, 67, 87, 90, 96, 104, 108, 145, 154, 158, 164, 181, 190, 207, 10, 18, 36, 49, 63, 110, 114, 118, 141]

    # constant for classes
    classes = country_list['Country']
    np_classes = np.array(classes)
    country_dict = {country: index for index, country in enumerate(country_list["Country"])}
    # sklearn's confusion_matrix takes normalize=None for raw counts.
    norm_mode = 'true' if normalize else None

    # Row indices of the countries that occur as label or prediction.
    present_class_indices = {country_dict[country] for country in [*true_countries, *predicted_countries]}
    regional_ordering_index = [x for x in regional_ordering_index if x in present_class_indices]

    # Build country confusion matrices
    cf_matrix = confusion_matrix(true_countries, predicted_countries, labels=classes, normalize=norm_mode)
    ordered_index = [x for x in np.argsort(-cf_matrix.diagonal()) if x in present_class_indices]
    ordered_matrix = cf_matrix[ordered_index][:, ordered_index]
    regionally_ordered_matrix = cf_matrix[regional_ordering_index][:, regional_ordering_index]
    ordered_classes = np_classes[ordered_index]
    regionally_ordered_classes = np_classes[regional_ordering_index]

    df_cm = pd.DataFrame(cf_matrix, index=classes, columns=classes)
    ordered_df_cm = pd.DataFrame(
        ordered_matrix, index=ordered_classes, columns=ordered_classes)
    regionally_ordered_df_cm = pd.DataFrame(
        regionally_ordered_matrix, index=regionally_ordered_classes, columns=regionally_ordered_classes)

    # Create region labels
    region_of_country = country_list["Intermediate Region Name"]
    np_regions = np.sort(np.array(list(set(region_of_country))))
    true_regions = [region_of_country.iloc[country_dict[country]] for country in true_countries]
    predicted_regions = [region_of_country.iloc[country_dict[country]] for country in predicted_countries]
    # Regions are matched by name: the matrix rows follow the sorted region names,
    # so filtering by name cannot drift from the row order the way a separately
    # maintained position index could.
    present_regions = set(true_regions) | set(predicted_regions)

    # Build region confusion matrices
    regions_cf_matrix = confusion_matrix(true_regions, predicted_regions, labels=np_regions, normalize=norm_mode)
    regions_ordered_index = [
        x for x in np.argsort(-regions_cf_matrix.diagonal()) if np_regions[x] in present_regions
    ]
    regions_ordered_matrix = regions_cf_matrix[regions_ordered_index][:, regions_ordered_index]
    ordered_regions = np_regions[regions_ordered_index]

    regions_df_cm = pd.DataFrame(regions_cf_matrix, index=np_regions, columns=np_regions)
    regions_ordered_df_cm = pd.DataFrame(regions_ordered_matrix, index=ordered_regions, columns=ordered_regions)

    # Save confusion matrices
    output_dir = f'{SAVE_FIGURES_PATH}/normalized' if normalize else f'{SAVE_FIGURES_PATH}'
    os.makedirs(output_dir, exist_ok=True)

    # NOTE: sns.set changes the global seaborn theme and stays in effect after this call.
    sns.set(font_scale=8)
    heatmaps = [
        (df_cm, 'simple_confusion_matrix.png', 15),
        (ordered_df_cm, 'ordered_confusion_matrix.png', 15),
        (regionally_ordered_df_cm, 'regionally_ordered_confusion_matrix.png', 15),
        (regions_df_cm, 'regions_confusion_matrix.png', 50),
        (regions_ordered_df_cm, 'regions_ordered_confusion_matrix.png', 50),
    ]
    for frame, file_name, label_size in heatmaps:
        _save_heatmap(frame, f'{output_dir}/{file_name}', label_size)
    return

In [None]:
# Peek at the concatenated labels of the first dataset configuration only.
for i, dataset in enumerate(l1_test_sets[:1]):
    matrix_save_path = f"{save_path}{dataset_names[i]}/"
    true_countries = pd.concat(list(dataset['label']))
    print(true_countries)

In [None]:
# Display names in the same order as the entries of l1_test_sets.
dataset_names = [
    'Strongly Balanced',
    'Unbalanced',
    'Weakly Balanced',
    'Mixed Strongly Balanced',
    'Mixed Weakly Balanced',
]
dataset_to_indices = {dataset_name: position for position, dataset_name in enumerate(dataset_names)}
country_list = pd.read_csv(f'{REPO_PATH}/utils/country_list/country_list_region_and_continent.csv')
country_dict = dict(zip(country_list["Country"], range(len(country_list))))

# One set of normalized confusion matrices per dataset configuration,
# each stored in a sub-folder named after the dataset.
for i, dataset in enumerate(l1_test_sets):
    matrix_save_path = f"{save_path}{dataset_names[i]}/"
    # Every cell of 'label' / 'prediction' holds a Series; join them into one array.
    true_countries = pd.concat(list(dataset['label'])).to_numpy()
    predicted_countries = pd.concat(list(dataset['prediction'])).to_numpy()
    create_and_save_confusion_matrices(
        REPO_PATH=REPO_PATH,
        SAVE_FIGURES_PATH=matrix_save_path,
        true_countries=true_countries,
        predicted_countries=predicted_countries,
        normalize=True,
    )