# Setup

In [1]:
import os
import sys

# Make the repository root (two levels above the working directory) importable
# so that the ``src.*`` imports below resolve. The membership guard keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath("../..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from pathlib import Path
import json
import numpy as np


In [3]:
from src.experiment.helpers.variables import report_output_root_dir
# Echo the imported output root so the reader can see which directory is
# searched for correlation files and receives the averaged matrices below.
report_output_root_dir

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

### Datasets

In [7]:
from src.experiment.sets.binary_balanced_datasets import binary_balanced_datasets
# Names (keys) of the binary, balanced dataset collection, in definition order.
binary_balanced_dataset_names = [*binary_balanced_datasets.keys()]
binary_balanced_dataset_names

['all-in-one_sentiment_balanced',
 'amazon-reviews-0.25_balanced',
 'ceas_balanced',
 'colon-0.5_balanced',
 'fake-news_balanced',
 'news-sarcasm_balanced',
 'philippine_balanced',
 'santander-customer-satisfaction_balanced',
 'spambase_balanced',
 'vehicle-sensit_balanced']

In [11]:
from src.experiment.sets.binary_imbalanced_datasets import binary_imbalanced_datasets
# Names (keys) of the binary, imbalanced dataset collection, in definition order.
binary_imbalanced_dataset_names = [*binary_imbalanced_datasets.keys()]
binary_imbalanced_dataset_names

['all-in-one_sentiment_imbalanced',
 'amazon-reviews-0.25_imbalanced',
 'ceas_imbalanced',
 'colon-0.5_imbalanced',
 'fake-news_imbalanced',
 'news-sarcasm_imbalanced',
 'philippine_imbalanced',
 'santander-customer-satisfaction_imbalanced',
 'spambase_imbalanced',
 'vehicle-sensit_imbalanced']

In [12]:
from src.experiment.sets.multiclass_balanced_datasets import multiclass_balanced_datasets
# Names (keys) of the multiclass, balanced dataset collection, in definition order.
multiclass_balanced_dataset_names = [*multiclass_balanced_datasets.keys()]
multiclass_balanced_dataset_names

['ag-news_balanced',
 'dbpedia-ontology_balanced',
 'gas-drift_balanced',
 'gtsrb-huelist_balanced',
 'irish-times_balanced',
 'mfeat-karhunen_balanced',
 'news-category_balanced',
 'nyt-comments-april17_balanced',
 'usps_balanced',
 'volkert_balanced']

In [13]:
from src.experiment.sets.multiclass_imbalanced_datasets import multiclass_imbalanced_datasets
# Names (keys) of the multiclass, imbalanced dataset collection, in definition order.
multiclass_imbalanced_dataset_names = [*multiclass_imbalanced_datasets.keys()]
multiclass_imbalanced_dataset_names

['ag-news_imbalanced',
 'dbpedia-ontology_imbalanced',
 'gas-drift_imbalanced',
 'gtsrb-huelist_imbalanced',
 'irish-times_imbalanced',
 'mfeat-karhunen_imbalanced',
 'news-category_imbalanced',
 'nyt-comments-april17_imbalanced',
 'usps_imbalanced',
 'volkert_imbalanced']

In [8]:
from src.experiment.sets.multilabel_balanced_datasets import multilabel_balanced_datasets
# Names (keys) of the multilabel, balanced dataset collection, in definition order.
multilabel_balanced_dataset_names = [*multilabel_balanced_datasets.keys()]
multilabel_balanced_dataset_names

['bookmarks_balanced',
 'emotions_balanced',
 'imdb_balanced',
 'mediamill_balanced',
 'ng20_balanced',
 'nuswidevlad_balanced',
 'scene_balanced',
 'tmc2007500_balanced',
 'yeast_balanced',
 'yelp_balanced']

In [9]:
from src.experiment.sets.multilabel_imbalanced_datasets import multilabel_imbalanced_datasets
# Names (keys) of the multilabel, imbalanced dataset collection, in definition order.
multilabel_imbalanced_dataset_names = [*multilabel_imbalanced_datasets.keys()]
multilabel_imbalanced_dataset_names

['bookmarks_imbalanced',
 'emotions_imbalanced',
 'imdb_imbalanced',
 'mediamill_imbalanced',
 'ng20_imbalanced',
 'nuswidevlad_imbalanced',
 'scene_imbalanced',
 'tmc2007500_imbalanced',
 'yeast_imbalanced',
 'yelp_imbalanced']

In [16]:
# Concatenate the six per-category name lists into one flat list, keeping the
# order: binary, multiclass, multilabel — balanced before imbalanced in each.
all_dataset_names = [
    *binary_balanced_dataset_names,
    *binary_imbalanced_dataset_names,
    *multiclass_balanced_dataset_names,
    *multiclass_imbalanced_dataset_names,
    *multilabel_balanced_dataset_names,
    *multilabel_imbalanced_dataset_names,
]
all_dataset_names

['all-in-one_sentiment_balanced',
 'amazon-reviews-0.25_balanced',
 'ceas_balanced',
 'colon-0.5_balanced',
 'fake-news_balanced',
 'news-sarcasm_balanced',
 'philippine_balanced',
 'santander-customer-satisfaction_balanced',
 'spambase_balanced',
 'vehicle-sensit_balanced',
 'all-in-one_sentiment_imbalanced',
 'amazon-reviews-0.25_imbalanced',
 'ceas_imbalanced',
 'colon-0.5_imbalanced',
 'fake-news_imbalanced',
 'news-sarcasm_imbalanced',
 'philippine_imbalanced',
 'santander-customer-satisfaction_imbalanced',
 'spambase_imbalanced',
 'vehicle-sensit_imbalanced',
 'ag-news_balanced',
 'dbpedia-ontology_balanced',
 'gas-drift_balanced',
 'gtsrb-huelist_balanced',
 'irish-times_balanced',
 'mfeat-karhunen_balanced',
 'news-category_balanced',
 'nyt-comments-april17_balanced',
 'usps_balanced',
 'volkert_balanced',
 'ag-news_imbalanced',
 'dbpedia-ontology_imbalanced',
 'gas-drift_imbalanced',
 'gtsrb-huelist_imbalanced',
 'irish-times_imbalanced',
 'mfeat-karhunen_imbalanced',
 'news-c

In [17]:
# Sanity check: total number of dataset names collected across all six lists.
len(all_dataset_names)

60

# Get all correlation JSONs

In [18]:
# Recursively collect every per-dataset correlation JSON under the output root.
# All matches share the same basename, so sorting by ``.name`` would leave the
# order up to the filesystem walk; sorting by the full path instead gives a
# reproducible ordering across runs and machines.
pearson_correlation_files = sorted(report_output_root_dir.rglob('pearson_correlation.json'))
spearman_correlation_files = sorted(report_output_root_dir.rglob('spearman_correlation.json'))
correlation_files = pearson_correlation_files + spearman_correlation_files


In [19]:
# Number of Pearson files, Spearman files, and both combined — one per line.
print(
    len(pearson_correlation_files),
    len(spearman_correlation_files),
    len(correlation_files),
    sep="\n",
)

60
60
120


# Functions

In [24]:
def average_corr_matrices(files, classification_types, class_balances, datasets, correlation):
    """Average the correlation matrices of all JSON files that match the filters.

    Each file is loaded as JSON and kept only if its ``classification_type``,
    ``class_balance`` and ``dataset_name`` fields are among the accepted values
    and its ``correlation_type`` equals ``correlation``. The
    ``correlation_matrix`` entries (a mapping of metric -> mapping of key ->
    value) of the kept files are then averaged cell by cell.

    Args:
        files: Iterable of paths to JSON files.
        classification_types: Accepted ``classification_type`` values. A single
            string is treated as a one-element list.
        class_balances: Accepted ``class_balance`` values. A single string is
            treated as a one-element list.
        datasets: Accepted ``dataset_name`` values. A single string is treated
            as a one-element list.
        correlation: Required ``correlation_type`` value (compared with ``==``).

    Returns:
        A dict with the (list-normalised) filters under ``classification_types``,
        ``class_balances`` and ``datasets``, the ``correlation_type``, and the
        averaged values under ``matrix``; ``None`` if no file matched.

    Raises:
        KeyError: If a matching file has no ``correlation_matrix`` entry, or its
            matrix lacks a cell that is present in the first matching matrix.
    """
    def _as_list(value):
        # A bare string would make ``x in value`` a substring test
        # (e.g. "balanced" in "imbalanced" is True), so wrap it in a list to
        # force exact matching.
        return [value] if isinstance(value, str) else list(value)

    classification_types = _as_list(classification_types)
    class_balances = _as_list(class_balances)
    datasets = _as_list(datasets)

    correlation_matrices = []

    for file_path in files:
        with open(file_path, 'r') as f:
            data = json.load(f)

        # The file is fully parsed at this point; filter outside the ``with``
        # block so the handle is not held open longer than needed.
        if (
            data.get("classification_type") in classification_types and
            data.get("class_balance") in class_balances and
            data.get("dataset_name") in datasets and
            data.get("correlation_type") == correlation
        ):
            correlation_matrices.append(data["correlation_matrix"])

    if not correlation_matrices:
        return None

    # The first matching matrix defines which cells are averaged.
    reference_matrix = correlation_matrices[0]
    averaged_matrix = {}

    for metric, row in reference_matrix.items():
        averaged_matrix[metric] = {}
        for k in row.keys():
            # Collect all values for this cell across matrices
            values = [m[metric][k] for m in correlation_matrices]
            averaged_matrix[metric][k] = float(np.mean(values))

    full_matrix_info = {
        "classification_types": classification_types,
        "class_balances": class_balances,
        "correlation_type": correlation,
        "datasets": datasets,
        "matrix": averaged_matrix
    }

    return full_matrix_info

In [25]:
def calculate_and_save_matrix(files, classification_types, class_balances, datasets, suffix, root_dir=report_output_root_dir):
    """Compute and save the averaged Pearson and Spearman matrices for one filter set.

    For each of ``"pearson"`` and ``"spearman"``, ``average_corr_matrices`` is
    called with the given filters. A non-empty result is written as indented
    JSON to ``root_dir / "averaged_<type>_correlation_<suffix>.json"``; an empty
    result (no matching files) is skipped without writing anything.

    Args:
        files: Paths of the candidate correlation JSON files.
        classification_types: Accepted classification types.
        class_balances: Accepted class balances.
        datasets: Accepted dataset names.
        suffix: Tag appended to the output file name.
        root_dir: Directory the output files are written to.
    """
    shared_filters = dict(
        files=files,
        classification_types=classification_types,
        class_balances=class_balances,
        datasets=datasets,
    )

    for corr_type in ("pearson", "spearman"):
        matrix_info = average_corr_matrices(correlation=corr_type, **shared_filters)

        # Only write a file when at least one matrix matched the filters.
        if matrix_info:
            output_path = root_dir / f"averaged_{corr_type}_correlation_{suffix}.json"
            with open(output_path, "w") as f:
                json.dump(matrix_info, f, indent=2)

# Create matrices

All datasets (every classification type and class balance)

In [26]:
# Average over every dataset, regardless of classification type or class balance.
calculate_and_save_matrix(
    suffix="all",
    files=correlation_files,
    datasets=all_dataset_names,
    classification_types=["binary", "multiclass", "multilabel"],
    class_balances=["balanced", "imbalanced"],
)

By classification type

In [27]:
# Average over the binary datasets only (balanced and imbalanced variants).
calculate_and_save_matrix(
    suffix="binary",
    files=correlation_files,
    datasets=binary_balanced_dataset_names + binary_imbalanced_dataset_names,
    classification_types=["binary"],
    class_balances=["balanced", "imbalanced"],
)

In [28]:
# Average over the multiclass datasets only (balanced and imbalanced variants).
calculate_and_save_matrix(
    suffix="multiclass",
    files=correlation_files,
    datasets=multiclass_balanced_dataset_names + multiclass_imbalanced_dataset_names,
    classification_types=["multiclass"],
    class_balances=["balanced", "imbalanced"],
)

In [29]:
# Average over the multilabel datasets only (balanced and imbalanced variants).
calculate_and_save_matrix(
    suffix="multilabel",
    files=correlation_files,
    datasets=multilabel_balanced_dataset_names + multilabel_imbalanced_dataset_names,
    classification_types=["multilabel"],
    class_balances=["balanced", "imbalanced"],
)

By class balance

In [30]:
# Average over the balanced datasets of every classification type.
# ``class_balances`` is passed as a list, like in the other calls, so that
# membership is an exact match and the saved JSON metadata has the same shape
# as the other averaged files.
calculate_and_save_matrix(
    files=correlation_files,
    classification_types=["binary", "multiclass", "multilabel"],
    class_balances=["balanced"],
    datasets=binary_balanced_dataset_names + multiclass_balanced_dataset_names + multilabel_balanced_dataset_names,
    suffix="balanced",
)

In [31]:
# Average over the imbalanced datasets of every classification type.
# ``class_balances`` is passed as a list, like in the other calls: with a bare
# string, an ``in`` check is a substring test, and "balanced" is a substring of
# "imbalanced". A list gives an exact match and keeps the saved JSON metadata
# in the same shape as the other averaged files.
calculate_and_save_matrix(
    files=correlation_files,
    classification_types=["binary", "multiclass", "multilabel"],
    class_balances=["imbalanced"],
    datasets=binary_imbalanced_dataset_names + multiclass_imbalanced_dataset_names + multilabel_imbalanced_dataset_names,
    suffix="imbalanced",
)