# Setup

In [1]:
import os
import sys

# Make the directory two levels up importable (presumably the project root that
# holds the `src` package — confirm). The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
_project_root = os.path.abspath("../..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from pathlib import Path
import json
import numpy as np


In [3]:
from src.experiment.helpers.variables import report_output_root_dir
# Display the resolved output root for a quick visual check; this directory is
# searched further down for the per-run correlation JSON files.
report_output_root_dir

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

In [4]:
from src.experiment.sets.data_sets import multilabel_datasets
# Names (keys) of the multilabel dataset mapping, echoed as the cell output.
multilabel_dataset_names = [*multilabel_datasets.keys()]
multilabel_dataset_names

['bibtex_trimmed', 'emotions_trimmed']

In [23]:
# Datasets included in the averaging below. Only the multilabel datasets are
# listed at the moment; note this is an alias of the same list object, not a copy.
all_dataset_names = multilabel_dataset_names

# Get all correlation JSONs

In [5]:
# Recursively collect every per-run correlation report under the output root.
# All matches share the same file name, so sorting on `.name` compared equal
# keys and left the order up to the filesystem walk; sorting on the full path
# gives a reproducible order from run to run.
pearson_correlation_files = sorted(report_output_root_dir.rglob('pearson_correlation.json'))
spearman_correlation_files = sorted(report_output_root_dir.rglob('spearman_correlation.json'))
# Pearson first, then Spearman; consumers filter on the type stored inside each file.
correlation_files = pearson_correlation_files + spearman_correlation_files


In [6]:
# Sanity check: number of Pearson files, Spearman files, and the combined total.
print(
    len(pearson_correlation_files),
    len(spearman_correlation_files),
    len(correlation_files),
    sep="\n",
)

2
2
4


# Functions

In [29]:
def average_corr_matrices(files, classification_types, class_balances, datasets, correlation):
    """Average the correlation matrices of all report files matching the filters.

    Each file is read as a JSON object. A file is selected when its
    ``classification_type``, ``class_balance`` and ``dataset_name`` values are
    among the accepted ones and its ``correlation_type`` equals ``correlation``.
    The ``correlation_matrix`` of every selected file (a nested
    ``metric -> metric -> value`` mapping) is averaged cell by cell.

    Args:
        files: Iterable of paths to correlation JSON files.
        classification_types: Accepted ``classification_type`` values.
        class_balances: Accepted ``class_balance`` values.
        datasets: Accepted ``dataset_name`` values.
        correlation: Required ``correlation_type`` (e.g. ``"pearson"``).

    Returns:
        A dict holding the filter settings (``classification_types``,
        ``class_balances``, ``correlation_type``, ``datasets``) and the
        cell-wise mean under ``matrix``; ``None`` when no file matches.

    Raises:
        KeyError: If a selected file has no ``correlation_matrix`` entry, or a
            matrix lacks a cell that exists in the first selected matrix.
    """
    correlation_matrices = []

    for file_path in files:
        with open(file_path, 'r') as f:
            data = json.load(f)

        # The file is already closed here; only the parsed content is needed.
        if (
            data.get("classification_type") in classification_types and
            data.get("class_balance") in class_balances and
            data.get("dataset_name") in datasets and
            data.get("correlation_type") == correlation
        ):
            correlation_matrices.append(data["correlation_matrix"])

    if not correlation_matrices:
        return None

    # The first selected matrix defines which cells exist; every other matrix
    # is expected to provide the same cells.
    reference_matrix = correlation_matrices[0]
    averaged_matrix = {
        metric: {
            other: float(np.mean([m[metric][other] for m in correlation_matrices]))
            for other in row
        }
        for metric, row in reference_matrix.items()
    }

    return {
        "classification_types": classification_types,
        "class_balances": class_balances,
        "correlation_type": correlation,
        # Record the datasets this call was filtered on, not a notebook-level
        # global, so the metadata always matches the averaged data.
        "datasets": datasets,
        "matrix": averaged_matrix
    }

In [33]:
def calculate_and_save_matrix(files, classification_types, class_balances, datasets, suffix, root_dir=report_output_root_dir):
    """Write one averaged-correlation JSON file per correlation type.

    For "pearson" and then "spearman", the matching files are averaged with
    ``average_corr_matrices`` and the result is dumped to
    ``root_dir / "averaged_<type>_correlation_<suffix>.json"``. A correlation
    type that yields no result is skipped and no file is written for it.

    Args:
        files: Paths of the correlation JSON files to consider.
        classification_types: Accepted classification types for the filter.
        class_balances: Accepted class-balance values for the filter.
        datasets: Accepted dataset names for the filter.
        suffix: Tag appended to the output file name.
        root_dir: Directory receiving the output files; must support ``/``.
    """
    for correlation_kind in ("pearson", "spearman"):
        averaged = average_corr_matrices(
            files,
            classification_types,
            class_balances,
            datasets,
            correlation=correlation_kind,
        )

        # Nothing matched (or the result is empty): leave no file behind.
        if averaged:
            target = root_dir / f"averaged_{correlation_kind}_correlation_{suffix}.json"
            with open(target, "w") as out_file:
                json.dump(averaged, out_file, indent=2)

# Create matrices

All classification types and class balances combined

In [32]:
# Pool every classification type and both balance settings into one averaged
# matrix per correlation type; the output files carry the "all" suffix.
calculate_and_save_matrix(
    correlation_files,
    ["binary", "multiclass", "multilabel"],
    ["balanced", "imbalanced"],
    all_dataset_names,
    suffix="all",
)

By classification type

In [34]:
# One averaged matrix per classification type, pooling both balance settings;
# the classification type doubles as the output file suffix.
for cls_type in ("binary", "multiclass", "multilabel"):
    calculate_and_save_matrix(
        correlation_files,
        [cls_type],
        ["balanced", "imbalanced"],
        all_dataset_names,
        suffix=cls_type,
    )

By class balance

In [35]:
# One averaged matrix per balance setting, pooling all classification types;
# the balance setting doubles as the output file suffix.
for balance in ("balanced", "imbalanced"):
    calculate_and_save_matrix(
        correlation_files,
        ["binary", "multiclass", "multilabel"],
        [balance],
        all_dataset_names,
        suffix=balance,
    )