# Setup

In [2]:
sys.path.append(os.path.abspath("../.."))

In [4]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_dirs

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [6]:
from src.experiment.helpers.variables import dataset_scene_dir
from src.experiment.helpers.variables import dataset_scene_trimmed_dir
print(dataset_scene_dir)
print(dataset_scene_trimmed_dir)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene
c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene_trimmed


# Preparation

In [7]:
X_copy = pd.read_csv(dataset_scene_trimmed_dir / 'X.csv')
y_copy = pd.read_csv(dataset_scene_trimmed_dir / 'y.csv')

In [8]:
output_dir = report_output_dirs['multilabel_balanced']
output_dir

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output/multilabel_balanced')

In [9]:
num_classes = y_copy.shape[1]
num_classes

6

In [10]:
metrics = create_full_multilabel_metrics(num_classes=num_classes, device='cpu')
metrics

{'macro_accuracy': <src.experiment.metrics.qualitative.accuracies.MacroAccuracy at 0x16d94286cf0>,
 'micro_accuracy': <src.experiment.metrics.qualitative.accuracies.MicroAccuracy at 0x16d94287a10>,
 'accuracy_per_class': <src.experiment.metrics.qualitative.accuracies.PerClassAccuracy at 0x16d94287b60>,
 'macro_precision': <src.experiment.metrics.qualitative.precisions.MacroPrecision at 0x16d9443c050>,
 'micro_precision': <src.experiment.metrics.qualitative.precisions.MicroPrecision at 0x16d9443c1a0>,
 'precision_per_class': <src.experiment.metrics.qualitative.precisions.PerClassPrecision at 0x16d9443c2f0>,
 'macro_recall': <src.experiment.metrics.qualitative.recalls.MacroRecall at 0x16d9443c440>,
 'micro_recall': <src.experiment.metrics.qualitative.recalls.MicroRecall at 0x16d9443c590>,
 'recall_per_class': <src.experiment.metrics.qualitative.recalls.PerClassRecall at 0x16d9443c6e0>,
 'macro_f1': <src.experiment.metrics.qualitative.fscores.MacroF1 at 0x16d9443c830>,
 'micro_f1': <src.e

In [11]:
class_names = list(y_copy.columns)
print(class_names)

['Beach', 'Sunset', 'FallFoliage', 'Field', 'Mountain', 'Urban']


In [15]:
index = 1
output_dir_2 = output_dir / "scene" / f"{index}"
output_dir_2

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output/multilabel_balanced/scene/1')

In [16]:
output_dir_2.mkdir(parents=True, exist_ok=True)

# Loop

In [17]:

X = X_copy.values
y = y_copy.values

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(mskf.split(X, y)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    train_dist = y_train.sum(axis=0)
    test_dist = y_test.sum(axis=0)
    train_pct = train_dist / len(y_train) * 100
    test_pct = test_dist / len(y_test) * 100

    print(f"Fold {fold}:")
    print(f"Train label distribution: {train_dist}")
    print(f"Test label distribution:  {test_dist}")
    print(f"Train label percentages:   {np.round(train_pct, 2)}")
    print(f"Test label percentages:    {np.round(test_pct, 2)}\n")
    
    fold_information = fold_info(
        train_distribution=train_dist.tolist(),
        test_distribution=test_dist.tolist(),
        train_distribution_pct=train_pct.tolist(),
        test_distribution_pct=test_pct.tolist(),
    )
    
    logistic_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
    logistic_clf.fit(X_train, y_train)
    print(f"Fold {fold} model trained.\n")
    
    # Get probabilities (sigmoid outputs) for each label
    y_probas = logistic_clf.predict_proba(X_test)
    
    update_metrics_using_probabilities(metrics, y_probas, y_test)
    computed_metrics = compute_metrics(metrics)
    
    print("RESULTS:")
    dictionary = create_metric_dictionary(computed_metrics, class_names)
    print_metric_dictionary(dictionary)
    
    output_dir_3 = output_dir_2 / f"fold_{fold}"
    output_dir_3.mkdir(parents=True, exist_ok=True)
    
    exp_info = experiment_info(
        model_name="Logistic Regression",
        dataset_name="scene",
        classification_type=TaskType.MULTILABEL,
        class_balance="balanced",
        class_names=class_names,
        index=index,
        cv_fold=fold,
    )
    
    write_results_report_to_new_file(
        output_dir_path=output_dir_3,
        results=computed_metrics,
        fold_info=fold_information,
        experiment_info=exp_info
        )
    
    print(f"Results written to {output_dir_3}\n")


Fold 0:
Train label distribution: [333 291 315 335 327 345]
Test label distribution:  [84 73 78 84 82 86]
Train label percentages:   [18.22 15.92 17.23 18.33 17.89 18.87]
Test label percentages:    [18.46 16.04 17.14 18.46 18.02 18.9 ]

Fold 0 model trained.

RESULTS:
macro_accuracy: 0.9017
micro_accuracy: 0.9017
accuracy_per_class:
	Beach: 0.8897, Sunset: 0.9591, FallFoliage: 0.9116, Field: 0.9405, Mountain: 0.8448, Urban: 0.8645, 
macro_precision: 0.7674
micro_precision: 0.7740
precision_per_class:
	Beach: 0.7398, Sunset: 0.9333, FallFoliage: 0.7759, Field: 0.8696, Mountain: 0.6078, Urban: 0.6780, 
macro_recall: 0.6349
micro_recall: 0.6312
recall_per_class:
	Beach: 0.6128, Sunset: 0.8009, FallFoliage: 0.6837, Field: 0.7952, Mountain: 0.3788, Urban: 0.5377, 
macro_f1: 0.6928
micro_f1: 0.6953
f1_per_class:
	Beach: 0.6703, Sunset: 0.8621, FallFoliage: 0.7269, Field: 0.8307, Mountain: 0.4668, Urban: 0.5998, 
MSE: 0.0705
LogLoss: 0.2345
AUNU: 0.9264
micro_ROC-AUC: 0.9349
AUNP: 0.9251
ROC-