# Setup

In [1]:
import os
import sys

# Put the directory two levels above the working directory on the import path
# (presumably so the `src.*` imports in the next cell resolve — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_project_root = os.path.abspath("../..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_dirs

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
# Select the CUDA device when PyTorch reports one as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [5]:
from src.experiment.helpers.variables import (
    dataset_scene_dir,
    dataset_scene_trimmed_dir,
)

# Print both scene dataset directories, one per line.
print(dataset_scene_dir, dataset_scene_trimmed_dir, sep="\n")

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene
c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene_trimmed


# Preparation

Load the trimmed scene dataset: features from `X.csv` and labels from `y.csv`.

In [6]:
# Read the feature table and the label table from the trimmed scene directory.
X_copy, y_copy = (
    pd.read_csv(dataset_scene_trimmed_dir / csv_name)
    for csv_name in ('X.csv', 'y.csv')
)

In [7]:
# Base directory for this notebook's reports; the repetition loop further down
# creates its per-repetition sub-directories underneath it.
output_dir = report_output_dirs['multilabel_balanced']
output_dir

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output/multilabel_balanced')

In [8]:
# One class per column of the label table.
num_classes = len(y_copy.columns)
num_classes

6

In [9]:
# Build the multilabel metric dictionary on the CPU (the `device` detected
# earlier is not used here).
# NOTE(review): this one dictionary is passed to every fold of every
# repetition further down — confirm that the metric helpers reset their state
# between folds, otherwise later reports would include earlier folds' data.
metrics = create_full_multilabel_metrics(num_classes=num_classes, device='cpu')
metrics

{'macro_accuracy': <src.experiment.metrics.qualitative.accuracies.MacroAccuracy at 0x22d74ba5010>,
 'micro_accuracy': <src.experiment.metrics.qualitative.accuracies.MicroAccuracy at 0x22d1b0ba510>,
 'accuracy_per_class': <src.experiment.metrics.qualitative.accuracies.PerClassAccuracy at 0x22d1b0bb230>,
 'macro_precision': <src.experiment.metrics.qualitative.precisions.MacroPrecision at 0x22d1b0bb620>,
 'micro_precision': <src.experiment.metrics.qualitative.precisions.MicroPrecision at 0x22d1b0bb770>,
 'precision_per_class': <src.experiment.metrics.qualitative.precisions.PerClassPrecision at 0x22d1b0bba10>,
 'macro_recall': <src.experiment.metrics.qualitative.recalls.MacroRecall at 0x22d1b0bb8c0>,
 'micro_recall': <src.experiment.metrics.qualitative.recalls.MicroRecall at 0x22d1b0bbb60>,
 'recall_per_class': <src.experiment.metrics.qualitative.recalls.PerClassRecall at 0x22d1b0bbcb0>,
 'macro_f1': <src.experiment.metrics.qualitative.fscores.MacroF1 at 0x22d1b0bbe00>,
 'micro_f1': <src.e

In [10]:
# Class names are the column labels of the label table.
class_names = y_copy.columns.tolist()
print(class_names)

['Beach', 'Sunset', 'FallFoliage', 'Field', 'Mountain', 'Urban']


In [11]:
# How many times the whole cross-validation run is repeated.
repetitions = 20

In [12]:
# Work on the underlying arrays so rows can be selected by integer index arrays.
X, y = X_copy.values, y_copy.values

# Repeated cross-validation loop

In [13]:
def cross_validate(X, y, mskf, metrics, class_names, output_dir, index):
    """Run one cross-validation pass and write a results report per fold.

    For each train/test split produced by ``mskf.split(X, y)`` a one-vs-rest
    logistic regression is fitted on the training rows, class probabilities
    are predicted for the test rows, the metrics are updated and computed,
    and a report is written to ``output_dir / "fold_<k>"``.

    Args:
        X: Feature array, indexed with the row indices the splitter yields.
        y: Label array, indexed the same way; its column sums are reported as
            the per-class distribution (assumes 0/1 indicator labels — TODO
            confirm).
        mskf: Splitter object exposing ``split(X, y)``.
        metrics: Metric collection passed to
            ``update_metrics_using_probabilities`` and ``compute_metrics``.
        class_names: Class names recorded in the experiment info.
        output_dir: Base path; one ``fold_<k>`` directory per fold is created
            underneath it.
        index: Repetition index recorded in the experiment info.

    Returns:
        None. Results are written to disk and a message is printed per fold.

    NOTE(review): the same ``metrics`` object is updated on every fold; if the
    metric helpers do not reset state, each report also contains the data of
    earlier folds — confirm against ``compute_metrics``.
    """
    for fold_number, (train_rows, test_rows) in enumerate(mskf.split(X, y)):
        features_train, labels_train = X[train_rows], y[train_rows]
        features_test, labels_test = X[test_rows], y[test_rows]

        # Column sums of the label arrays and their share of the split size.
        train_counts = labels_train.sum(axis=0)
        test_counts = labels_test.sum(axis=0)
        split_summary = fold_info(
            train_distribution=train_counts.tolist(),
            test_distribution=test_counts.tolist(),
            train_distribution_pct=(train_counts / len(labels_train) * 100).tolist(),
            test_distribution_pct=(test_counts / len(labels_test) * 100).tolist(),
        )

        # One independent logistic regression per label column.
        base_estimator = LogisticRegression(max_iter=1000, solver='lbfgs')
        classifier = OneVsRestClassifier(base_estimator)
        classifier.fit(features_train, labels_train)
        probabilities = classifier.predict_proba(features_test)

        update_metrics_using_probabilities(metrics, probabilities, labels_test)
        fold_results = compute_metrics(metrics)

        fold_dir = output_dir / f"fold_{fold_number}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        run_description = experiment_info(
            model_name="Logistic Regression",
            dataset_name="scene",
            classification_type=TaskType.MULTILABEL,
            class_balance="balanced",
            class_names=class_names,
            index=index,
            cv_fold=fold_number,
        )

        write_results_report_to_new_file(
            output_dir_path=fold_dir,
            results=fold_results,
            fold_info=split_summary,
            experiment_info=run_description,
        )

        print(f"Results written to {fold_dir} directory.\n")


In [14]:
# Repeat the stratified 5-fold cross-validation `repetitions` times. Each
# repetition writes its reports to its own numbered directory under
# output_dir / "scene".
#
# The splitter shuffles, so it is seeded explicitly: without a random_state the
# folds (and therefore every reported metric) change on each Restart & Run All.
# Using base_seed + index gives every repetition a different split while
# keeping the whole experiment reproducible.
base_seed = 42
for index in range(repetitions):
    output_dir_with_index = output_dir / "scene" / f"{index}"
    output_dir_with_index.mkdir(parents=True, exist_ok=True)

    mskf = MultilabelStratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=base_seed + index,
    )
    cross_validate(X, y, mskf, metrics, class_names, output_dir_with_index, index)
    
    

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\0\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\0\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\0\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\1\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\scene\1\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel_balanced\sce