# Setup

In [1]:
import os
import sys

sys.path.append(os.path.abspath("../.."))

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_root_dir

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [3]:
from src.experiment.helpers.variables import dataset_scene_dir
from src.experiment.helpers.variables import dataset_scene_trimmed_dir
# Show both dataset locations, one per line.
print(dataset_scene_dir, dataset_scene_trimmed_dir, sep="\n")

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene
c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene_trimmed


# Preparation

Load datasets

In [5]:
# Read the feature table (X.csv) and the label table (y.csv) of the trimmed scene dataset.
X_copy_scene, y_copy_scene = (
    pd.read_csv(dataset_scene_trimmed_dir / csv_name) for csv_name in ('X.csv', 'y.csv')
)

In [6]:
# Root directory below which perform_experiments builds the per-dataset report folders.
output_dir_base = report_output_root_dir
output_dir_base

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

In [7]:
# One label column per class, so the column count is the number of classes.
num_classes = len(y_copy_scene.columns)
num_classes

6

In [8]:
# Class names are taken from the label table's column headers.
class_names_scene = y_copy_scene.columns.tolist()
print(class_names_scene)

['Beach', 'Sunset', 'FallFoliage', 'Field', 'Mountain', 'Urban']


In [10]:
# Plain arrays of the feature and label tables, used for positional fold indexing.
X_scene, y_scene = X_copy_scene.values, y_copy_scene.values

# Settings

In [48]:
# How many times the cross-validation is repeated for each dataset/model pair.
REPETITIONS = 3

In [49]:
# One-vs-rest logistic regression; the entries differ only in the iteration cap.
models = {
    name: OneVsRestClassifier(LogisticRegression(max_iter=max_iter, solver='lbfgs'))
    for name, max_iter in (
        ('logistic_regression', 1000),
        ('logistic_regression2', 100),
    )
}

In [50]:
# Both entries share the same scene arrays and metadata; only the dataset
# name (which also determines the report sub-directory) differs.
datasets = {
    name: {
        "X": X_scene,
        "y": y_scene,
        "classification_type": "multilabel",
        "class_balance": "balanced",
        "dataset_name": name,
        "class_names": class_names_scene,
    }
    for name in ("scene", "scene2")
}

# Loop

In [64]:
from sklearn.base import clone

def cross_validation(dataset, mskf, metrics, output_dir, index, model, model_name):
    """Run one pass of cross-validation and write a results report per fold.

    For every split produced by ``mskf`` a fresh clone of ``model`` is fitted
    on the training part, its predicted probabilities for the test part are
    fed into ``metrics``, and the computed results are written to
    ``output_dir / "fold_<k>"``.

    Args:
        dataset: Mapping with the keys "X", "y", "classification_type",
            "class_balance", "dataset_name" and "class_names".
        mskf: Splitter whose ``split(X, y)`` yields ``(train_idx, test_idx)``.
        metrics: Metric collection handed to
            ``update_metrics_using_probabilities`` and ``compute_metrics``.
        output_dir: Directory path under which the ``fold_<k>`` folders are
            created.
        index: Repetition index recorded in the experiment info.
        model: Estimator prototype; it is cloned per fold and not fitted itself.
        model_name: Model name recorded in the experiment info.
    """
    features, labels = dataset["X"], dataset["y"]

    for fold, (train_idx, test_idx) in enumerate(mskf.split(features, labels)):
        X_train, y_train = features[train_idx], labels[train_idx]
        X_test, y_test = features[test_idx], labels[test_idx]

        # Per-class label sums of each part, plus their share of the part in percent.
        train_counts = y_train.sum(axis=0)
        test_counts = y_test.sum(axis=0)
        fold_information = fold_info(
            train_distribution=train_counts.tolist(),
            test_distribution=test_counts.tolist(),
            train_distribution_pct=(train_counts / len(y_train) * 100).tolist(),
            test_distribution_pct=(test_counts / len(y_test) * 100).tolist(),
        )

        # Fit an unfitted copy so no state carries over from earlier folds.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)
        y_probas = fold_model.predict_proba(X_test)

        # NOTE(review): `metrics` is not reset in this function; whether values
        # accumulate across folds depends on compute_metrics — confirm.
        update_metrics_using_probabilities(metrics, y_probas, y_test)
        computed_metrics = compute_metrics(metrics)

        fold_dir = output_dir / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        write_results_report_to_new_file(
            output_dir_path=fold_dir,
            results=computed_metrics,
            fold_info=fold_information,
            experiment_info=experiment_info(
                model_name=model_name,
                classification_type=dataset["classification_type"],
                class_balance=dataset["class_balance"],
                dataset_name=dataset["dataset_name"],
                class_names=dataset["class_names"],
                index=index,
                cv_fold=fold,
            ),
        )

        print(f"Results written to {fold_dir} directory.\n")


In [65]:
def repeated_cross_validation(dataset, repetitions, output_dir, metrics, model, model_name,
                              n_splits=5, random_state=None):
    """Repeat stratified multilabel cross-validation several times.

    Each repetition builds a new shuffled ``MultilabelStratifiedKFold`` and
    delegates to ``cross_validation``, writing below ``output_dir / "<index>"``.

    Args:
        dataset: Dataset mapping forwarded to ``cross_validation``.
        repetitions: Number of repetitions to run.
        output_dir: Directory path; one sub-directory per repetition index.
        metrics: Metric collection forwarded to ``cross_validation``.
        model: Estimator prototype forwarded to ``cross_validation``.
        model_name: Model name forwarded to ``cross_validation``.
        n_splits: Number of folds per repetition (default 5, as before).
        random_state: Optional integer base seed. ``None`` (default) keeps the
            previous unseeded behaviour. Otherwise repetition ``i`` uses
            ``random_state + i``, so runs are reproducible while the
            repetitions still get different splits.
    """
    for index in range(repetitions):
        output_dir_with_index = output_dir / f"{index}"
        # Offset the seed per repetition; a shared seed would make every
        # repetition produce the same folds.
        seed = None if random_state is None else random_state + index
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        cross_validation(dataset, mskf, metrics, output_dir_with_index, index, model, model_name)
        

In [66]:
def perform_experiments(datasets, models, repetitions, output_dir_base):
    """Run repeated cross-validation for every dataset/model combination.

    Reports go to
    ``output_dir_base / classification_type / class_balance / dataset_name / model_name``.

    Args:
        datasets: Mapping of dataset descriptions; only the values are used.
        models: Mapping from model name to estimator prototype.
        repetitions: Number of cross-validation repetitions per combination.
        output_dir_base: Root directory path for all reports.
    """
    for dataset_info in datasets.values():
        class_no = len(dataset_info["class_names"])

        output_dir = (
            output_dir_base
            / dataset_info['classification_type']
            / dataset_info['class_balance']
            / dataset_info['dataset_name']
        )
        # A single metric collection is created per dataset and passed to every model run.
        metrics = create_full_multilabel_metrics(class_no, device="cpu")

        for model_name, model in models.items():
            repeated_cross_validation(
                dataset_info, repetitions, output_dir / model_name, metrics, model, model_name
            )
    

In [67]:
# Run every configured model on every configured dataset; reports are written below output_dir_base.
perform_experiments(datasets, models, REPETITIONS, output_dir_base)

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\1\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logis