# Setup

In [1]:
import os
import sys

sys.path.append(os.path.abspath("../.."))

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_root_dir

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

cuda


In [3]:
from src.experiment.helpers.variables import dataset_scene_trimmed_dir, dataset_emotions_trimmed_dir
print(dataset_scene_trimmed_dir)
print(dataset_emotions_trimmed_dir)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\scene_trimmed
c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\emotions_trimmed


In [4]:
output_dir_base = report_output_root_dir
output_dir_base

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

# Dataset preparation

Load datasets

In [5]:
X_scene_df = pd.read_csv(dataset_scene_trimmed_dir / 'X.csv')
y_scene_df = pd.read_csv(dataset_scene_trimmed_dir / 'y.csv')

class_names_scene = list(y_scene_df.columns)
print(class_names_scene)

X_scene = X_scene_df.to_numpy()
y_scene = y_scene_df.to_numpy()

['Beach', 'Sunset', 'FallFoliage', 'Field', 'Mountain', 'Urban']


In [6]:
X_emotions_df = pd.read_csv(dataset_emotions_trimmed_dir / 'X.csv')
y_emotions_df = pd.read_csv(dataset_emotions_trimmed_dir / 'y.csv')

class_names_emotions = list(y_emotions_df.columns)
print(class_names_emotions)

X_emotions = X_emotions_df.to_numpy()
y_emotions = y_emotions_df.to_numpy()

['amazed-suprised', 'happy-pleased', 'relaxing-calm', 'quiet-still', 'sad-lonely', 'angry-aggresive']


# Settings

In [7]:
REPETITIONS = 2

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier

multilabel_models = {
    'logistic_regression': OneVsRestClassifier(LogisticRegression()),
    'linear_svm': OneVsRestClassifier(SVC(kernel='linear', probability=True)),
    'decision_tree': MultiOutputClassifier(DecisionTreeClassifier()), 
    #'random_forest': MultiOutputClassifier(RandomForestClassifier()), 
    #'gradient_boosting': MultiOutputClassifier(GradientBoostingClassifier()),
    'knn': MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5)),
}


In [9]:
multilabel_datasets = {
    "scene": {
        "X": X_scene,
        "y": y_scene,
        "classification_type": "multilabel",
        "class_balance": "balanced",
        "dataset_name": "scene",
        "class_names": class_names_scene,
    },
    "emotions": {
        "X": X_emotions,
        "y": y_emotions,
        "classification_type": "multilabel",
        "class_balance": "balanced",
        "dataset_name": "emotions",
        "class_names": class_names_emotions,
    }
}

# Loop

In [10]:
from sklearn.base import clone

def cross_validation(dataset, mskf, metrics, output_dir, index, model, model_name):
    X = dataset["X"]
    y = dataset["y"]
    
    for fold, (train_idx, test_idx) in enumerate(mskf.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        train_dist = y_train.sum(axis=0)
        test_dist = y_test.sum(axis=0)
        train_pct = train_dist / len(y_train) * 100
        test_pct = test_dist / len(y_test) * 100
        
        fold_information = fold_info(
            train_distribution=train_dist.tolist(),
            test_distribution=test_dist.tolist(),
            train_distribution_pct=train_pct.tolist(),
            test_distribution_pct=test_pct.tolist(),
        )
        
        model_clone = clone(model)
        model_clone.fit(X_train, y_train)
        
        probas = model_clone.predict_proba(X_test)
        
        if isinstance(probas, list):
            y_probas = np.vstack([p[:, 1] for p in probas]).T # MultiOutputClassifer
        else:
            y_probas = probas # OneVsRestClassifier
        
        update_metrics_using_probabilities(metrics, y_probas, y_test)
        computed_metrics = compute_metrics(metrics)
        
        output_dir_final = output_dir / f"fold_{fold}"
        output_dir_final.mkdir(parents=True, exist_ok=True)
        
        exp_info = experiment_info(
            model_name=model_name,
            classification_type=dataset["classification_type"],
            class_balance=dataset["class_balance"],
            dataset_name=dataset["dataset_name"],
            class_names=dataset["class_names"],
            index=index,
            cv_fold=fold,
        )
        
        write_results_report_to_new_file(
            output_dir_path=output_dir_final,
            results=computed_metrics,
            fold_info=fold_information,
            experiment_info=exp_info
            )
        
        print(f"Results written to {output_dir_final} directory.\n")


In [11]:
def repeated_cross_validation(dataset, repetitions, output_dir, metrics, model, model_name):
    for index in range(repetitions):
        output_dir_with_index = output_dir / f"{index}"
        mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True)
        cross_validation(dataset, mskf, metrics, output_dir_with_index, index, model, model_name)
        

In [15]:
def perform_experiments(datasets, models, repetitions, output_dir_base):
    for _, dataset_info in datasets.items():
        class_names = dataset_info["class_names"]
        class_no = len(class_names)
        
        output_dir = output_dir_base / dataset_info['classification_type'] / dataset_info['class_balance'] / dataset_info['dataset_name']
        metrics = create_full_multilabel_metrics(class_no, device="cpu")
        
        for model_name, model in models.items():
            output_dir_with_model = output_dir / model_name
            repeated_cross_validation(dataset_info, repetitions, output_dir_with_model, metrics, model, model_name)
    

In [16]:
perform_experiments(multilabel_datasets, multilabel_models, REPETITIONS, output_dir_base)

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\0\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logistic_regression\1\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\scene\logis

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\0\fold_1 directory.



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\0\fold_3 directory.



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\0\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\1\fold_0 directory.



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\1\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\1\fold_2 directory.



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\1\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\logistic_regression\1\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\linear_svm\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\linear_svm\0\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\linear_svm\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\linear_svm\0\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\emotions\linear_svm\0\fold_4 