# Setup

In [1]:
import os
import sys

# Put the directory two levels above the working directory on the import path.
# Presumably that is the repository root holding the `src` package imported
# in the following cells — confirm against the project layout.
sys.path.append(os.path.abspath("../.."))

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_root_dir

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

In [4]:
from src.experiment.helpers.variables import dataset_root_dir
# Echo the configured dataset root so the notebook records which path was used.
dataset_root_dir

WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets')

In [5]:
# Base directory handed to perform_experiments; every report folder is created
# underneath it.
output_dir_base = report_output_root_dir
# Echo the value so the notebook records which path was used.
output_dir_base

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

# Settings

In [5]:
# How many times the whole cross-validation is repeated for each
# dataset/model pair (passed to perform_experiments as `repetitions`).
REPETITIONS = 2

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def get_preprocessor(X):
    """Build an unfitted ColumnTransformer tailored to the columns of ``X``.

    Columns holding exactly two distinct values are passed through untouched
    (identity ``FunctionTransformer``); all remaining columns are standardised
    with ``StandardScaler``. Column membership is decided from ``X`` as given,
    so the returned transformer is only valid for data with the same columns.

    Args:
        X: Anything ``pd.DataFrame`` accepts (e.g. a 2-D array or a frame).

    Returns:
        A ``ColumnTransformer`` with a ``'binary'`` and a ``'num'`` step.
    """
    frame = pd.DataFrame(X)

    # Two distinct values -> treated as an already-encoded binary feature.
    distinct_counts = frame.nunique()
    two_valued = frame.columns[distinct_counts == 2].tolist()
    # Index.difference returns the remaining labels in sorted order.
    to_scale = frame.columns.difference(two_valued).tolist()

    steps = [
        ('binary', FunctionTransformer(validate=False), two_valued),
        ('num', StandardScaler(), to_scale),
    ]
    return ColumnTransformer(transformers=steps, remainder='drop')

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

def _build_pipeline(X, classifier):
    """Chain the data-dependent preprocessor with the given multilabel classifier."""
    return Pipeline([
        ('preprocess', get_preprocessor(X)),
        ('clf', classifier),
    ])


# Maps a model name to a factory taking the feature matrix X and returning a
# fresh, unfitted Pipeline. Factories (not instances) are stored because the
# preprocessing step is derived from the columns of X.
multilabel_models = {
    'logistic_regression': lambda X: _build_pipeline(
        X, OneVsRestClassifier(LogisticRegression())
    ),
    'linear_svm': lambda X: _build_pipeline(
        X, OneVsRestClassifier(SVC(kernel='linear', probability=True))
    ),
    # Currently disabled models:
    # 'decision_tree': lambda X: _build_pipeline(
    #     X, MultiOutputClassifier(DecisionTreeClassifier())
    # ),
    # 'random_forest': lambda X: _build_pipeline(
    #     X, MultiOutputClassifier(RandomForestClassifier())
    # ),
    # 'gradient_boosting': lambda X: _build_pipeline(
    #     X, MultiOutputClassifier(GradientBoostingClassifier())
    # ),
    'knn': lambda X: _build_pipeline(
        X, MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5))
    ),
}


In [7]:
from src.experiment.sets.data_sets import multilabel_datasets
# Echo the dataset registry; each entry is read by perform_experiments via its
# 'path', 'classification_type', 'class_balance' and 'dataset_name' keys.
multilabel_datasets

{'bibtex_trimmed': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/bibtex_trimmed'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'bibtex_trimmed'},
 'emotions_trimmed': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/emotions_trimmed'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'emotions_trimmed'}}

# Experiment loop

In [9]:
from sklearn.base import clone

def cross_validation(X, y, dataset_info, class_names, mskf, metrics, output_dir, index, model, model_name):
    """Run one cross-validation pass and write one results report per fold.

    For every ``(train_idx, test_idx)`` pair produced by ``mskf.split(X, y)``
    a fresh clone of ``model`` is fitted on the training rows, per-label
    probabilities are predicted for the test rows, the metrics are computed
    and a report is written to ``output_dir / f"fold_{fold}"``.

    Args:
        X: Feature array, indexable with the integer index arrays from ``mskf``.
        y: Label array aligned with ``X``; its columns are summed to obtain the
            per-label distribution of each split.
        dataset_info: Mapping providing ``"classification_type"``,
            ``"class_balance"`` and ``"dataset_name"``.
        class_names: Label names forwarded to the report.
        mskf: Splitter exposing ``split(X, y)``.
        metrics: Metric template. It is never updated in place: each fold works
            on its own deep copy (see the note in the loop body).
        output_dir: ``pathlib.Path`` under which the ``fold_<n>`` folders are
            created.
        index: Repetition index, forwarded to the report.
        model: Unfitted estimator; cloned for every fold.
        model_name: Model label forwarded to the report.
    """
    # Function-scope import keeps this cell runnable on its own.
    import copy

    for fold, (train_idx, test_idx) in enumerate(mskf.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Per-label positive counts and their share of the split, in percent.
        train_dist = y_train.sum(axis=0)
        test_dist = y_test.sum(axis=0)
        train_pct = train_dist / len(y_train) * 100
        test_pct = test_dist / len(y_test) * 100

        fold_information = fold_info(
            train_distribution=train_dist.tolist(),
            test_distribution=test_dist.tolist(),
            train_distribution_pct=train_pct.tolist(),
            test_distribution_pct=test_pct.tolist(),
        )

        # Clone so that no fitted state is carried from one fold to the next.
        model_clone = clone(model)
        model_clone.fit(X_train, y_train)

        probas = model_clone.predict_proba(X_test)

        if isinstance(probas, list):
            # MultiOutputClassifier: one (n_samples, n_classes) array per label;
            # keep the second column of each and stack to (n_samples, n_labels).
            # NOTE(review): p[:, 1] assumes both classes of every label occur in
            # the training split — confirm for rare labels.
            y_probas = np.vstack([p[:, 1] for p in probas]).T
        else:
            # OneVsRestClassifier: already a single 2-D array.
            y_probas = probas

        # The caller shares one `metrics` object across folds, repetitions and
        # models. If the metric objects accumulate state on update (their
        # update/compute API suggests so — not visible from this file), reusing
        # them would mix earlier folds/models into this fold's report. Updating
        # a per-fold copy keeps every report independent and leaves the
        # template untouched.
        fold_metrics = copy.deepcopy(metrics)
        update_metrics_using_probabilities(fold_metrics, y_probas, y_test)
        computed_metrics = compute_metrics(fold_metrics)

        output_dir_final = output_dir / f"fold_{fold}"
        output_dir_final.mkdir(parents=True, exist_ok=True)

        exp_info = experiment_info(
            model_name=model_name,
            classification_type=dataset_info["classification_type"],
            class_balance=dataset_info["class_balance"],
            dataset_name=dataset_info["dataset_name"],
            class_names=class_names,
            index=index,
            cv_fold=fold,
        )

        write_results_report_to_new_file(
            output_dir_path=output_dir_final,
            results=computed_metrics,
            fold_info=fold_information,
            experiment_info=exp_info
            )

        print(f"Results written to {output_dir_final} directory.\n")


In [10]:
def repeated_cross_validation(X, y, dataset_info, class_names, repetitions, output_dir, metrics, model, model_name,
                              n_splits=5, random_state=0):
    """Repeat stratified multilabel cross-validation ``repetitions`` times.

    Each repetition builds a new shuffled ``MultilabelStratifiedKFold`` and
    delegates to ``cross_validation``, writing into ``output_dir / "<index>"``.

    Args:
        X, y, dataset_info, class_names, metrics, model, model_name: Forwarded
            unchanged to ``cross_validation``.
        repetitions: Number of cross-validation passes to run.
        output_dir: ``pathlib.Path`` under which one folder per repetition is
            used.
        n_splits: Number of folds per repetition (default 5, as before).
        random_state: Base seed. Repetition ``i`` shuffles with
            ``random_state + i`` so repetitions use different but reproducible
            splits. Pass ``None`` for unseeded shuffling.
    """
    for index in range(repetitions):
        output_dir_with_index = output_dir / f"{index}"
        # Seed per repetition: without it every notebook run yields different
        # folds and the reported numbers cannot be reproduced.
        seed = None if random_state is None else random_state + index
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        cross_validation(X, y, dataset_info, class_names, mskf, metrics, output_dir_with_index, index, model, model_name)
        

In [11]:
def perform_experiments(datasets, models, repetitions, output_dir_base):
    """Run repeated cross-validation for every dataset/model combination.

    Args:
        datasets: Mapping whose values are dataset descriptors with the keys
            ``"path"`` (directory containing ``X.csv`` and ``y.csv``),
            ``"classification_type"``, ``"class_balance"`` and
            ``"dataset_name"``.
        models: Mapping of model name to a factory called with the feature
            array and returning an unfitted estimator.
        repetitions: Number of cross-validation repetitions per model.
        output_dir_base: ``pathlib.Path`` root of the report tree; reports land
            in ``<classification_type>/<class_balance>/<dataset_name>/<model>``.
    """
    for info in datasets.values():
        dataset_dir = info["path"]
        features_df = pd.read_csv(dataset_dir / 'X.csv')
        labels_df = pd.read_csv(dataset_dir / 'y.csv')

        # The label file's header row supplies the class names.
        class_names = labels_df.columns.tolist()

        X = features_df.to_numpy()
        y = labels_df.to_numpy()

        dataset_output_dir = output_dir_base.joinpath(
            info['classification_type'],
            info['class_balance'],
            info['dataset_name'],
        )
        # One metric object per dataset, sized by the number of labels.
        metrics = create_full_multilabel_metrics(len(class_names), device="cpu")

        for model_name, build_model in models.items():
            repeated_cross_validation(
                X,
                y,
                info,
                class_names,
                repetitions,
                dataset_output_dir / model_name,
                metrics,
                build_model(X),
                model_name,
            )
    

In [12]:
# Run every configured model on every multilabel dataset; one report folder is
# written per repetition and fold (see the printed paths below).
perform_experiments(multilabel_datasets, multilabel_models, REPETITIONS, output_dir_base)

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\0\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\0\fold_1 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\0\fold_2 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\0\fold_3 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\0\fold_4 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bibtex_trimmed\logistic_regression\1\fold_0 directory.

Results written to C:\VisualStudioRepositories\MUSIC_DATA\