# Setup

In [14]:
import os
import sys

# Put the directory two levels above the working directory on the import path
# (presumably the repository root, so that the `src.*` imports in the following
# cells resolve — confirm against the project layout).
# Guarded so that re-running this cell does not append duplicate entries.
_project_root = os.path.abspath("../..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [15]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from src.experiment.sets.metric_sets import create_full_multilabel_metrics

from src.experiment.helpers.variables import report_output_root_dir

from src.experiment.helpers.task_type import TaskType
from src.experiment.metric_processing.metric_calc import update_metrics_using_probabilities, compute_metrics, create_metric_dictionary
from src.experiment.metric_processing.metric_display import print_metric_dictionary
from src.experiment.metric_processing.metric_reports import write_results_report_to_new_file, experiment_info, fold_info

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [16]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

In [17]:
from src.experiment.helpers.variables import dataset_root_dir
# Bare expression: shows the configured dataset root directory as the cell output.
dataset_root_dir

WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets')

In [18]:
# Base directory for all experiment reports; perform_experiments builds the
# per-dataset / per-model output folders beneath it.
output_dir_base = report_output_root_dir
# Bare expression: shows the resolved path as the cell output.
output_dir_base

WindowsPath('C:/VisualStudioRepositories/MUSIC_DATA/metric_analysis/output')

# Settings

In [19]:
# Number of times the whole cross-validation is repeated for each
# dataset/model pair (passed to perform_experiments as `repetitions`).
REPETITIONS = 1

In [20]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def get_preprocessor(X):
    """Build an unfitted ``ColumnTransformer`` matched to the columns of ``X``.

    Every column is assigned to exactly one of three groups:

    * binary      - all non-missing values are 0 or 1; passed through unchanged.
    * numeric     - numeric dtype and not binary; standardised with
                    ``StandardScaler``.
    * categorical - everything else; one-hot encoded, with categories unseen
                    during fitting ignored at transform time.

    Args:
        X: Anything ``pd.DataFrame`` accepts (e.g. a 2-D NumPy array or a
            DataFrame). It is only inspected to decide the column groups;
            nothing is fitted here.

    Returns:
        The unfitted ``ColumnTransformer``. Columns are referenced by the
        column labels of ``pd.DataFrame(X)`` (positional integers when ``X``
        is a NumPy array). Columns not assigned to a group are dropped.

    Side effects:
        Prints a one-line summary with the size of each group.
    """
    # A NumPy array holding mixed types has dtype=object, so every column of the
    # resulting frame would look non-numeric and numeric features would be
    # one-hot encoded instead of scaled. infer_objects() restores a proper
    # per-column dtype where the values allow it.
    X_df = pd.DataFrame(X).infer_objects()

    binary_cols, categorical_cols, numeric_cols = [], [], []
    # Single pass over the columns; avoids repeated `col not in binary_cols`
    # list scans, which are quadratic for wide datasets.
    for col in X_df.columns:
        column = X_df[col]
        if set(column.dropna().unique()).issubset({0, 1}):
            binary_cols.append(col)
        elif pd.api.types.is_numeric_dtype(column):
            numeric_cols.append(col)
        else:
            # Strings and any other non-numeric dtype are treated as categories.
            categorical_cols.append(col)

    preprocessor = ColumnTransformer(
        transformers=[
            # FunctionTransformer without a func is the identity transform.
            ('binary', FunctionTransformer(validate=False), binary_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', StandardScaler(), numeric_cols),
        ],
        remainder='drop'
    )

    print(f"Preprocessor created with {len(binary_cols)} binary, {len(categorical_cols)} categorical, and {len(numeric_cols)} numeric columns.")
    return preprocessor


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

def _logistic_regression_pipeline(X):
    """Return an unfitted pipeline: preprocessing derived from ``X``, then
    one-vs-rest logistic regression."""
    steps = [
        ('preprocess', get_preprocessor(X)),
        ('clf', OneVsRestClassifier(LogisticRegression())),
    ]
    return Pipeline(steps)


# Registry of model factories. Each key is the model name used for output
# folders and reports; each value is a callable that takes the feature matrix
# and returns a fresh, unfitted Pipeline.
multilabel_models = {
    'logistic_regression': _logistic_regression_pipeline,
    # Disabled alternatives, kept for reference:
    # 'linear_svm': lambda X: Pipeline([
    #     ('preprocess', get_preprocessor(X)),
    #     ('clf', OneVsRestClassifier(SVC(kernel='linear', probability=True)))
    # ]),
    #  'decision_tree': lambda X: Pipeline([
    #      ('preprocess', get_preprocessor(X)),
    #      ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
    #  ]),
    #  'random_forest': lambda X: Pipeline([
    #      ('preprocess', get_preprocessor(X)),
    #      ('clf', MultiOutputClassifier(RandomForestClassifier()))
    #  ]),
    #  'gradient_boosting': lambda X: Pipeline([
    #      ('preprocess', get_preprocessor(X)),
    #      ('clf', MultiOutputClassifier(GradientBoostingClassifier()))
    #  ]),
    # 'knn': lambda X: Pipeline([
    #     ('preprocess', get_preprocessor(X)),
    #     ('clf', MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5)))
    # ]),
}


In [22]:
from src.experiment.sets.multilabel_balanced_datasets import multilabel_balanced_datasets
# Bare expression: shows the dataset registry (dataset name -> path and
# metadata) as the cell output.
multilabel_balanced_datasets

{'bookmarks_balanced': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/bookmarks_balanced'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'bookmarks_balanced'},
 'cal500_balanced': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/cal500_balanced'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'cal500_balanced'},
 'corel16k009_balanced': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/corel16k009_balanced'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'corel16k009_balanced'},
 'delicious_balanced': {'path': WindowsPath('c:/VisualStudioRepositories/MUSIC_DATA/datasets/multilabel/delicious_balanced'),
  'classification_type': 'multilabel',
  'class_balance': 'balanced',
  'dataset_name': 'delicious_balanced'},
 'emotions_balanced': {'path': WindowsPath('c:/VisualSt

# Loop

In [23]:
from sklearn.base import clone

def cross_validation(X, y, dataset_info, class_names, mskf, metrics, output_dir, index, model, model_name):
    """Run one cross-validation pass and write a results report for each fold.

    For every split yielded by ``mskf.split(X, y)`` a fresh clone of ``model``
    is fitted on the training rows, probabilities are predicted for the test
    rows, the metrics are updated and computed, and a report is written to
    ``output_dir / "fold_<k>"`` (created if missing).

    Args:
        X: Feature matrix, indexed with the index arrays produced by the splitter.
        y: Label matrix, indexed the same way; its column sums are reported as
            the per-label distribution of each fold.
        dataset_info: Mapping providing "classification_type", "class_balance"
            and "dataset_name" for the report.
        class_names: Label names, forwarded to the report.
        mskf: Splitter object exposing ``split(X, y)``.
        metrics: Metric collection handed to
            ``update_metrics_using_probabilities`` and ``compute_metrics``.
            NOTE(review): the same object is reused for every fold; whether its
            state is reset between folds depends on those helpers — confirm.
        output_dir: Path-like directory under which the fold folders are created.
        index: Repetition index, recorded in the report.
        model: Unfitted estimator; ``fit`` is only ever called on a clone.
        model_name: Model identifier recorded in the report.
    """
    for fold, (train_idx, test_idx) in enumerate(mskf.split(X, y)):
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        # Per-label column sums, and the same as a percentage of the fold's rows.
        label_counts_train = y_train.sum(axis=0)
        label_counts_test = y_test.sum(axis=0)
        label_pct_train = label_counts_train / len(y_train) * 100
        label_pct_test = label_counts_test / len(y_test) * 100

        fold_information = fold_info(
            train_distribution=label_counts_train.tolist(),
            test_distribution=label_counts_test.tolist(),
            train_distribution_pct=label_pct_train.tolist(),
            test_distribution_pct=label_pct_test.tolist(),
        )

        # Fit a clone so the estimator passed in stays unfitted for the next fold.
        fitted_model = clone(model)
        fitted_model.fit(X_train, y_train)
        raw_probas = fitted_model.predict_proba(X_test)

        if not isinstance(raw_probas, list):
            # OneVsRestClassifier: already a single array.
            y_probas = raw_probas
        else:
            # MultiOutputClassifier: one array per label; keep column 1 of each
            # and arrange the result as (rows, labels).
            y_probas = np.vstack([label_probas[:, 1] for label_probas in raw_probas]).T

        update_metrics_using_probabilities(metrics, y_probas, y_test)
        computed_metrics = compute_metrics(metrics)

        output_dir_final = output_dir / f"fold_{fold}"
        output_dir_final.mkdir(parents=True, exist_ok=True)

        exp_info = experiment_info(
            model_name=model_name,
            classification_type=dataset_info["classification_type"],
            class_balance=dataset_info["class_balance"],
            dataset_name=dataset_info["dataset_name"],
            class_names=class_names,
            index=index,
            cv_fold=fold,
        )

        write_results_report_to_new_file(
            output_dir_path=output_dir_final,
            results=computed_metrics,
            fold_info=fold_information,
            experiment_info=exp_info,
        )

        print(f"Results written to {output_dir_final} directory.\n")


In [24]:
def repeated_cross_validation(X, y, dataset_info, class_names, repetitions, output_dir, metrics, model, model_name,
                              n_splits=5, random_state=None):
    """Repeat shuffled multilabel-stratified k-fold cross-validation.

    Each repetition builds a new ``MultilabelStratifiedKFold`` splitter and
    delegates to ``cross_validation``, writing into ``output_dir / "<index>"``.

    Args:
        X, y, dataset_info, class_names, metrics, model, model_name:
            Forwarded unchanged to ``cross_validation``.
        repetitions: Number of cross-validation passes to run.
        output_dir: Path-like base directory; one sub-directory per repetition.
        n_splits: Number of folds per repetition (default 5, the value that was
            previously hard-coded).
        random_state: ``None`` (default) keeps the shuffling unseeded, so every
            run differs. An ``int`` makes the splits reproducible; repetition
            ``i`` uses ``random_state + i`` so that repetitions still get
            different shuffles.
    """
    for index in range(repetitions):
        output_dir_with_index = output_dir / f"{index}"
        # Offset the seed per repetition: reusing one seed would make every
        # repetition produce identical folds.
        seed = None if random_state is None else random_state + index
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        cross_validation(X, y, dataset_info, class_names, mskf, metrics, output_dir_with_index, index, model, model_name)
        

In [25]:
def perform_experiments(datasets, models, repetitions, output_dir_base):
    """Run repeated cross-validation for every model on every dataset.

    For each dataset, ``X.csv`` and ``y.csv`` are read from the dataset's
    ``"path"`` directory, converted to NumPy arrays, and every model factory in
    ``models`` is evaluated with ``repeated_cross_validation``. Reports go to
    ``output_dir_base / <classification_type> / <class_balance> /
    <dataset_name> / <model_name>``.

    Args:
        datasets: Mapping of dataset name to an info mapping with the keys
            "path", "classification_type", "class_balance" and "dataset_name".
        models: Mapping of model name to a callable that takes the feature
            matrix and returns an unfitted estimator.
        repetitions: Number of cross-validation repetitions per model.
        output_dir_base: Path-like root directory for all reports.

    NOTE(review): one metrics object is created per dataset and shared by all
    models of that dataset; confirm the metric helpers reset it between uses.
    """
    for dataset_info in datasets.values():
        dataset_dir = dataset_info["path"]
        X_df = pd.read_csv(dataset_dir / 'X.csv')
        y_df = pd.read_csv(dataset_dir / 'y.csv')

        # The label names are the column headers of y.csv.
        class_names = list(y_df.columns)

        X = X_df.to_numpy()
        y = y_df.to_numpy()

        # <base>/<classification_type>/<class_balance>/<dataset_name>
        output_dir = output_dir_base
        for key in ('classification_type', 'class_balance', 'dataset_name'):
            output_dir = output_dir / dataset_info[key]

        metrics = create_full_multilabel_metrics(len(class_names), device="cpu")

        for model_name, model_fn in models.items():
            model = model_fn(X)
            repeated_cross_validation(
                X, y, dataset_info, class_names, repetitions,
                output_dir / model_name, metrics, model, model_name,
            )
    

In [26]:
# Entry point of the notebook: evaluates every model in `multilabel_models` on
# every dataset in `multilabel_balanced_datasets`, writing reports beneath
# `output_dir_base`.
perform_experiments(multilabel_balanced_datasets, multilabel_models, REPETITIONS, output_dir_base)

Preprocessor created with 2150 binary, 0 categorical, and 0 numeric columns.
Results written to C:\VisualStudioRepositories\MUSIC_DATA\metric_analysis\output\multilabel\balanced\bookmarks_balanced\logistic_regression\0\fold_0 directory.



KeyboardInterrupt: 