In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectFromModel

#balancing:
try:
    from imblearn.over_sampling import RandomOverSampler
    IMBLEARN_AVAILABLE = True
except ImportError:
    IMBLEARN_AVAILABLE = False


def eval_classifier(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print train/val/test metrics for ``model`` and return them as a dict.

    The model is only asked to ``predict``; it is never fitted here.

    Parameters
    ----------
    name : label for the model, shown in the printed header and stored in
        the result.
    model : object exposing ``predict(X)``.
    X_train, y_train, X_val, y_val, X_test, y_test : features and true
        labels of the three splits.

    Returns
    -------
    dict
        ``{"name": name, "train": ..., "val": ..., "test": ...}`` where each
        split entry is ``{"split", "accuracy", "macro_f1"}``.
    """
    def metrics(split_name, X, y):
        # Predict once per split and score both metrics on the same predictions.
        preds = model.predict(X)
        return {
            "split": split_name,
            "accuracy": accuracy_score(y, preds),
            "macro_f1": f1_score(y, preds, average="macro"),
        }

    train_m = metrics("train", X_train, y_train)
    val_m = metrics("val", X_val, y_val)
    test_m = metrics("test", X_test, y_test)

    # Interpolate the model name; the header previously printed the literal
    # text "name" because the f-string had no placeholder.
    print(f"\n===== {name} =====")
    print("Train: ", train_m)
    print("Validate: ", val_m)
    print("Test: ", test_m)

    return {"name": name, "train": train_m, "val": val_m, "test": test_m}


def show_confidence(model, X_test, y_test, label_order=None):
    """Print the confusion matrix and classification report for ``model``.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test, y_test : features and true labels to evaluate on.
    label_order : optional list of labels. When given it is passed to both
        the confusion matrix and the classification report so the two
        outputs list the classes in the same order.

    Returns
    -------
    The confusion matrix returned by ``confusion_matrix`` (rows are true
    labels, columns are predicted labels).
    """
    preds = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, preds, labels=label_order)
    print("\nConfusion Matrix (rows=true, cols=pred):")
    print(conf_matrix)
    print("\nClassification Report:")
    # Use the same label ordering as the matrix above; labels=None keeps
    # the library default, so callers that omit label_order see no change.
    print(classification_report(y_test, preds, labels=label_order))
    return conf_matrix

In [None]:
"""Loading the dataset"""

def load_data(file_path, label_col="class_label", drop_cols=None):
    """Read a CSV and split it into a numeric feature frame and integer labels.

    Parameters
    ----------
    file_path : anything ``pd.read_csv`` accepts.
    label_col : name of the column holding the class label.
    drop_cols : optional iterable of column names to exclude from the
        features. When None, a built-in list of metadata and
        ``BE_Classification_*`` columns is used.

    Returns
    -------
    (df, X, y)
        ``df`` is the frame as read, ``X`` holds the remaining numeric
        columns, and ``y`` is ``df[label_col]`` cast to ``int``.

    Raises
    ------
    ValueError
        If ``label_col`` is not a column of the CSV.
    """
    df = pd.read_csv(file_path)

    if label_col not in df.columns:
        raise ValueError(f"Label Column '{label_col}' not found in {file_path}.")

    # Drop metadata columns and label columns
    if drop_cols is None:
        drop_cols = [
            "dataset",
            "splits",
            "vocals",
            "source",
            "dominance",
            "BE_Classification_FH",
            "BE_Classification_FS",
            "BE_Classification_H",
            "BE_Classification_HF",
            "BE_Classification_HS",
            "BE_Classification_S",
            "BE_Classification_U",
        ]

    # Everything below runs for both the default and a caller-supplied
    # drop list; only columns actually present in the frame are dropped.
    cols_to_drop = [c for c in drop_cols if c in df.columns] + [label_col]

    X = df.drop(columns=cols_to_drop, errors="ignore")
    # Keep numeric features only; copy so X is independent of df.
    X = X.select_dtypes(include=["number"]).copy()

    y = df[label_col].astype(int)

    return df, X, y

In [None]:
def split(X, y, random_state=70):
    """Stratified train/validation/test partition (roughly 60/20/20).

    Two passes of ``train_test_split`` are used: the first holds out 40% of
    the rows, the second halves that hold-out into validation and test.
    Both passes stratify on the labels and share ``random_state``.

    Returns ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # Pass 1: 60% train, 40% held out for the second pass.
    train_X, held_X, train_y, held_y = train_test_split(
        X, y, test_size=0.4, random_state=random_state, stratify=y
    )

    # Pass 2: split the hold-out evenly into validation and test.
    val_X, test_X, val_y, test_y = train_test_split(
        held_X, held_y, test_size=0.5, random_state=random_state, stratify=held_y
    )

    return train_X, train_y, val_X, val_y, test_X, test_y