In [103]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectFromModel

"""
    balancing
"""
try:
    from imblearn.over_sampling import RandomOverSampler
    IMBLEARN_AVAILABLE = True
except ImportError:
    IMBLEARN_AVAILABLE = False


def eval_classifier(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
    def metrics(split_name, X, y):
        preds = model.predict(X)
        return {
            "split": split_name,
            "accuracy": accuracy_score(y, preds),
            "macro_f1": f1_score(y, preds, average="macro")
        }

    train_m = metrics("train", X_train, y_train)
    val_m   = metrics("val",   X_val,   y_val)
    test_m  = metrics("test",  X_test,  y_test)

    print(f"\n=== {name} ===")
    print("Train:", train_m)
    print("Val:  ", val_m)
    print("Test: ", test_m)

    return {"name": name, "train": train_m, "val": val_m, "test": test_m}

def show_confusion(model, X_test, y_test, label_order=None):
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds, labels=label_order)
    print("\nConfusion Matrix (rows=true, cols=pred):")
    print(cm)
    print("\nClassification Report:")
    print(classification_report(y_test, preds))
    return cm

In [104]:
"""Loading the dataset"""

def load_data(file_path, label_col="emotion_types", drop_cols=None, label_map=None):
    df = pd.read_csv(file_path)

    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not found in {file_path}.")

    """ 
        dropping metadata and label-leak columns.
        Works for both emosounds and iadsed datasets.
    """
    if drop_cols is None:
        drop_cols = [
            "dataset", "splits", "vocals",
            "source", "dominance",
            "BE_Classification_FH", "BE_Classification_FS", "BE_Classification_H",
            "BE_Classification_HF", "BE_Classification_HS",
            "BE_Classification_S", "BE_Classification_U"
        ]

    cols_to_drop = [c for c in drop_cols if c in df.columns] + [label_col]

    """
        keep numeric predictors (which includes arousal and valence)
    """
    X = df.drop(columns=cols_to_drop, errors="ignore")
    X = X.select_dtypes(include=["number"]).copy()

    """
        y: map string labels 
    """
    y_raw = df[label_col]

    """ Mapping values to LABEL_MAP (defined in pipeline execution)"""
    if label_map is not None:
        y = y_raw.map(label_map)
        if y.isna().any():
            bad_vals = sorted(y_raw[y.isna()].unique().tolist())
            raise ValueError(
                "Some labels were not found in LABEL_MAP. Examples:\n"
                f"{bad_vals[:10]}"
            )
        y = y.astype(int).to_numpy()
    else:
        y = y_raw.astype(int).to_numpy()

    return df, X, y


In [105]:
def split(X, y, random_state=70):
    """first split: 60% train, 40% temp (will split in second split)"""
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=0.4,
        random_state=random_state,
        stratify=y
    )
    """ second split: 0.5 of the prior 40% split = 20% validate 20% test"""
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size = 0.5,
        random_state=random_state,
        stratify=y_temp
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [106]:
""" balance training data """
def balance_training_data(X_train, y_train, method="oversample", random_state=78):
    if method == "oversample":
        if not IMBLEARN_AVAILABLE:
            raise ImportError("imblearn is not installed. Either install it or use class_weight in the model.")
        ros = RandomOverSampler(random_state=random_state)
        Xb, yb = ros.fit_resample(X_train, y_train)
        return Xb, yb

    raise ValueError("Unknown balance method. Use method='oversample'.")


In [107]:
"""Feature selection"""
def select_features_tree_importance(X_train, y_train, X_val, X_test, threshold="median", random_state=78):
    base = DecisionTreeClassifier(random_state=random_state)
    base.fit(X_train, y_train)

    selector = SelectFromModel(base, threshold=threshold, prefit=True)
    X_train_fs = selector.transform(X_train)
    X_val_fs = selector.transform(X_val)
    X_test_fs = selector.transform(X_test)

    return selector, X_train_fs, X_val_fs, X_test_fs

In [108]:

def run_decision_tree_pipeline(
    csv_path,
    dataset_name,
    label_col="emotion_types",
    balance=False,
    balance_method="oversample",
    feature_selection=False,
    random_state=78
):
    df, X, y = load_data(csv_path, label_col=label_col, label_map=LABEL_MAP)
    
    X_train, y_train, X_val, y_val, X_test, y_test = split(X, y, random_state=random_state)

    """
      balancing on training only
    """
    if balance:
        X_train_fit, y_train_fit = balance_training_data(X_train, y_train, method=balance_method, random_state=random_state)
        balance_tag = "Balanced"
    else:
        X_train_fit, y_train_fit = X_train, y_train
        balance_tag = "Imbalanced"

    """
        baseline model
    """
    baseline = DecisionTreeClassifier(random_state=random_state)
    baseline.fit(X_train_fit, y_train_fit)
    baseline_results = eval_classifier(
        name=f"{dataset_name} | {balance_tag} | Baseline DT",
        model=baseline,
        X_train=X_train, y_train=y_train,
        X_val=X_val,     y_val=y_val,
        X_test=X_test,   y_test=y_test
    )

    best_model = baseline
    best_name  = f"{dataset_name} | {balance_tag} | Baseline DT"
    best_val_f1 = baseline_results["val"]["macro_f1"]

    """ 
        Feature selection + baseline model
    """
    if feature_selection:
        selector, X_train_fs, X_val_fs, X_test_fs = select_features_tree_importance(
            X_train_fit, y_train_fit, X_val, X_test, threshold="median", random_state=random_state
        )

        baseline_fs = DecisionTreeClassifier(random_state=random_state)
        baseline_fs.fit(X_train_fs, y_train_fit)

        fs_results = eval_classifier(
            name=f"{dataset_name} | {balance_tag} | DT + feature selection",
            model=baseline_fs,
            X_train=selector.transform(X_train), y_train=y_train,
            X_val=X_val_fs,     y_val=y_val,
            X_test=X_test_fs,   y_test=y_test
        )

        if fs_results["val"]["macro_f1"] > best_val_f1:
            best_model = ("fs", selector, baseline_fs)  # store pipeline parts
            best_name = f"{dataset_name} | {balance_tag} | DT + FS(SelectFromModel)"
            best_val_f1 = fs_results["val"]["macro_f1"]

    """
        Hyperparameter tuning (5-fold CV on training)
    """
    param_grid = {
        "max_depth": [3, 4, 5, 6, 8, None],
        "min_samples_split": [2, 3, 5, 10],
        "min_samples_leaf": [1, 2, 5, 10, 20, 25],
        "criterion": ["gini", "entropy"]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    grid = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=random_state),
        param_grid=param_grid,
        scoring="f1_macro",
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train_fit, y_train_fit)

    tuned = grid.best_estimator_
    print(f"\nBest hyperparameters ({dataset_name} | {balance_tag}): {grid.best_params_}")

    tuned.fit(X_train_fit, y_train_fit)

    tuned_results = eval_classifier(
        name=f"{dataset_name} | {balance_tag} | Tuned DT",
        model=tuned,
        X_train=X_train, y_train=y_train,
        X_val=X_val,     y_val=y_val,
        X_test=X_test,   y_test=y_test
    )

    if tuned_results["val"]["macro_f1"] > best_val_f1:
        best_model = tuned
        best_name = f"{dataset_name} | {balance_tag} | Tuned DT"
        best_val_f1 = tuned_results["val"]["macro_f1"]

    # --- Final: confusion matrix on TEST for best model (chosen by val macro-F1) ---
    print(f"\n>>> Best model selected by VAL macro-F1: {best_name} (VAL macro-F1={best_val_f1:.4f})")

    if isinstance(best_model, tuple) and best_model[0] == "fs":
        _, selector, model = best_model
        show_confusion(
            model=model,
            X_test=selector.transform(X_test),
            y_test=y_test,
            label_order=[1, 2, 3, 4]
        )
    else:
        show_confusion(
            model=best_model,
            X_test=X_test,
            y_test=y_test,
            label_order=[1, 2, 3, 4]
        )

    return best_name, best_model

In [109]:
EMO_FINAL = "data/classification/final_emosounds.csv"
IAD_FINAL = "data/classification/final_iadsed.csv"

LABEL_COL = "emotion_types"

LABEL_MAP = {
    "Positive Arousal, Positive Valence": 1,
    "Positive Arousal, Negative Valence": 2,
    "Negative Arousal, Negative Valence": 3,
    "Negative Arousal, Positive Valence": 4,
}


# 1) EmoSounds: imbalanced
run_decision_tree_pipeline(
    EMO_FINAL, "EmoSounds",
    label_col=LABEL_COL,
    balance=False,
    feature_selection=True
)

# 2) EmoSounds: balanced
run_decision_tree_pipeline(
    EMO_FINAL, "EmoSounds",
    label_col=LABEL_COL,
    balance=True,           
    feature_selection=True
)

# 3) IADSED: imbalanced
run_decision_tree_pipeline(
    IAD_FINAL, "IADSED",
    label_col=LABEL_COL,
    balance=False,
    feature_selection=True
)

# 4) IADSED: balanced
run_decision_tree_pipeline(
    IAD_FINAL, "IADSED",
    label_col=LABEL_COL,
    balance=True,
    feature_selection=True
)



=== EmoSounds | Imbalanced | Baseline DT ===
Train: {'split': 'train', 'accuracy': 1.0, 'macro_f1': 1.0}
Val:   {'split': 'val', 'accuracy': 0.5833333333333334, 'macro_f1': 0.47545010905553203}
Test:  {'split': 'test', 'accuracy': 0.5416666666666666, 'macro_f1': 0.457663252240717}

=== EmoSounds | Imbalanced | DT + feature selection ===
Train: {'split': 'train', 'accuracy': 1.0, 'macro_f1': 1.0}
Val:   {'split': 'val', 'accuracy': 0.5583333333333333, 'macro_f1': 0.45390227183705445}
Test:  {'split': 'test', 'accuracy': 0.5333333333333333, 'macro_f1': 0.46441468981821143}





Best hyperparameters (EmoSounds | Imbalanced): {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 5}

=== EmoSounds | Imbalanced | Tuned DT ===
Train: {'split': 'train', 'accuracy': 0.8916666666666667, 'macro_f1': 0.8533200263063769}
Val:   {'split': 'val', 'accuracy': 0.575, 'macro_f1': 0.42073148563326346}
Test:  {'split': 'test', 'accuracy': 0.5583333333333333, 'macro_f1': 0.4497405372405372}

>>> Best model selected by VAL macro-F1: EmoSounds | Imbalanced | Baseline DT (VAL macro-F1=0.4755)

Confusion Matrix (rows=true, cols=pred):
[[ 4  8  0  0]
 [ 5 36  3  3]
 [ 2  6  5  7]
 [ 2  3 16 20]]

Classification Report:
              precision    recall  f1-score   support

           1       0.31      0.33      0.32        12
           2       0.68      0.77      0.72        47
           3       0.21      0.25      0.23        20
           4       0.67      0.49      0.56        41

    accuracy                           0.54       120
   macro avg   




Best hyperparameters (EmoSounds | Balanced): {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}

=== EmoSounds | Balanced | Tuned DT ===
Train: {'split': 'train', 'accuracy': 0.975, 'macro_f1': 0.9790361748170019}
Val:   {'split': 'val', 'accuracy': 0.55, 'macro_f1': 0.45362654901998645}
Test:  {'split': 'test', 'accuracy': 0.6416666666666667, 'macro_f1': 0.5941289559301982}

>>> Best model selected by VAL macro-F1: EmoSounds | Balanced | Baseline DT (VAL macro-F1=0.4999)

Confusion Matrix (rows=true, cols=pred):
[[ 5  6  0  1]
 [ 8 33  2  4]
 [ 1  5 10  4]
 [ 1  0 13 27]]

Classification Report:
              precision    recall  f1-score   support

           1       0.33      0.42      0.37        12
           2       0.75      0.70      0.73        47
           3       0.40      0.50      0.44        20
           4       0.75      0.66      0.70        41

    accuracy                           0.62       120
   macro avg       0.56      0.5




Best hyperparameters (IADSED | Imbalanced): {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 2}

=== IADSED | Imbalanced | Tuned DT ===
Train: {'split': 'train', 'accuracy': 0.7410071942446043, 'macro_f1': 0.6953119250851751}
Val:   {'split': 'val', 'accuracy': 0.4810810810810811, 'macro_f1': 0.3859307014852381}
Test:  {'split': 'test', 'accuracy': 0.478494623655914, 'macro_f1': 0.36964042109275475}

>>> Best model selected by VAL macro-F1: IADSED | Imbalanced | Baseline DT (VAL macro-F1=0.4123)

Confusion Matrix (rows=true, cols=pred):
[[12 15  1  5]
 [15 60 11 11]
 [ 4 12  6  4]
 [ 6 10  2 12]]

Classification Report:
              precision    recall  f1-score   support

           1       0.32      0.36      0.34        33
           2       0.62      0.62      0.62        97
           3       0.30      0.23      0.26        26
           4       0.38      0.40      0.39        30

    accuracy                           0.48       186
   macro av




Best hyperparameters (IADSED | Balanced): {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

=== IADSED | Balanced | Tuned DT ===
Train: {'split': 'train', 'accuracy': 1.0, 'macro_f1': 1.0}
Val:   {'split': 'val', 'accuracy': 0.4486486486486487, 'macro_f1': 0.37664077414918323}
Test:  {'split': 'test', 'accuracy': 0.4838709677419355, 'macro_f1': 0.41257555419046105}

>>> Best model selected by VAL macro-F1: IADSED | Balanced | Baseline DT (VAL macro-F1=0.3766)

Confusion Matrix (rows=true, cols=pred):
[[11 15  1  6]
 [16 57 14 10]
 [ 3 10  6  7]
 [ 2 10  2 16]]

Classification Report:
              precision    recall  f1-score   support

           1       0.34      0.33      0.34        33
           2       0.62      0.59      0.60        97
           3       0.26      0.23      0.24        26
           4       0.41      0.53      0.46        30

    accuracy                           0.48       186
   macro avg       0.41      0.42      0.41

('IADSED | Balanced | Baseline DT', DecisionTreeClassifier(random_state=78))