Imports

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

## Training and evaluation function.

In [67]:
def train_and_evaluate_dt_full_pipeline(file_path, target_cols, meta_cols, name, feat_select=False,
                                        importance_threshold=0.01, random_state=78):
    """Tune, fit and evaluate a decision-tree regressor on one preprocessed CSV.

    Steps:
      1. Read ``file_path`` and build the predictor matrix from every numeric
         column that is not listed in ``target_cols`` or ``meta_cols``.
      2. Hold out 20% of the rows as a test set.
      3. Grid-search ``max_depth`` / ``min_samples_split`` / ``min_samples_leaf``
         with shuffled 5-fold CV on the remaining 80%, scored by negative MSE.
      4. Optionally (``feat_select=True``) keep only the features whose
         importance in the tuned tree exceeds ``importance_threshold`` and refit
         the tuned tree on that subset. Hyperparameters are NOT re-tuned for
         the reduced feature set.
      5. Print train/test MSE and RMSE.

    NOTE(review): only ``target_cols`` and ``meta_cols`` are removed from the
    predictors, so any other numeric annotation column present in the CSV
    (for instance a second rating column) stays in X as a feature. Confirm
    that this is intended.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to load.
    target_cols : list of str
        Column(s) to predict. Only the first entry is shown in the header line.
    meta_cols : list of str
        Columns to drop before modelling; names missing from the file are ignored.
    name : str
        Dataset label used in the printed header.
    feat_select : bool, default False
        Whether to refit on the importance-selected feature subset.
    importance_threshold : float, default 0.01
        Features are kept when their importance is strictly greater than this.
    random_state : int, default 78
        Seed shared by the train/test split, the CV folds and the tree.

    Returns
    -------
    best_model : DecisionTreeRegressor
        The fitted estimator that produced the reported scores.
    feature_names : pandas.Index
        Names of the columns ``best_model`` was fitted on, in fit order.
    """
    print(f"\n--------Decision tree for dataset: {name} ||| Target: {target_cols[0]} ||| Feature selection: {feat_select}    ---------")

    df = pd.read_csv(file_path)
    X = df.drop(columns=list(target_cols) + list(meta_cols), errors='ignore')
    # Get only numerical features
    X = X.select_dtypes(include=[np.number])
    y = df[target_cols]

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.20, random_state=random_state
    )

    # max_depth: limits how deep the tree grows (prevents overfitting)
    # min_samples_split: minimum data points required to make a new branch
    # min_samples_leaf: minimum data points that must end up in a final leaf (prevents overfitting)
    param_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None],
        'min_samples_split': [2, 3, 5],
        'min_samples_leaf': [1, 5, 10, 15, 18, 20, 25, 30, 40, 50],
    }

    dt = DecisionTreeRegressor(random_state=random_state)
    # 5 fold
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Use grid search to check hyperparam combinations.
    # Train scores are not requested: nothing below reads them and computing
    # them only adds work to every fold.
    grid_search = GridSearchCV(
        estimator=dt,
        param_grid=param_grid,
        cv=kf,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )

    grid_search.fit(X_train_val, y_train_val)

    print(f"Best Hyperparameters: {grid_search.best_params_}")

    # With the default refit=True, GridSearchCV has already refitted the best
    # configuration on the whole train/validation split.
    best_model = grid_search.best_estimator_

    # Columns used for the final fit and evaluation; narrowed below when
    # feature selection keeps a subset.
    model_features = X_train_val.columns
    X_train_eval = X_train_val
    X_test_eval = X_test

    # If feature selection, use importance rankings given by decision tree and refit the model
    if feat_select:
        feature_df = pd.DataFrame({
            'Feature': X_train_val.columns,
            'Importance': best_model.feature_importances_,
        })
        feature_df = feature_df.sort_values(by='Importance', ascending=False)

        keep_mask = feature_df['Importance'] > importance_threshold
        top_feature_names = feature_df.loc[keep_mask, 'Feature'].tolist()

        print(f"Selected {len(top_feature_names)} features with importance > {importance_threshold * 100:g}%")
        print(top_feature_names)

        if top_feature_names:
            X_train_eval = X_train_val[top_feature_names]
            X_test_eval = X_test[top_feature_names]
            # Same tuned hyperparameters, refitted in place on the reduced columns.
            best_model.fit(X_train_eval, y_train_val)
            model_features = pd.Index(top_feature_names)
        else:
            # Fitting on zero columns would raise; keep the all-feature model instead.
            print("No feature exceeded the importance threshold; keeping the model fitted on all features.")

    train_preds = best_model.predict(X_train_eval)
    test_preds = best_model.predict(X_test_eval)

    train_mse = mean_squared_error(y_train_val, train_preds)
    test_mse = mean_squared_error(y_test, test_preds)

    print("\nPerformance:")
    print(f"Training MSE: {train_mse:.4f} (RMSE: {np.sqrt(train_mse):.4f})")
    print(f"Test MSE:     {test_mse:.4f} (RMSE: {np.sqrt(test_mse):.4f})")

    # Return the names the model was actually fitted on, so that downstream
    # uses (e.g. labelling a tree plot) line up with the fitted tree.
    return best_model, model_features

# Run training and evaluation for arousal and valence on the EmoSounds dataset

### Metadata columns to remove (non-numerical columns)

In [68]:
# Per-dataset metadata columns that are passed as `meta_cols` and dropped
# from the predictors before modelling.
emo_meta = [
    'dataset',
    'fnames',
    'genre',
    'splits',
    'vocals',
]
iad_meta = [
    'source',
    'description',
    'category',
    'fname',
    'BE_Classification',
]

### EmoSounds arousal Without feature selection

In [69]:
# EmoSounds / arousal, tree fitted on all numeric predictors
best_emo_arousal, emo_arousal_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/EmoSounds_preprocessed.csv',
    target_cols=['arousal'],
    meta_cols=emo_meta,
    name="EmotionSounds",
    feat_select=False,
)


--------Decision tree for dataset: EmotionSounds ||| Target: arousal ||| Feature selection: False    ---------
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 25, 'min_samples_split': 2}

Performance:
Training MSE: 0.0492 (RMSE: 0.2217)
Test MSE:     0.1217 (RMSE: 0.3488)


### EmoSounds arousal With feature selection

In [70]:
# EmoSounds / arousal, tree refitted on the importance-selected predictors
best_emo_arousal, emo_arousal_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/EmoSounds_preprocessed.csv',
    target_cols=['arousal'],
    meta_cols=emo_meta,
    name="EmotionSounds",
    feat_select=True,
)


--------Decision tree for dataset: EmotionSounds ||| Target: arousal ||| Feature selection: True    ---------
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 25, 'min_samples_split': 2}
Selected 2 features with importance > 1%
['spectral_roughness_mean', 'spectral_spectentropy_mean']

Performance:
Training MSE: 0.0551 (RMSE: 0.2348)
Test MSE:     0.1365 (RMSE: 0.3694)


### EmoSounds valence Without feature selection

In [71]:
# EmoSounds / valence, tree fitted on all numeric predictors
best_emo_valence, emo_valence_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/EmoSounds_preprocessed.csv',
    target_cols=['valence'],
    meta_cols=emo_meta,
    name="EmotionSounds",
    feat_select=False,
)


--------Decision tree for dataset: EmotionSounds ||| Target: valence ||| Feature selection: False    ---------
Best Hyperparameters: {'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 2}

Performance:
Training MSE: 0.1060 (RMSE: 0.3256)
Test MSE:     0.1756 (RMSE: 0.4191)


### EmoSounds valence With feature selection

In [72]:
# EmoSounds / valence, tree refitted on the importance-selected predictors
best_emo_valence, emo_valence_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/EmoSounds_preprocessed.csv',
    target_cols=['valence'],
    meta_cols=emo_meta,
    name="EmotionSounds",
    feat_select=True,
)


--------Decision tree for dataset: EmotionSounds ||| Target: valence ||| Feature selection: True    ---------
Best Hyperparameters: {'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 2}
Selected 10 features with importance > 1%
['spectral_roughness_mean', 'arousal', 'timbre_lowenergy_std', 'timbre_lowenergy_mean', 'spectral_mfcc_std_9', 'spectral_mfcc_std_10', 'spectral_mfcc_std_13', 'tonal_keyclarity_std', 'rhythm_eventdensity_mean', 'spectral_spread_std']

Performance:
Training MSE: 0.1065 (RMSE: 0.3264)
Test MSE:     0.1676 (RMSE: 0.4093)


# Run training and evaluation for arousal and valence on the IADSED dataset

### IADSED arousal Without feature selection

In [73]:
# IADSED / arousal, tree fitted on all numeric predictors
best_iad_arousal, iad_arousal_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/IADSED_preprocessed.csv',
    target_cols=['arousal'],
    meta_cols=iad_meta,
    name="IADSED",
    feat_select=False,
)


--------Decision tree for dataset: IADSED ||| Target: arousal ||| Feature selection: False    ---------
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 18, 'min_samples_split': 2}

Performance:
Training MSE: 0.5047 (RMSE: 0.7105)
Test MSE:     0.8064 (RMSE: 0.8980)


### IADSED arousal With feature selection

In [74]:
# IADSED / arousal, tree refitted on the importance-selected predictors
best_iad_arousal, iad_arousal_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/IADSED_preprocessed.csv',
    target_cols=['arousal'],
    meta_cols=iad_meta,
    name="IADSED",
    feat_select=True,
)


--------Decision tree for dataset: IADSED ||| Target: arousal ||| Feature selection: True    ---------
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 18, 'min_samples_split': 2}
Selected 11 features with importance > 1%
['timbre_spectralflux_mean', 'dominance', 'spectral_skewness_mean', 'spectral_flatness_mean', 'valence', 'spectral_spectentropy_mean', 'dynamics_rms_mean', 'spectral_brightness_std', 'tonal_keyclarity_mean', 'spectral_rolloff85_std', 'spectral_mfcc_std_8']

Performance:
Training MSE: 0.5048 (RMSE: 0.7105)
Test MSE:     0.7931 (RMSE: 0.8905)


### IADSED valence Without feature selection

In [75]:
# IADSED / valence, tree fitted on all numeric predictors
best_iad_valence, iad_valence_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/IADSED_preprocessed.csv',
    target_cols=['valence'],
    meta_cols=iad_meta,
    name="IADSED",
    feat_select=False,
)


--------Decision tree for dataset: IADSED ||| Target: valence ||| Feature selection: False    ---------
Best Hyperparameters: {'max_depth': 4, 'min_samples_leaf': 50, 'min_samples_split': 2}

Performance:
Training MSE: 0.7090 (RMSE: 0.8420)
Test MSE:     0.8309 (RMSE: 0.9116)


### IADSED valence With feature selection

In [76]:
# IADSED / valence, tree refitted on the importance-selected predictors
best_iad_valence, iad_valence_features = train_and_evaluate_dt_full_pipeline(
    file_path='../data/preprocessed/IADSED_preprocessed.csv',
    target_cols=['valence'],
    meta_cols=iad_meta,
    name="IADSED",
    feat_select=True,
)


--------Decision tree for dataset: IADSED ||| Target: valence ||| Feature selection: True    ---------
Best Hyperparameters: {'max_depth': 4, 'min_samples_leaf': 50, 'min_samples_split': 2}
Selected 3 features with importance > 1%
['dominance', 'tonal_keyclarity_mean', 'timbre_lowenergy_std']

Performance:
Training MSE: 0.7090 (RMSE: 0.8420)
Test MSE:     0.8309 (RMSE: 0.9116)
