In [3]:
import pandas as pd
import numpy as np
import json
import pickle
from ast import literal_eval
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report
from groupyr import LogisticSGLCV
from imblearn.over_sampling import SMOTE, RandomOverSampler

### Load Datasets

In [13]:
X_train = pd.read_csv("../data/X_train.csv")
X_test = pd.read_csv("../data/X_test.csv")
y_train = pd.read_csv("../data/y_train.csv").squeeze()
y_test = pd.read_csv("../data/y_test.csv").squeeze()

# Print exploratory dataset information
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("\nLabel distribution in training set:")
print(y_train.value_counts(normalize=True))

Train shape: (796, 595)
Test shape: (199, 595)

Label distribution in training set:
y
2    0.201005
6    0.154523
1    0.153266
0    0.141960
4    0.136935
3    0.115578
5    0.096734
Name: proportion, dtype: float64


In [14]:
# Extract essentia features only
openl3_cols = X_train.columns[X_train.columns.str.startswith("e")]
X_train_es = X_train.drop(columns = openl3_cols)
X_test_es = X_test.drop(columns = openl3_cols)

### Extract feature and label names 

In [15]:
y_class_mapping = { 
    0:'discharge', 
    1:'diversion',
    2: 'entertainment', 
    3:'mental_work',
    4:'revival', 
    5:'solace', 
    6:'strong_sensation'
}

targets = list(y_class_mapping.values())
feature_names = X_train_es.columns

### Standardize Features

In [16]:
scaler = StandardScaler()
scaler.fit(X_train_es)
X_train_es_std = scaler.transform(X_train_es)
X_test_es_std = scaler.transform(X_test_es)

### Group-Lasso Feature Selection

In [17]:
# Define feature groups
cols = X_train_es.columns.str

melbands_cols = np.where(cols.startswith('lowlevel.melbands'))[0]
spectral_cols = np.where(cols.startswith('lowlevel.spectral'))[0]
mfcc_cols = np.where(cols.startswith('lowlevel.mfcc'))[0]
meta_cols = np.where(cols.startswith('meta.') | cols.contains('popularity'))[0]
rhythm_cols =  np.where(cols.startswith('rhythm.'))[0]
tonal_cols =  np.where(cols.startswith('tonal.'))[0]

categorised_cols = np.concatenate([
    melbands_cols,
    mfcc_cols,
    spectral_cols,
    meta_cols,
    rhythm_cols,
    tonal_cols
])

lowlevel_misc_cols = np.array(list((set(list(range(len(X_train_es.columns)))) - set(categorised_cols))))

groups = [
    lowlevel_misc_cols,
    melbands_cols, 
    spectral_cols,
    mfcc_cols, 
    meta_cols, 
    rhythm_cols, 
    tonal_cols
]

# Build base algorithm
base = LogisticSGLCV(
    groups=groups,                     
    alphas=np.logspace(-4, 1, 15),    
    l1_ratio=[0.01, 0.05, 0.1, 0.3, 0.5, 0.7],
    max_iter=5000,
    tol=1e-6,
    fit_intercept=True,
    n_jobs=-1,
    scoring="accuracy"               
)

### With unbalanced data

In [18]:
ovr = OneVsRestClassifier(base, n_jobs=6)
ovr.fit(X_train_es_std, y_train)

In [22]:
for i, estimator in enumerate(ovr.estimators_):
    print(f"Class {i}:")
    print(f"  Best alpha: {estimator.alpha_}")
    print(f"  Best l1_ratio: {estimator.l1_ratio_}\n")

Class 0:
  Best alpha: 10.0
  Best l1_ratio: 0.01

Class 1:
  Best alpha: 0.013894954943731374
  Best l1_ratio: 0.5

Class 2:
  Best alpha: 10.0
  Best l1_ratio: 0.01

Class 3:
  Best alpha: 0.03162277660168379
  Best l1_ratio: 0.5

Class 4:
  Best alpha: 0.03162277660168379
  Best l1_ratio: 0.1

Class 5:
  Best alpha: 0.006105402296585327
  Best l1_ratio: 0.1

Class 6:
  Best alpha: 10.0
  Best l1_ratio: 0.01



In [139]:
y_pred_unbalanced = ovr.predict(X_test_es_std)
print("Accuracy:", accuracy_score(y_test, y_pred_unbalanced))
print(classification_report(y_test, y_pred_unbalanced))

Accuracy: 0.22110552763819097
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.21      0.20      0.20        30
           2       0.20      0.47      0.28        40
           3       0.42      0.22      0.29        23
           4       0.24      0.33      0.28        27
           5       0.19      0.25      0.21        20
           6       0.00      0.00      0.00        31

    accuracy                           0.22       199
   macro avg       0.18      0.21      0.18       199
weighted avg       0.17      0.22      0.18       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [150]:
# Print selected features for each strategy
selected_features_unbalanced = set() 

for c, est in enumerate(ovr.estimators_):
    coef = est.coef_.ravel()
    selected_idx = np.where(coef != 0)[0]
    sorted_idx = selected_idx[np.argsort(-np.abs(coef[selected_idx]))]
    
    print(f"===Class {targets[c]}:====")
    for idx in sorted_idx:
        feature_name = feature_names[idx]
        print(f"  Feature: {feature_name} | Coef: {coef[idx]}")
        selected_features_unbalanced.add(feature_name)

===Class discharge:====
===Class diversion:====
  Feature: lowlevel.melbands_skewness.stdev | Coef: 0.26059147082536294
  Feature: tonal.key_krumhansl.key_Bb | Coef: 0.25846780975243744
  Feature: lowlevel.mfcc.mean.3 | Coef: -0.15844731406730456
  Feature: tonal.chords_strength.stdev | Coef: -0.12044288476418948
  Feature: lowlevel.dynamic_complexity | Coef: -0.11907207396024351
  Feature: lowlevel.mfcc.mean.2 | Coef: 0.10875810148579927
  Feature: tonal.key_krumhansl.key_Eb | Coef: 0.1071834830616153
  Feature: rhythm.beats_loudness.stdev | Coef: 0.10594861955446495
  Feature: rhythm.bpm | Coef: -0.10407220010105321
  Feature: tonal.key_krumhansl.key_C | Coef: -0.10176619525329463
  Feature: tonal.key_krumhansl.key_G | Coef: 0.09641903892898532
  Feature: tonal.key_krumhansl.strength | Coef: 0.09194359583774035
  Feature: tonal.chords_number_rate | Coef: 0.06868853378354003
  Feature: tonal.key_krumhansl.key_D | Coef: 0.06665581496080415
  Feature: lowlevel.melbands_skewness.mean | C

In [164]:
all_selected_features_unbalanced = sorted(selected_features_unbalanced)
removed_features_unbalanced = set(feature_names) - set(all_selected_features_unbalanced)

print("Removed features:")
for f in removed_features_unbalanced:
    print(f)

with open("../outputs/selected_features.unbalanced.json", 'w') as f:
    json.dump(all_selected_features_unbalanced, f) 

Removed features:
lowlevel.spectral_energy.mean
lowlevel.spectral_decrease.mean
lowlevel.spectral_flux.mean
lowlevel.mfcc.mean.9
tonal.key_krumhansl.scale_nan
lowlevel.spectral_centroid.mean
lowlevel.melbands_spread.mean
lowlevel.spectral_skewness.mean
lowlevel.melbands_kurtosis.mean
lowlevel.spectral_decrease.stdev
lowlevel.zerocrossingrate.stdev
lowlevel.melbands_flatness_db.mean
tonal.key_krumhansl.key_nan
lowlevel.spectral_skewness.stdev
tonal.key_krumhansl.key_F


### With a balanced dataset

In [101]:
# use an random oversampler to balance class distribution 
oversampler = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train_es_std, y_train)

In [102]:
ovr_balanced = OneVsRestClassifier(base, n_jobs=6)
ovr_balanced.fit(X_train_balanced, y_train_balanced)

In [137]:
y_pred_balanced = ovr_balanced.predict(X_test_es_std)
print("Accuracy:", accuracy_score(y_test, y_pred_balanced))
print(classification_report(y_test, y_pred_balanced))

Accuracy: 0.21105527638190955
              precision    recall  f1-score   support

           0       0.21      0.21      0.21        28
           1       0.15      0.17      0.16        30
           2       0.33      0.03      0.05        40
           3       0.21      0.35      0.26        23
           4       0.25      0.52      0.34        27
           5       0.20      0.40      0.27        20
           6       0.00      0.00      0.00        31

    accuracy                           0.21       199
   macro avg       0.19      0.24      0.18       199
weighted avg       0.20      0.21      0.17       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [165]:
# Print selected features for each strategy
selected_features_balanced = set() 

for c, est in enumerate(ovr_balanced.estimators_):
    coef = est.coef_.ravel()
    selected_idx = np.where(coef != 0)[0]
    sorted_idx = selected_idx[np.argsort(-np.abs(coef[selected_idx]))]
    
    print(f"===Class {targets[c]}:====")
    for idx in sorted_idx:
        feature_name = feature_names[idx]
        print(f"  Feature: {feature_name} | Coef: {coef[idx]}")
        selected_features_balanced.add(feature_name)

===Class discharge:====
  Feature: rhythm.onset_rate | Coef: -0.44840439885065064
  Feature: rhythm.beats_loudness.mean | Coef: -0.21564991349870993
  Feature: tonal.key_krumhansl.key_Ab | Coef: 0.14133008765564012
  Feature: tonal.key_krumhansl.strength | Coef: -0.07027408804238432
  Feature: lowlevel.mfcc.mean.8 | Coef: 0.061336194381404946
  Feature: tonal.chords_strength.stdev | Coef: 0.05996880913173668
  Feature: tonal.key_krumhansl.scale_minor | Coef: 0.05944318617091304
  Feature: tonal.key_krumhansl.scale_major | Coef: -0.05944318617091304
  Feature: lowlevel.spectral_kurtosis.mean | Coef: -0.05749645739817166
  Feature: meta.year | Coef: 0.05439144718634144
  Feature: popularity | Coef: 0.049069363747696354
  Feature: lowlevel.spectral_complexity.mean | Coef: 0.04779303522162856
  Feature: tonal.key_krumhansl.key_Bb | Coef: -0.04716918669166009
  Feature: lowlevel.spectral_skewness.mean | Coef: -0.04526778888750176
  Feature: lowlevel.mfcc.mean.5 | Coef: -0.044898837458694385