In [75]:
import json
import os
from ast import literal_eval

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC

### Load and explore data files

In [5]:
# Read the pre-split feature matrices (one CSV per split).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/X_train.csv", "../data/X_test.csv")
)
# Label files are read the same way; squeeze() reduces each frame to a Series.
y_train, y_test = (
    pd.read_csv(csv_path).squeeze()
    for csv_path in ("../data/y_train.csv", "../data/y_test.csv")
)

# Sanity overview: split sizes and relative class frequencies of the training labels.
train_label_share = y_train.value_counts(normalize=True)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("\nLabel distribution in training set:")
print(train_label_share)

Train shape: (796, 595)
Test shape: (199, 595)

Label distribution in training set:
y
2    0.201005
6    0.154523
1    0.153266
0    0.141960
4    0.136935
3    0.115578
5    0.096734
Name: proportion, dtype: float64


In [None]:
def drop_l3_embeddings(df, prefix="e", n_dims=512):
    """Return a copy of ``df`` without the numbered embedding columns.

    A column is dropped when its label is a string of the form
    ``<prefix><integer>`` with ``0 <= integer < n_dims`` (``e0`` .. ``e511``
    with the defaults). Presumably these are the OpenL3 embedding dimensions
    -- TODO confirm against the feature-extraction step.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    prefix : str, default "e"
        Leading text that identifies an embedding column.
    n_dims : int, default 512
        Number of embedding dimensions; indices ``0 .. n_dims - 1`` are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column that is not an embedding column.
    """
    def is_l3(col):
        # Non-string labels (e.g. integer column names) can never be embedding columns.
        if not isinstance(col, str) or not col.startswith(prefix):
            return False
        suffix = col[len(prefix):]
        # isdecimal() only accepts characters that int() can parse; isdigit() also
        # accepts e.g. superscript digits, on which int() raises ValueError.
        return suffix.isdecimal() and int(suffix) < n_dims

    cols_to_drop = [c for c in df.columns if is_l3(c)]
    return df.drop(columns=cols_to_drop)

# Apply the same embedding-column removal to the train and test feature matrices.
X_train_f, X_test_f = (drop_l3_embeddings(split) for split in (X_train, X_test))

print("Shapes (features):", X_train_f.shape, X_test_f.shape)

### Train and evaluate two baselines: 1) majority and 2) random classifier

In [6]:
# BASELINE 1: Majority classifier
# Always predicts the most frequent training label, so it sets the floor for accuracy.
majority_baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred_majority = majority_baseline.predict(X_test)

majority_acc = accuracy_score(y_test, y_pred_majority)
majority_macro_f1 = f1_score(y_test, y_pred_majority, average="macro")
# zero_division=0 reports 0.0 (without a warning) for classes that are never predicted.
majority_report = classification_report(y_test, y_pred_majority, zero_division=0)

print("MAJORITY BASELINE")
print("Accuracy:", majority_acc)
print("Macro-F1:", majority_macro_f1)
print(majority_report)

MAJORITY BASELINE
Accuracy: 0.20100502512562815
Macro-F1: 0.04781829049611476
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.00      0.00      0.00        30
           2       0.20      1.00      0.33        40
           3       0.00      0.00      0.00        23
           4       0.00      0.00      0.00        27
           5       0.00      0.00      0.00        20
           6       0.00      0.00      0.00        31

    accuracy                           0.20       199
   macro avg       0.03      0.14      0.05       199
weighted avg       0.04      0.20      0.07       199



In [7]:
# BASELINE 2: Random classifier
# Draws every prediction uniformly at random over the classes; seeded so the numbers are reproducible.
random_baseline = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)
y_pred_random = random_baseline.predict(X_test)

random_acc = accuracy_score(y_test, y_pred_random)
random_macro_f1 = f1_score(y_test, y_pred_random, average="macro")
# zero_division=0 reports 0.0 (without a warning) for classes that are never predicted.
random_report = classification_report(y_test, y_pred_random, zero_division=0)

print("RANDOM BASELINE")
print("Accuracy:", random_acc)
print("Macro-F1:", random_macro_f1)
print(random_report)

RANDOM BASELINE
Accuracy: 0.1708542713567839
Macro-F1: 0.166605585836632
              precision    recall  f1-score   support

           0       0.21      0.18      0.19        28
           1       0.19      0.17      0.18        30
           2       0.23      0.15      0.18        40
           3       0.06      0.09      0.07        23
           4       0.21      0.22      0.22        27
           5       0.09      0.10      0.09        20
           6       0.22      0.26      0.24        31

    accuracy                           0.17       199
   macro avg       0.17      0.17      0.17       199
weighted avg       0.18      0.17      0.17       199



### Train and evaluate three models (LogReg, LinearSVM, RF) on the full feature set (e0–e511 embedding columns dropped, no feature selection)

In [14]:
# Evaluate function
def evaluate(name, est, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``est`` on a training split, score it on a held-out split and print a report.

    Parameters
    ----------
    name : str
        Label used in the printed header and returned in the result tuple.
    est : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is (re)fitted in place.
    X_fit, y_fit : optional
        Training features / labels. When omitted, the notebook-level
        ``X_train_f`` / ``y_train`` are used.
    X_eval, y_eval : optional
        Evaluation features / labels. When omitted, the notebook-level
        ``X_test_f`` / ``y_test`` are used.

    Returns
    -------
    tuple
        ``(name, accuracy, macro_f1)`` measured on the evaluation split.
    """
    # Resolve the defaults at call time so the function always sees the current globals.
    if X_fit is None:
        X_fit = X_train_f
    if y_fit is None:
        y_fit = y_train
    if X_eval is None:
        X_eval = X_test_f
    if y_eval is None:
        y_eval = y_test

    est.fit(X_fit, y_fit)
    y_pred = est.predict(X_eval)
    acc = accuracy_score(y_eval, y_pred)
    mf1 = f1_score(y_eval, y_pred, average="macro")
    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print("Macro-F1:", mf1)
    # zero_division=0: a metric whose denominator is zero (e.g. precision of a class that
    # is never predicted) is reported as 0.0 instead of triggering a warning.
    print(classification_report(y_eval, y_pred, zero_division=0))
    return name, acc, mf1

# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
# so every grid search in this notebook sees the same folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [15]:
# MODEL 1: LOGISTIC REGRESSION
# Standardise the features (zero mean, unit variance), then fit a class-balanced
# logistic regression; the regularisation strength C is tuned by cross-validated macro-F1.
pipe_lr = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        (
            "clf",
            LogisticRegression(
                solver="lbfgs",
                class_weight="balanced",
                max_iter=4000,
                random_state=42,
            ),
        ),
    ]
)
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_lr.fit(X_train_f, y_train)

# MODEL 2: LINEAR SVM
# Standardise the features, then tune C by cross-validated macro-F1.
# The C=1.0 given to the constructor is only a starting value; the grid search overrides it.
pipe_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
    ]
)
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_svm.fit(X_train_f, y_train)

# MODEL 3: RANDOM FOREST
# Only constructed here (no scaling, no grid search); it is fitted when it is evaluated.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    class_weight="balanced_subsample",
    random_state=42,
)

In [16]:
# Fit and score each candidate; every result row is (model, accuracy, macro-F1).
candidates = [
    ("LogReg (scaled, balanced)", gs_lr.best_estimator_),
    ("LinearSVC (scaled, balanced)", gs_svm.best_estimator_),
    ("RandomForest (balanced_subsample)", rf),
]
rows = [evaluate(label, model) for label, model in candidates]

# Rank the models by macro-F1 (best first), then list the hyper-parameters
# chosen by the two grid searches.
summary = pd.DataFrame(rows, columns=["model", "test_accuracy", "test_macro_f1"])
summary = summary.sort_values("test_macro_f1", ascending=False)
print("Summaries (sorted on macro_f1)")
print(summary)
print("\nBest params:")
print("- LogReg:", gs_lr.best_params_)
print("- LinearSVC:", gs_svm.best_params_)


=== LogReg (scaled, balanced) ===
Accuracy: 0.22110552763819097
Macro-F1: 0.21230367287598054
              precision    recall  f1-score   support

           0       0.15      0.14      0.15        28
           1       0.22      0.17      0.19        30
           2       0.36      0.35      0.35        40
           3       0.19      0.17      0.18        23
           4       0.24      0.30      0.26        27
           5       0.21      0.30      0.25        20
           6       0.11      0.10      0.10        31

    accuracy                           0.22       199
   macro avg       0.21      0.22      0.21       199
weighted avg       0.22      0.22      0.22       199


=== LinearSVC (scaled, balanced) ===
Accuracy: 0.20603015075376885
Macro-F1: 0.19239616485518127
              precision    recall  f1-score   support

           0       0.12      0.11      0.12        28
           1       0.20      0.13      0.16        30
           2       0.37      0.38      0.37    

### Train and evaluate RF model with feature selection

In [22]:
# Keep only the columns named in the stored feature-selection result.
with open("../outputs/selected_features.unbalanced.json") as f:
    selected_features = json.load(f)

# Identical column subset for both splits so train and test stay aligned.
X_train_fs, X_test_fs = X_train_f[selected_features], X_test_f[selected_features]

print("Shapes (features):", X_train_fs.shape, X_test_fs.shape)

Shapes (features): (796, 68) (199, 68)


In [97]:
# Random forest on the selected features, wrapped in a one-step pipeline so the
# grid keys use the "clf__" prefix.
pipe_rf = Pipeline(
    steps=[
        ("clf", RandomForestClassifier(class_weight="balanced_subsample", random_state=42)),
    ]
)

# 3 x 3 x 2 = 18 hyper-parameter combinations, each scored by cross-validated macro-F1.
param_grid = dict(
    clf__n_estimators=[150, 175, 200],
    clf__max_depth=[None, 20, 40],
    clf__min_samples_leaf=[1, 2],
)

gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
gs.fit(X_train_fs, y_train)

best = gs.best_estimator_
print("\n Best params:", gs.best_params_)
print("Best CV Macro-F1:", round(gs.best_score_, 3))

# Held-out performance of the refitted best configuration.
y_pred = best.predict(X_test_fs)
test_accuracy = accuracy_score(y_test, y_pred)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\n RandomForest + Feature Selection")
print("Accuracy:", round(test_accuracy, 3))
print("Macro-F1:", round(test_macro_f1, 3))
print(classification_report(y_test, y_pred, zero_division=0))


 Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 200}
Best CV Macro-F1: 0.222

 RandomForest + Feature Selection
Accuracy: 0.327
Macro-F1: 0.286
              precision    recall  f1-score   support

           0       0.27      0.21      0.24        28
           1       0.35      0.23      0.28        30
           2       0.35      0.72      0.47        40
           3       0.39      0.30      0.34        23
           4       0.28      0.30      0.29        27
           5       0.29      0.10      0.15        20
           6       0.32      0.19      0.24        31

    accuracy                           0.33       199
   macro avg       0.32      0.30      0.29       199
weighted avg       0.32      0.33      0.30       199



In [98]:
# Save best RF model
# NOTE: `gs` is rebound by the response_id grid search further down, so this cell
# must run right after the feature-selection search to persist the intended model.
# Requires `import joblib` in the notebook's import cell.
model_path = "../outputs/best_rf_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)  # joblib.dump does not create missing directories
best_rf = gs.best_estimator_
joblib.dump(best_rf, model_path)

['../outputs/best_rf_model.pkl']

### Train and evaluate RF model (with response_id)

#### Prepare dataset with response_id

In [83]:
# Load the combined table, then remove the e0..e511 embedding columns and the track identifier.
df_r_all = pd.read_csv("../data/combined_openl3_audiofeatures.csv")
df_r = drop_l3_embeddings(df_r_all).drop(columns="track_id")


def _parse_strategy(raw):
    """Parse a stringified list such as "[('solace', 2)]" into a real list; return anything else unchanged."""
    if isinstance(raw, str) and raw.strip().startswith('['):
        return literal_eval(raw)
    return raw


df_r['strategy'] = df_r['strategy'].apply(_parse_strategy)

In [84]:
def process_multi_strategies(x):
    """
    Reduce one parsed ``strategy`` value to a single label.

    - ``str``: returned unchanged.
    - list with exactly one entry, e.g. [('solace', 2)]: the first item of that
      entry ('solace') -- the song occurred several times but for one strategy.
    - list with more than one entry: ``np.nan``.
    - anything else (empty list, non-list values): ``None``.
    """
    if isinstance(x, str):
        return x
    if not isinstance(x, list):
        return None

    n_entries = len(x)
    if n_entries == 1:
        return x[0][0]
    if n_entries > 1:
        return np.nan
    return None

# Collapse every strategy value to a single label (rows with several strategies become missing).
df_r['strategy'] = df_r['strategy'].apply(process_multi_strategies)

# Remove the rows left without a single strategy label.
df_r = df_r.dropna(subset=['strategy'])

# Target first, then every remaining column as the feature matrix.
y_r = df_r['strategy']
X_r = df_r.drop(columns='strategy')

In [85]:
# Encode strategies into numerical labels
label_mapping = {
    0: 'discharge',
    1: 'diversion',
    2: 'entertainment',
    3: 'mental_work',
    4: 'revival',
    5: 'solace',
    6: 'strong_sensation'
}

# Reverse lookup: strategy name -> numeric code.
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Series.map silently produces NaN for names missing from the mapping, which would only
# surface later as an obscure error during model fitting; fail here with a clear message.
unknown_strategies = sorted(set(y_r) - set(inverse_label_mapping), key=repr)
if unknown_strategies:
    raise ValueError(f"Strategies without a numeric code: {unknown_strategies}")

y_r_enc = y_r.map(inverse_label_mapping)

In [86]:
# 80/20 shuffled split with a fixed seed, stratified on the strategy label so that
# both parts keep the same class proportions.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_r,
    y_r_enc,
    train_size=0.8,
    shuffle=True,
    stratify=y_r,
    random_state=42,
)

train_share_r = y_train_r.value_counts(normalize=True)
print("Train shape:", X_train_r.shape)
print("Test shape:", X_test_r.shape)
print("\nLabel distribution in training set:")
print(train_share_r)

Train shape: (796, 70)
Test shape: (199, 70)

Label distribution in training set:
strategy
2    0.201005
6    0.154523
1    0.153266
0    0.141960
4    0.136935
3    0.115578
5    0.096734
Name: proportion, dtype: float64


#### Train and evaluate

In [95]:
# Columns with a non-numeric dtype (object / category / string) are one-hot encoded;
# every other column is passed through unchanged.
cat_cols = list(X_train_r.select_dtypes(include=['object', 'category', 'string']).columns)

# handle_unknown="ignore": categories unseen during fit are encoded as all zeros
# instead of raising at prediction time.
preprocess = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough",
)

pipe_rf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", RandomForestClassifier(class_weight="balanced_subsample", random_state=42)),
    ]
)

# 4 x 3 x 2 = 24 hyper-parameter combinations, each scored by cross-validated macro-F1.
param_grid = dict(
    clf__n_estimators=[130, 150, 170, 190],
    clf__max_depth=[None, 20, 40],
    clf__min_samples_leaf=[1, 2],
)

gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
gs.fit(X_train_r, y_train_r)

best = gs.best_estimator_
print("\n Best params:", gs.best_params_)
print("Best CV Macro-F1:", round(gs.best_score_, 3))

# Held-out performance of the refitted best configuration.
y_pred = best.predict(X_test_r)
test_accuracy_r = accuracy_score(y_test_r, y_pred)
test_macro_f1_r = f1_score(y_test_r, y_pred, average='macro')
print("\n RandomForest with Response ID")
print("Accuracy:", round(test_accuracy_r, 3))
print("Macro-F1:", round(test_macro_f1_r, 3))
print(classification_report(y_test_r, y_pred, zero_division=0))


 Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 170}
Best CV Macro-F1: 0.219

 RandomForest with Response ID
Accuracy: 0.281
Macro-F1: 0.235
              precision    recall  f1-score   support

           0       0.16      0.11      0.13        28
           1       0.16      0.10      0.12        30
           2       0.32      0.68      0.44        40
           3       0.43      0.26      0.32        23
           4       0.26      0.30      0.28        27
           5       0.14      0.05      0.07        20
           6       0.32      0.26      0.29        31

    accuracy                           0.28       199
   macro avg       0.26      0.25      0.24       199
weighted avg       0.26      0.28      0.25       199

