In [2]:
# EMO-DB + openSMILE eGeMAPSv02 Functionals → Logistic Regression baseline
# Works CPU-only. ~1–2 minutes download + feature extraction.
from datasets import load_dataset, Audio
import numpy as np
import opensmile

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import soundfile as sf

In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [3]:
# 1) Load dataset (Hugging Face)
# EMO-DB: 535 rows, 7 emotions. Casting the audio column makes every clip
# decode at one consistent sampling rate (SR) of 16 kHz.
ds = load_dataset("renumics/emodb-enriched", split="train").cast_column(
    "audio", Audio(sampling_rate=16000)
)

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


data/train-00000-of-00001-f82e24d3d04414(…):   0%|          | 0.00/57.0M [00:00<?, ?B/s]

: 

In [None]:
# 2) openSMILE extractor
# Feature set: eGeMAPSv02. The Functionals level yields one fixed-size
# feature vector per file, regardless of the clip's duration.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)

def extract_features(batch):
    """Attach an openSMILE feature vector to a single dataset example.

    Despite the parameter name, this is meant to be used with a non-batched
    ``Dataset.map`` call: ``batch`` is one example whose ``"audio"`` entry
    exposes the decoded waveform under ``"array"`` and its rate under
    ``"sampling_rate"``.

    Args:
        batch: One example (mapping) containing an ``"audio"`` entry.

    Returns:
        The same mapping with a new ``"features"`` key holding the float32
        feature vector produced by the module-level ``smile`` extractor.
    """
    audio = batch["audio"]
    wav = np.asarray(audio["array"])
    sr = audio["sampling_rate"]
    if wav.ndim > 1:
        # Hugging Face decodes multi-channel audio channels-first, i.e. as
        # (channels, samples), so the down-mix to mono averages over axis 0.
        # Averaging over axis 1 would collapse time and leave one value per
        # channel instead of a waveform.
        wav = wav.mean(axis=0)
    df = smile.process_signal(wav.astype(np.float32), sr)  # 1-row DataFrame
    batch["features"] = df.to_numpy().squeeze().astype(np.float32)
    return batch

# Run the extractor over every example; the raw audio column is dropped so
# only the label/metadata columns and the new "features" column remain.
feat_ds = ds.map(function=extract_features, remove_columns=["audio"])

In [None]:
# 3) Build matrices
# X: one row of openSMILE features per utterance.
X = np.stack(feat_ds["features"], axis=0)
# y: emotion labels as strings (e.g. 'anger', 'happiness', ...).
y = np.array(feat_ds["emotion"])
# Fit the encoder first, then map the string labels to integer class ids.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

In [None]:
# 4) Model: scaler + multinomial logistic regression
# The scaler lives inside the pipeline, so wherever the pipeline is fitted
# (including inside each CV fold) it is fitted on that training data only.
pipe = make_pipeline(
    StandardScaler(with_std=True, with_mean=True),
    LogisticRegression(
        class_weight="balanced",
        max_iter=3000,
        random_state=42,
    ),
)

In [None]:
# 5) Evaluation: stratified 5-fold CV
# A fixed random_state keeps the shuffled fold assignment identical for the
# score computation and for the out-of-fold predictions below.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Per-fold accuracy, summarised as mean ± standard deviation.
scores = cross_val_score(
    pipe, X, y_enc,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
print(f"5-fold CV accuracy: {scores.mean():.3f} ± {scores.std():.3f}")

# Out-of-fold predictions: each sample is predicted by a model that did not
# see it during fitting. Reused later for the confusion matrix.
y_pred = cross_val_predict(
    pipe, X, y_enc,
    cv=cv,
    n_jobs=-1,
)
print(classification_report(y_enc, y_pred, target_names=le.classes_))

In [None]:
# 6) Fit final model on all data
# The cross-validation helpers above fit clones, so `pipe` itself is still
# unfitted until this call; predict_one() relies on this fitted pipeline.
pipe.fit(X, y=y_enc)

In [None]:
# 7) Confusion matrix
# Built from the out-of-fold CV predictions (y_pred), not from the final
# model fitted on all data.
cm = confusion_matrix(y_enc, y_pred, labels=np.arange(le.classes_.size))
ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.classes_,
).plot(xticks_rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 8) Inference helper for an external WAV
def predict_one(wav_path: str, target_sr: int = 16000) -> str:
    """Predict the emotion label for a single audio file on disk.

    The file is read with soundfile, down-mixed to mono, resampled to
    ``target_sr`` when its native rate differs, passed through the
    module-level ``smile`` extractor, and classified with the fitted
    module-level ``pipe``.

    Args:
        wav_path: Path to an audio file readable by soundfile.
        target_sr: Sampling rate (Hz) to extract features at. Defaults to
            16000, the rate the training audio column is cast to, so that
            inference features are computed under the same conditions as
            the training features.

    Returns:
        The predicted emotion label, decoded with the module-level ``le``.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    from math import gcd

    from scipy.signal import resample_poly

    wav, sr = sf.read(wav_path, dtype="float32")
    if wav.ndim > 1:
        # soundfile returns multi-channel data as (frames, channels).
        wav = wav.mean(axis=1)
    if wav.size == 0:
        raise ValueError(f"{wav_path!r} contains no audio samples")
    if sr != target_sr:
        # Polyphase resampling with the reduced up/down ratio.
        common = gcd(int(sr), int(target_sr))
        wav = resample_poly(wav, target_sr // common, sr // common)
        wav = wav.astype(np.float32)
        sr = target_sr
    feats = smile.process_signal(wav, sr).to_numpy().squeeze().astype(np.float32)
    pred = pipe.predict(feats[None, :])[0]
    return le.inverse_transform([pred])[0]


In [None]:
# Example:
# print(predict_one("sample.wav"))