In [1]:
import os
from pathlib import Path

# The relative paths used later in this notebook are written from the
# repository root, so step up one level when the kernel was started inside
# the notebooks/ folder.
cwd = Path.cwd()
started_in_notebooks_dir = cwd.name == "notebooks"
if started_in_notebooks_dir:
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
dataset = "esd"
# The number of folds differs per dataset; the available folds can be found in data/.
fold = 1
# Both paths are relative to the current working directory.
# The directory name is spelled "EmoBox" (capital B), not "Emobox".
user_data_dir = "./"  # path to EmoBox
meta_data_dir = "EmoBox/data/"  # path to the data folder

# Only ESD is restricted to one language here; every other dataset passes None.
if dataset == "esd":
    language = "English"
else:
    language = None

# The two splits share every argument except the split name.
split_kwargs = dict(fold=fold, language=language)
train = EmoDataset(dataset, user_data_dir, meta_data_dir, split="train", **split_kwargs)
test = EmoDataset(dataset, user_data_dir, meta_data_dir, split="test", **split_kwargs)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 28000 samples, only 28000 exists in data dir EmoBox/data/
load in 7000 samples, only 7000 exists in data dir EmoBox/data/
Filtered from 28000 to 12250 samples for language: English
Filtered from 0 to 0 samples for language: English
Filtered from 7000 to 5250 samples for language: English
Num. training samples 12250
Num. valid samples 0
Num. test samples 5250
Using label_map {'Neutral': 'Neutral', 'Angry': 'Angry', 'Happy': 'Happy', 'Sad': 'Sad', 'Surprise': 'Surprise'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 28000 samples, only 28000 exists in data dir EmoBox/data/
load in 7000 samples, only 7000 exists in data dir EmoBox/data/
Filtered from 28000 to 12250 samples for language: English
Filtered from 0 to 0 samples for language: English
Filtered from 7000 to 5250 samples for language: English
Num. training

In [4]:
# Inspect the first item of the test split to see what a single example
# contains (the displayed result lists its fields).
sample = test[0]
sample

{'key': 'esd-0011-000001',
 'audio': array([-1.2207031e-04, -1.2207031e-04, -1.5258789e-04, ...,
         6.1035156e-05,  0.0000000e+00,  6.1035156e-05],
       shape=(40960,), dtype=float32),
 'label': 'Neutral',
 'gender': 'Male',
 'language': 'English'}

In [5]:
# Show the label names exposed by the test split's label_map.
test.label_map.values()

dict_values(['Neutral', 'Angry', 'Happy', 'Sad', 'Surprise'])

In [6]:
from collections import Counter
# Class balance of the test split.
# NOTE(review): iterating the dataset yields full items (the sample shown
# earlier includes an 'audio' array), so this presumably loads every audio
# clip just to read its label -- confirm whether labels can be read from the
# metadata directly if this cell is slow.
labels = [item['label'] for item in test]
label_counts = Counter(labels)
label_counts

Counter({'Neutral': 1050,
         'Angry': 1050,
         'Happy': 1050,
         'Sad': 1050,
         'Surprise': 1050})

#### Load Model

In [7]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): a set of strings has no stable iteration order across
# interpreter runs. If the label order ends up in the prompt, results may not
# be reproducible run to run -- confirm whether ModelFactory accepts an
# ordered sequence instead.
class_labels = set(train.label_map.values())

model = ModelFactory.create(
    name="qwen2-audio-instruct",
    class_labels=class_labels,
    do_sample=False,
    prompt_name="user_labels",
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm


Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 36921.69it/s]
Loading weights: 100%|██████████████████████████████████████████████████████| 876/876 [00:02<00:00, 315.07it/s, Materializing param=multi_modal_projector.linear.weight]


In [8]:
# Batching options for evaluation. drop_last=False keeps the final partial
# batch so no test sample is skipped; batches are assembled by the model's
# own collate_fn.
loader_options = dict(
    batch_size=4,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
    collate_fn=model.collate_fn,
)
data_loader = torch.utils.data.DataLoader(test, **loader_options)

In [None]:
# Disabled manual inference loop, kept for reference only: it moves each
# batch to the model's device, calls model.predict, collects predictions and
# labels, and stops after 100 batches. The Evaluation section below drives
# inference through Evaluator instead.
# from tqdm import tqdm
# predictions, labels = [], []
# i = 0
# for inputs, lbl in tqdm(data_loader, total=len(data_loader)):
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
#     preds = model.predict(inputs)
#     predictions.extend(preds)
#     labels.extend(lbl)
#     i += 1
#     if i == 100: break

#### Evaluation

In [9]:
from mllm_emotion_classifier.evaluate import Evaluator

evaluator = Evaluator()
# NOTE(review): the warning this cell prints points at
# `p is not "Unknown"` inside the evaluator. An identity comparison against a
# string literal is not a reliable equality test, so unparsed responses may
# not be excluded -- the reported valid_rate of 1.0 alongside the
# "Could not confidently parse response" messages should be verified.
eval_results = evaluator.evaluate(model, data_loader)
eval_results

  valid_indices = [i for i, p in enumerate(self.y_pred) if p is not "Unknown"]



Evaluating qwen2-audio-instruct on esd


Inference:   0%|                                                                                                                               | 0/1313 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Inference:   4%|█████▏                                                                                                                | 58/1313 [00:22<07:49,  2.67it/s]Could not confidently parse response: "The speaker's tone is"
Inference:   8%|████████▉                                                                                                             | 99/1313 [00:38<07:41,  2.63it/s]Could not confidently parse response: "The speaker's tone is"
Inference:   9%|██████████                                                                                                           | 113/1313 [00:43<07:41,  2.60it/s]Could not confidently parse response: "The speaker's tone is"
Inference:  14

{'timestamp': '2026-01-02 15:10:43',
 'dataset': 'esd',
 'model_name': 'qwen2-audio-instruct',
 'fold': None,
 'num_samples': 5250,
 'valid_rate': 1.0,
 'class_labels': ['Surprise', 'Sad', 'Angry', 'Happy', 'Neutral'],
 'metrics': {'global': {'f1_macro': 0.3463,
   'f1_weighted': 0.4155,
   'accuracy_unweighted': 0.4347,
   'accuracy_weighted': 0.4347,
   'precision_macro': 0.388,
   'precision_weighted': 0.4656,
   'recall_macro': 0.3622,
   'recall_weighted': 0.4347},
  'classwise': {'accuracy': {'Angry': 0.7461,
    'Happy': 0.7539,
    'Neutral': 0.768,
    'Sad': 0.7882,
    'Surprise': 0.8168},
   'false_positive_rate': {'Angry': 0.2617,
    'Happy': 0.1733,
    'Neutral': 0.1452,
    'Sad': 0.0907,
    'Surprise': 0.0312},
   'false_negative_rate': {'Angry': 0.2229,
    'Happy': 0.5371,
    'Neutral': 0.579,
    'Sad': 0.6962,
    'Surprise': 0.7914},
   'true_positive_rate': {'Angry': 0.7771,
    'Happy': 0.4629,
    'Neutral': 0.421,
    'Sad': 0.3038,
    'Surprise': 0.2086},

In [11]:
# Show only the aggregate ("global") metrics from the results stored on the
# evaluator, without the per-class breakdown.
evaluator.results['metrics']['global']

{'f1_macro': 0.3463,
 'f1_weighted': 0.4155,
 'accuracy_unweighted': 0.4347,
 'accuracy_weighted': 0.4347,
 'precision_macro': 0.388,
 'precision_weighted': 0.4656,
 'recall_macro': 0.3622,
 'recall_weighted': 0.4347}