In [1]:
import os
from pathlib import Path

# When the kernel starts inside a directory named "notebooks", move to its
# parent so that relative paths such as "EmoBox/data/" resolve from there.
cwd = Path.cwd()
started_in_notebooks_dir = cwd.name == "notebooks"
if started_in_notebooks_dir:
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
# Dataset selection and locations (relative paths, resolved from the working
# directory).
dataset = "emovdb"
fold = 1  # the number of folds differs per dataset; see data/ for what is available
user_data_dir = "./"  # path to EmoBox
meta_data_dir = "EmoBox/data/"  # path to data folder; note the casing: "EmoBox", not "Emobox"

# Build the train and test splits of the same dataset/fold, in that order.
train, test = (
    EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split=split)
    for split in ("train", "test")
)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 5168 samples, only 5168 exists in data dir EmoBox/data/
load in 1719 samples, only 1719 exists in data dir EmoBox/data/
Num. training samples 5168
Num. valid samples 0
Num. test samples 1719
Using label_map {'Amused': 'Amused', 'Sleepy': 'Sleepy', 'Angry': 'Angry', 'Disgust': 'Disgust', 'Neutral': 'Neutral'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 5168 samples, only 5168 exists in data dir EmoBox/data/
load in 1719 samples, only 1719 exists in data dir EmoBox/data/
Num. training samples 5168
Num. valid samples 0
Num. test samples 1719
Using label_map {'Amused': 'Amused', 'Sleepy': 'Sleepy', 'Angry': 'Angry', 'Disgust': 'Disgust', 'Neutral': 'Neutral'}


In [4]:
# Peek at the first test item to see which fields a sample carries.
sample = test[0]
sample

{'key': 'emovdb-sam-Amused-0384',
 'audio': array([-0.00119863,  0.00034247,  0.0015411 , ...,  0.00291096,
         0.00308219,  0.00308219], shape=(131361,), dtype=float32),
 'label': 'Amused',
 'gender': 'Male',
 'language': 'English'}

In [5]:
# Target labels of the dataset's label_map. values() may list a label more than
# once when several source labels map to the same target.
test.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

In [6]:
from collections import Counter
# Class balance of the test split: collect the 'label' field of every sample
# (this iterates the whole split) and tally the occurrences.
labels = [
    example['label']
    for example in test
]
label_counts = Counter(labels)
label_counts

Counter({'Neutral': 384, 'Happy': 278, 'Angry': 229, 'Sad': 194})

#### Load Model

In [7]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Distinct target labels taken from the training split's label_map.
# NOTE(review): a set has no defined iteration order -- confirm the model does
# not depend on the order of class_labels.
emotion_classes = set(train.label_map.values())

model = ModelFactory.create(
    name="qwen2-audio",
    checkpoint="Qwen/Qwen2-Audio-7B",
    class_labels=emotion_classes,
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.86it/s]


In [8]:
# Feed the test split to the model one sample per batch; model.collate_fn
# assembles each batch.
# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is available. On a CPU-only machine pin_memory=True
# brings no benefit and makes PyTorch emit a warning.
data_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=1,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
    drop_last=False,
    collate_fn=model.collate_fn,
)

In [9]:
# from tqdm import tqdm
# predictions, labels = [], []
# i = 0
# for inputs, lbl in tqdm(data_loader, total=len(data_loader)):
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
#     preds = model.predict(inputs)
#     predictions.extend(preds)
#     labels.extend(lbl)
#     i += 1
#     if i == 100: break

#### Evaluation

In [10]:
from mllm_emotion_classifier.evaluate import Evaluator

# Run the model over data_loader and compute the evaluation metrics. The value
# returned by evaluate() is displayed as this cell's output; the same results
# can be read back afterwards from evaluator.results.
evaluator = Evaluator()
evaluator.evaluate(model, data_loader)


Evaluating Qwen2-Audio-7B on iemocap


Inference:   0%|                                                                                                                                           | 0/1085 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Inference:   9%|███████████▉                                                                                                                     | 100/1085 [00:22<03:40,  4.47it/s]


{'timestamp': '2025-11-27 09:48:45',
 'dataset': 'iemocap',
 'model_name': 'Qwen2-Audio-7B',
 'fold': None,
 'num_samples': 101,
 'num_valid_predictions': np.int64(101),
 'class_labels': ['Angry', 'Happy', 'Neutral', 'Sad'],
 'metrics': {'overall': {'accuracy': {'Angry': 0.8713,
    'Happy': 0.901,
    'Neutral': 0.7723,
    'Sad': 0.901},
   'false_positive_rate': {'Angry': 0.0161,
    'Happy': 0.0918,
    'Neutral': 0.2364,
    'Sad': 0.0568},
   'false_negative_rate': {'Angry': 0.3077,
    'Happy': 0.3333,
    'Neutral': 0.2174,
    'Sad': 0.3846},
   'true_positive_rate': {'Angry': 0.6923,
    'Happy': 0.6667,
    'Neutral': 0.7826,
    'Sad': 0.6154},
   'true_negative_rate': {'Angry': 0.9839,
    'Happy': 0.9082,
    'Neutral': 0.7636,
    'Sad': 0.9432},
   'positive_predictive_value': {'Angry': 0.9643,
    'Happy': 0.1818,
    'Neutral': 0.7347,
    'Sad': 0.6154},
   'negative_predictive_value': {'Angry': 0.8356,
    'Happy': 0.9889,
    'Neutral': 0.8077,
    'Sad': 0.9432},


In [11]:
# Results stored on the evaluator by the evaluate() call above.
evaluator.results

{'timestamp': '2025-11-27 09:48:45',
 'dataset': 'iemocap',
 'model_name': 'Qwen2-Audio-7B',
 'fold': None,
 'num_samples': 101,
 'num_valid_predictions': np.int64(101),
 'class_labels': ['Angry', 'Happy', 'Neutral', 'Sad'],
 'metrics': {'overall': {'accuracy': {'Angry': 0.8713,
    'Happy': 0.901,
    'Neutral': 0.7723,
    'Sad': 0.901},
   'false_positive_rate': {'Angry': 0.0161,
    'Happy': 0.0918,
    'Neutral': 0.2364,
    'Sad': 0.0568},
   'false_negative_rate': {'Angry': 0.3077,
    'Happy': 0.3333,
    'Neutral': 0.2174,
    'Sad': 0.3846},
   'true_positive_rate': {'Angry': 0.6923,
    'Happy': 0.6667,
    'Neutral': 0.7826,
    'Sad': 0.6154},
   'true_negative_rate': {'Angry': 0.9839,
    'Happy': 0.9082,
    'Neutral': 0.7636,
    'Sad': 0.9432},
   'positive_predictive_value': {'Angry': 0.9643,
    'Happy': 0.1818,
    'Neutral': 0.7347,
    'Sad': 0.6154},
   'negative_predictive_value': {'Angry': 0.8356,
    'Happy': 0.9889,
    'Neutral': 0.8077,
    'Sad': 0.9432},
