In [None]:
import os
from pathlib import Path

# When the kernel starts inside a directory named "notebooks", switch to its
# parent so that relative paths used later (e.g. "EmoBox/data/") are resolved
# from there. Presumably the parent is the project root; verify for your checkout.
cwd = Path.cwd()
if cwd.name == "notebooks":
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
dataset = "iemocap"
fold = 1  # different datasets have different numbers of folds, which can be found in data/
# NOTE(review): the original comment called this the "path to EmoBox", but the value is
# the current directory; it is passed to EmoDataset as its second argument. Confirm intent.
user_data_dir = "./" # second positional argument of EmoDataset
meta_data_dir = "EmoBox/data/" # path to the data folder (spelled "EmoBox", not "Emobox")
# You may need to define a label-to-index mapping for your own training, see
# `data/iemocap/label_map.json`. This mapping is not referenced by the other visible cells.
label2idx = {'hap':0, 'sad':1, 'ang':2, 'neu':3}

# Build the train and test splits of the selected fold.
train = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="train")
test = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="test")

In [4]:
# Fetch the first test item and display it as the cell output.
# The stored output of this cell is a RuntimeError raised while loading
# libtorchcodec (FFmpeg missing or PyTorch/TorchCodec version mismatch), so a
# working TorchCodec installation is required for this cell to succeed.
sample = test[0]
sample

RuntimeError: Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6, and 7 on all platforms, and 8 on Mac and Linux.
          2. The PyTorch version (2.9.0+cu128) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 8: Could not load this library: /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.venv/lib/python3.10/site-packages/torchcodec/libtorchcodec_core8.so
FFmpeg version 7: Could not load this library: /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.venv/lib/python3.10/site-packages/torchcodec/libtorchcodec_core7.so
FFmpeg version 6: Could not load this library: /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.venv/lib/python3.10/site-packages/torchcodec/libtorchcodec_core6.so
FFmpeg version 5: Could not load this library: /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.venv/lib/python3.10/site-packages/torchcodec/libtorchcodec_core5.so
FFmpeg version 4: Could not load this library: /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.venv/lib/python3.10/site-packages/torchcodec/libtorchcodec_core4.so
[end of libtorchcodec loading traceback].

In [5]:
# Show the label names held in the dataset's label map. Values can repeat
# (the stored output lists 'Happy' twice), so de-duplicate before using them
# as a list of classes.
train.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

#### Load Model

In [None]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ModelFactory.create(
    name="qwen2-audio",
    checkpoint="Qwen/Qwen2-Audio-7B",
    # set() removes the duplicate names found in label_map.
    # NOTE(review): a set of strings has no stable iteration order across kernel
    # restarts. If ModelFactory derives a prompt or an index from these labels,
    # an ordered de-duplication such as list(dict.fromkeys(...)) would be more
    # reproducible -- confirm that ModelFactory.create accepts a list first.
    class_labels=set(train.label_map.values()),
    device=device
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 24.14it/s]


In [4]:
# Serve the training split one utterance at a time. `model.collate_fn` assembles
# each batch; the inference cell unpacks its result as `(inputs, labels)`.
data_loader = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=1,
    num_workers=1,
    # Pinned host memory only speeds up host-to-GPU copies. On the CPU fallback
    # it brings no benefit and makes PyTorch emit a warning, so enable it only
    # when CUDA is actually available.
    pin_memory=torch.cuda.is_available(),
    drop_last=False,
    collate_fn=model.collate_fn
)

NameError: name 'torch' is not defined

In [None]:
# Take the first batch, move every input value onto the model's device,
# then ask the model for its predictions.
inputs, labels = next(iter(data_loader))
inputs = {name: value.to(model.device) for name, value in inputs.items()}
predictions = model.predict(inputs)

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [16]:
# Display predictions next to the reference labels. In the stored output the
# prediction is ' Happy' (with a leading space) while the label is 'Happy', so
# strip whitespace before comparing the two.
predictions, labels

([' Happy'], ['Happy'])

#### ---------------------------------------------------------------------------------------------------------------------
#### Dataset Wrapper: attach the text prompt to every sample

In [23]:
class EmoDatasetWrapper(EmoDataset):
    """EmoDataset whose items additionally carry a text prompt.

    The prompt is built once, at construction time, by filling the ``{labels}``
    placeholder of a template with the distinct values of ``self.label_map``
    (first-seen order, comma separated). Indexing the dataset returns the
    parent's item with that string stored under the ``"prompt"`` key.
    """

    DEFAULT_PROMPT_TEMPLATE = (
        "<|audio_bos|><|AUDIO|><|audio_eos|>"
        "What emotion is expressed in this audio? "
        "Answer with a single word emotion label among: {labels}."
    )

    # Alternative, more verbose template (disabled; kept for reference):
    # DEFAULT_PROMPT_TEMPLATE = (
    #     "<|audio_bos|><|AUDIO|><|audio_eos|>"
    #     "You are an expert in audio-based emotion classification. "
    #     "Listen to the following audio utterance and identify the emotion expressed by the speaker. "
    #     "Select the emotion label from the following options: {labels}. "
    #     "Answer with a single word emotion label."
    # )

    def __init__(self, dataset, data_dir, meta_data_dir, fold=1, split="train", prompt_template=None):
        """Initialise the parent dataset and pre-compute the prompt.

        Args:
            dataset, data_dir, meta_data_dir, fold, split: forwarded unchanged,
                in this order, to ``EmoDataset.__init__``.
            prompt_template: optional ``str.format`` template with a
                ``{labels}`` placeholder. Any falsy value (``None``, ``""``)
                selects ``DEFAULT_PROMPT_TEMPLATE``.
        """
        super().__init__(dataset, data_dir, meta_data_dir, fold, split)

        template = self.DEFAULT_PROMPT_TEMPLATE if not prompt_template else prompt_template
        # dict.fromkeys de-duplicates while keeping first-seen order.
        distinct_labels = list(dict.fromkeys(self.label_map.values()))
        self.prompt = template.format(labels=", ".join(distinct_labels))

    def __getitem__(self, idx) -> dict:
        """Return the parent's item for ``idx`` with the prompt added under ``"prompt"``."""
        sample = super().__getitem__(idx)
        sample["prompt"] = self.prompt
        return sample

# Rebind `train` / `test` to the prompt-carrying wrapper. Objects created from
# the earlier plain EmoDataset instances (for example a DataLoader built before
# this cell ran) still reference the old datasets and must be re-created to
# see the "prompt" field.
train = EmoDatasetWrapper(dataset, user_data_dir, meta_data_dir, fold=fold, split="train")
test = EmoDatasetWrapper(dataset, user_data_dir, meta_data_dir, fold=fold, split="test")

In [24]:
# Sanity check: the first training item should now contain a "prompt" entry
# next to "key", "audio" and "label".
train[0]

{'key': 'iemocap-Ses02F_impro07_F001',
 'audio': tensor([ 0.0024,  0.0021,  0.0020,  ..., -0.0366, -0.0303, -0.0301]),
 'label': 'Happy',
 'prompt': '<|audio_bos|><|AUDIO|><|audio_eos|>What emotion is expressed in this audio? Answer with a single word emotion label among: Neutral, Happy, Angry, Sad.'}