In [1]:
import os
from pathlib import Path

# Relative paths used further down (e.g. "EmoBox/data/") are resolved against
# the working directory, so step up one level when the kernel was started
# inside a `notebooks` folder.
cwd = Path.cwd()
launched_from_notebooks = cwd.name == "notebooks"
if launched_from_notebooks:
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
# --- Experiment configuration ---
dataset = "iemocap"
fold = 1  # the number of folds differs per dataset; the available ones can be found in data/
user_data_dir = "./"  # path to EmoBox
meta_data_dir = "EmoBox/data/"  # path to the data folder
# You may need a label -> index mapping for your own training,
# see `data/iemocap/label_map.json`.
label2idx = {
    'hap': 0,
    'sad': 1,
    'ang': 2,
    'neu': 3,
}

# Build the train split first, then the test split, with identical settings.
train, test = (
    EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split=split_name)
    for split_name in ("train", "test")
)

In [4]:
# Peek at the first test item to see which fields a sample carries.
sample = test[0]
sample

{'key': 'iemocap-Ses01F_impro04_F000',
 'audio': array([ 0.00228882,  0.00183105,  0.00180054, ..., -0.00778198,
        -0.00982666, -0.01132202], shape=(70312,), dtype=float32),
 'label': 'Neutral'}

In [7]:
# Label names the dataset maps onto. The output below lists 'Happy' twice, so
# these values must be de-duplicated before being used as the set of classes.
test.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

In [6]:
from collections import Counter

# Collect the ground-truth label of every test item, then tally per class.
# NOTE(review): this iterates the whole dataset; items carry an 'audio' field,
# so this pass presumably loads every audio clip as well - confirm the cost.
labels = [item['label'] for item in test]
Counter(labels)

Counter({'Neutral': 384, 'Happy': 278, 'Angry': 229, 'Sad': 194})

#### Load Model

In [8]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Unique label names of the training split.
# NOTE(review): a set of strings has no stable iteration order across
# interpreter runs (hash randomisation). If the factory derives a prompt or an
# index from `class_labels`, results may not be reproducible - confirm, and
# consider an ordered de-duplication such as list(dict.fromkeys(...)).
class_labels = set(train.label_map.values())

model = ModelFactory.create(
    name="qwen2-audio",
    checkpoint="Qwen/Qwen2-Audio-7B",
    class_labels=class_labels,
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.86it/s]


In [9]:
# Batches the test split for inference; the model provides the collate
# function that turns raw items into a batch.
data_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=4,
    num_workers=4,
    # Pinned host memory only helps host-to-GPU copies. On a CPU-only machine
    # it has no effect and PyTorch emits a warning, so enable it only when
    # CUDA is actually available.
    pin_memory=torch.cuda.is_available(),
    drop_last=False,  # keep the final, possibly smaller, batch
    collate_fn=model.collate_fn,
)

In [None]:
# Smoke test: run the model on the first batch only.
for raw_inputs, labels in data_loader:
    # Move every entry of the batch onto the device the model lives on.
    inputs = {name: value.to(model.device) for name, value in raw_inputs.items()}
    predictions = model.predict(inputs)
    # The generations are free-form text (e.g. ' The answer is "Sad"'), so they
    # need normalising before they can be compared with `labels`.
    print(predictions, labels)
    break  # one batch is enough here

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[' The answer is neutral', ' The emotion of the speech is neutral', ' The answer is "Sad"', ' The answer is "Happy"'] ['Neutral', 'Neutral', 'Neutral', 'Neutral']


In [15]:
# Show the most recent batch's predictions next to its ground-truth labels.
# NOTE(review): the saved output lists 8 items although the loader above uses
# batch_size=4, so it appears to come from a different run - re-execute after
# a kernel restart to refresh it.
predictions, labels

([' Happy',
  ' Happy',
  ' Happy',
  ' The answer is Sad',
  ' Happy',
  ' Happy',
  ' Happy',
  ' Happy'],
 ['Happy', 'Happy', 'Happy', 'Happy', 'Happy', 'Happy', 'Happy', 'Happy'])

#### ---------------------------------------------------------------------------------------------------------------------
#### Dataset Wrapper (experimental: attaches the text prompt to every sample)

In [23]:
class EmoDatasetWrapper(EmoDataset):
    """EmoDataset variant whose items also carry a text prompt.

    The prompt is built once, at construction time, by substituting the
    de-duplicated label names of the dataset into a template. The same string
    is then attached to every item under the ``"prompt"`` key.
    """

    # Template used when the caller does not pass one. ``{labels}`` is replaced
    # by the comma-separated label names.
    DEFAULT_PROMPT_TEMPLATE = (
        "<|audio_bos|><|AUDIO|><|audio_eos|>"
        "What emotion is expressed in this audio? "
        "Answer with a single word emotion label among: {labels}."
    )

    # Alternative, more verbose template (currently disabled). To try it, pass
    # the same text through the `prompt_template` argument.
    # DEFAULT_PROMPT_TEMPLATE = (
    #     "<|audio_bos|><|AUDIO|><|audio_eos|>"
    #     "You are an expert in audio-based emotion classification. "
    #     "Listen to the following audio utterance and identify the emotion expressed by the speaker. "
    #     "Select the emotion label from the following options: {labels}. "
    #     "Answer with a single word emotion label."
    # )

    def __init__(self, dataset, data_dir, meta_data_dir, fold=1, split="train", prompt_template=None):
        """Initialise the underlying dataset and pre-compute the prompt.

        Args:
            dataset, data_dir, meta_data_dir, fold, split: forwarded unchanged
                (positionally) to ``EmoDataset.__init__``.
            prompt_template: ``str.format`` template with a ``{labels}``
                placeholder. Any falsy value (``None``, ``""``) selects
                ``DEFAULT_PROMPT_TEMPLATE``.
        """
        super().__init__(dataset, data_dir, meta_data_dir, fold, split)

        template = prompt_template or self.DEFAULT_PROMPT_TEMPLATE
        # dict.fromkeys drops repeated label names while keeping the order in
        # which they first appear.
        unique_labels = dict.fromkeys(self.label_map.values())
        self.prompt = template.format(labels=", ".join(unique_labels))

    def __getitem__(self, idx) -> dict:
        """Return the parent's item for ``idx`` with a ``"prompt"`` entry added."""
        sample = super().__getitem__(idx)
        sample["prompt"] = self.prompt
        return sample

# Rebind `train` / `test` to the prompt-aware wrapper (train is built first).
# NOTE: objects created earlier from the previous `test` (e.g. `data_loader`)
# still point at the old dataset and must be rebuilt to see the prompts.
train, test = (
    EmoDatasetWrapper(dataset, user_data_dir, meta_data_dir, fold=fold, split=split_name)
    for split_name in ("train", "test")
)

In [24]:
# Check that a wrapped item now carries the formatted `prompt` field.
train[0]

{'key': 'iemocap-Ses02F_impro07_F001',
 'audio': tensor([ 0.0024,  0.0021,  0.0020,  ..., -0.0366, -0.0303, -0.0301]),
 'label': 'Happy',
 'prompt': '<|audio_bos|><|AUDIO|><|audio_eos|>What emotion is expressed in this audio? Answer with a single word emotion label among: Neutral, Happy, Angry, Sad.'}