In [1]:
import os
from pathlib import Path

# Relative paths used further down (e.g. "EmoBox/data/") are written relative to
# the repository root, so step up one level when the kernel was started inside
# the notebooks/ folder. Re-running this cell is a no-op once we have moved.
cwd = Path.cwd()
started_in_notebooks_dir = cwd.name == "notebooks"
if started_in_notebooks_dir:
    os.chdir(cwd.parents[0])

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
# Dataset selection. Different datasets have different numbers of folds; the
# available folds can be found in data/.
dataset = "iemocap"
fold = 1
user_data_dir = "./"  # path to EmoBox
meta_data_dir = "EmoBox/data/"  # path to data folder

# Only the "esd" dataset is given an explicit language; every other dataset passes None.
if dataset == "esd":
    language = "English"
else:
    language = None

# Build the train split first, then the test split, with identical settings.
train, test = (
    EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split=split_name, language=language)
    for split_name in ("train", "test")
)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}


In [4]:
# Pull the first item of the test split and display it (bare last expression).
# In the saved output the item is a dict with 'key', 'audio', 'label', 'gender'
# and 'language' entries.
sample = test[0]
sample

{'key': 'iemocap-Ses01F_impro04_F000',
 'audio': array([ 0.00228882,  0.00183105,  0.00180054, ..., -0.00778198,
        -0.00982666, -0.01132202], shape=(70312,), dtype=float32),
 'label': 'Neutral',
 'gender': 'Female',
 'language': 'English'}

In [5]:
# Display the target class for every raw dataset label. Values can repeat when
# several raw labels map to the same class (in the saved output both 'hap' and
# 'exc' map to 'Happy').
test.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

In [6]:
# from collections import Counter
# from tqdm import tqdm
# labels = []
# for i in tqdm(range(len(test))):
#     sample = test[i]
#     labels.append(sample["label"])
# Counter(labels)

#### Load Model

In [7]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Seed torch's random number generators before building the model. Generation is
# configured with do_sample=True below, so decoding draws random numbers; without
# a fixed seed the predictions can differ between kernel restarts.
SEED = 0
torch.manual_seed(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = ModelFactory.create(
    name="audio-flamingo-3",  # other name previously noted here: "qwen2-audio-instruct"
    # set() collapses duplicate targets in label_map (several raw labels can map
    # to the same class name).
    class_labels=set(train.label_map.values()),
    do_sample=True,
    # NOTE(review): a very small temperature presumably makes sampling
    # near-greedy — confirm against the model wrapper.
    temperature=0.0001,
    prompt_name="user_labels",
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████████████████████████████████████| 830/830 [00:11<00:00, 75.35it/s, Materializing param=multi_modal_projector.linear_2.weight]


In [8]:
# Batch the test split for inference; batches are assembled by the model's own
# collate_fn.
data_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=4,
    num_workers=4,
    # Pinned host memory only speeds up host-to-GPU copies. This notebook also
    # supports a CPU fallback, where pinning is useless and PyTorch warns about
    # it, so enable it only when CUDA is actually available.
    pin_memory=torch.cuda.is_available(),
    drop_last=False,  # keep the final partial batch so every test sample is scored
    collate_fn=model.collate_fn,
)

In [9]:
# from tqdm import tqdm
# predictions, labels = [], []
# i = 0
# for inputs, lbl in tqdm(data_loader, total=len(data_loader)):
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
#     preds = model.predict(inputs)
#     predictions.extend(preds)
#     labels.extend(lbl)
#     i += 1
#     if i == 100: break

#### Evaluation

In [10]:
from mllm_emotion_classifier.evaluate import Evaluator

# Run the model over data_loader; results are read back later from
# evaluator.results.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# run shown here did not finish. The output also shows a warning line from the
# evaluator's own code (`p is not "Unknown"`): that compares identity against a
# string literal, and `!=` is presumably intended — fix upstream.
evaluator = Evaluator()
evaluator.evaluate(model, data_loader)

  valid_indices = [i for i, p in enumerate(self.y_pred) if p is not "Unknown"]



Evaluating audio-flamingo-3 on iemocap


Inference:   0%|                                                                                                                     | 0/272 [00:00<?, ?it/s]Keyword argument `tokenize` is not a valid argument for this processor and will be ignored.
Keyword argument `tokenize` is not a valid argument for this processor and will be ignored.
Keyword argument `tokenize` is not a valid argument for this processor and will be ignored.
Keyword argument `tokenize` is not a valid argument for this processor and will be ignored.
Inference:  14%|███████████████                                                                                             | 38/272 [02:07<15:31,  3.98s/it]Could not confidently parse response: "excited"
Inference:  15%|████████████████▋                                                                                           | 42/272 [02:19<13:10,  3.44s/it]Could not confidently parse response: "excited"
Inference:  16%|█████████████████▍                               

KeyboardInterrupt: 

In [13]:
# Display the global (dataset-wide) metrics stored on the evaluator.
# NOTE(review): the execution counts show this cell ran out of order (In [13]
# appears before In [12]) and the evaluation cell's saved output ends in
# KeyboardInterrupt. Re-run the notebook top to bottom on a fresh kernel to
# confirm these numbers come from a complete evaluation.
evaluator.results['metrics']['global']

{'f1_macro': 0.4928,
 'f1_weighted': 0.6218,
 'accuracy_unweighted': 0.5825,
 'accuracy_weighted': 0.6249,
 'precision_macro': 0.6064,
 'precision_weighted': 0.7211,
 'recall_macro': 0.466,
 'recall_weighted': 0.6249}

In [12]:
# from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

# model_id = "nvidia/audio-flamingo-3-hf"
# processor = AutoProcessor.from_pretrained(model_id)
# model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

# conversation = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "text", "text": "Transcribe the input speech."},
#             {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
#         ],
#     }
# ]

# inputs = processor.apply_chat_template(
#     conversation,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_dict=True,
# ).to(model.device)

# outputs = model.generate(**inputs, max_new_tokens=500)

# decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
# print(decoded_outputs)