In [None]:
import os
from pathlib import Path

# When the kernel is started from inside a "notebooks" folder, step up one
# level so the relative paths used by the following cells resolve from the
# parent directory. Re-running this cell is harmless: after the move the
# working directory is no longer named "notebooks".
cwd = Path.cwd()
launched_from_notebooks = cwd.name == "notebooks"
if launched_from_notebooks:
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
# --- Dataset configuration -------------------------------------------------
dataset = "iemocap"
# Fold index to load. The number of available folds differs per dataset and
# can be found under data/.
fold = 1
# Passed to EmoDataset as the user data directory ("./" = current working
# directory; originally described as the path to EmoBox).
user_data_dir = "./"
# Folder holding the EmoBox metadata (note the capital "B" in "EmoBox").
meta_data_dir = "EmoBox/data/"
# Label-to-index mapping for your own training; see
# `data/iemocap/label_map.json`. Not consumed by this cell.
label2idx = dict(hap=0, sad=1, ang=2, neu=3)

# Build the train split first, then the test split, with identical settings.
train, test = (
    EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split=split_name)
    for split_name in ("train", "test")
)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}


In [None]:
# import numpy as np

# id = 102
# audio_1 = test[id]['audio']
# audio_2 = test[id]['audio']
# np.array_equal(audio_1, audio_2)

In [5]:
# Show the human-readable emotion names that the test split's label_map assigns
# to the raw dataset labels. A name can appear more than once when several raw
# labels map to it (here both 'hap' and 'exc' map to 'Happy').
test.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

In [6]:
# Peek at the first training sample to check the item layout: a dict holding
# the utterance key, the audio waveform array, the label, and speaker metadata.
train[0]

{'key': 'iemocap-Ses02F_impro07_F001',
 'audio': array([ 0.00238037,  0.00213623,  0.00204468, ..., -0.03656006,
        -0.03027344, -0.03005981], shape=(36400,), dtype=float32),
 'label': 'Happy',
 'gender': 'Female',
 'language': 'English'}

In [7]:
from collections import Counter

# Collect the label of every test sample, then tally them to see the class
# balance of the test split. Iterating the dataset yields one dict per sample.
labels = [sample['label'] for sample in test]
label_counts = Counter(labels)
label_counts

Counter({'Neutral': 384, 'Happy': 278, 'Angry': 229, 'Sad': 194})

#### Load Model

In [8]:
import warnings

# Prompt skeleton: audio placeholder tokens followed by the instruction.
# The "{labels}" field is filled in at the bottom of this cell.
AUDIO_PROMPT_TEMPLATE = (
    "<|audio_bos|><|AUDIO|><|audio_eos|>"
    "Classify the speaker’s tone in the audio. "
    "Select one of: {labels}. "
    "Answer:"
)

class_labels = test.label_map.values()
# letter_to_label = {label[0].upper(): label for label in class_labels}
# label_to_letter = {label: label[0].upper() for label in class_labels}
# label_options = ", ".join([f"{label_to_letter[label]}: {label}" for label in class_labels])

# Key every label by its first character. Repeated labels (e.g. 'Happy' listed
# twice) collapse into one entry. Two *different* labels that share an initial
# (e.g. 'Sad' and 'Surprise') would overwrite each other and one class would
# silently vanish from the prompt, so that case is reported with a warning.
# As before, the later label wins.
label_dict = {}
for label in class_labels:
    initial = label[0]
    clash = label_dict.get(initial)
    if clash is not None and clash != label:
        warnings.warn(
            f"Labels {clash!r} and {label!r} share the initial {initial!r}; "
            f"the prompt will only list {label!r}."
        )
    label_dict[initial] = label

# The template name is rebound to the formatted prompt (the dict repr is
# inserted verbatim), and the final prompt is displayed as the cell output.
AUDIO_PROMPT_TEMPLATE = AUDIO_PROMPT_TEMPLATE.format(labels=label_dict)
AUDIO_PROMPT_TEMPLATE

"<|audio_bos|><|AUDIO|><|audio_eos|>Classify the speaker’s tone in the audio. Select one of: {'N': 'Neutral', 'H': 'Happy', 'A': 'Angry', 'S': 'Sad'}. Answer:"

In [9]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Options handed to the model factory.
# Other model names tried:  "qwen2-audio", "audio-flamingo-3"
# Other checkpoints tried:  "Qwen/Qwen2-Audio-7B", "Qwen/Qwen2-Audio-7B-Instruct"
# Other prompt names:       "simple"; the original note lists direct, user_labels, cameo
# NOTE(review): do_sample=True presumably makes generation stochastic, so
# metrics may differ between runs — confirm and seed if reproducibility matters.
model_options = dict(
    name="qwen2-audio-instruct",
    # Unique emotion names of the test split (a set, so duplicates collapse).
    class_labels=set(test.label_map.values()),
    do_sample=True,
    device=device,
    prompt_name="user_labels",
)
model = ModelFactory.create(**model_options)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 60436.66it/s]
Loading weights: 100%|█████████████████████████████████████████████████████████████████████████| 876/876 [00:03<00:00, 284.69it/s, Materializing param=multi_modal_projector.linear.weight]


In [None]:
# Batch the test split for inference, using the model's own collate function
# to turn raw samples into model inputs. The final partial batch is kept
# (drop_last=False) so that every test sample is evaluated.
data_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=4,
    num_workers=4,
    # Pinned (page-locked) host memory only speeds up host-to-GPU copies.
    # Enable it only when the model runs on CUDA, so the CPU fallback chosen
    # above neither pins memory needlessly nor triggers a PyTorch warning.
    pin_memory=str(device).startswith("cuda"),
    drop_last=False,
    collate_fn=model.collate_fn,
)

In [None]:
# from tqdm import tqdm
# import pandas as pd

# predictions, labels = [], []
# i = 0
# for inputs, lbl in tqdm(data_loader, total=len(data_loader)):
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
#     preds = model.predict(inputs)
#     predictions.extend(preds)
#     labels.extend(lbl)
#     i += 1
    # if len(predictions) >= 300:
    #     break

# new_data = pd.DataFrame({
#     "label": labels,
#     "prediction": predictions,
# })
# csv_path = "notebooks/qwen2-audio-iemocap-fold1-predictions-json.csv"

# if os.path.exists(csv_path):
#     df = pd.read_csv(csv_path)
#     last_digit = df.columns[-1][0]
#     next_id = int(last_digit) + 1
#     df[f"{next_id}_prediction"] = new_data["prediction"]
#     print(f"Added columns with ID {next_id} to existing CSV")
# else:
#     df = pd.DataFrame({
#         "label": new_data["label"],
#         "0_prediction": new_data["prediction"],
#     })
#     print(f"Created new CSV with ID 0")

# df.to_csv(csv_path, index=False)
# print(f"Saved to {csv_path}")
# print(f"Shape: {df.shape}")

In [None]:
# import pandas as pd
# from sklearn.metrics import f1_score

# csv_path = "notebooks/qwen2-audio-iemocap-fold1-predictions-subset.csv"
# df = pd.read_csv(csv_path)
# prediction_cols = [col for col in df.columns if col.endswith("_prediction")]
# y_true = df["label"]

# results = []
# for pred_col in prediction_cols:
#     pred_id = pred_col.split("_")[0]
#     y_pred = df[pred_col].dropna()
#     y_true_valid = y_true[df[pred_col].notna()]
    
#     f1_macro = f1_score(y_true_valid, y_pred, average='macro')
#     f1_weighted = f1_score(y_true_valid, y_pred, average='weighted')
    
#     results.append({
#         'ID': pred_id,
#         'F1 Macro': round(f1_macro, 4),
#         'F1 Weighted': round(f1_weighted, 4)
#     })

# results

#### Evaluation

In [13]:
from mllm_emotion_classifier.evaluate import Evaluator

# Run the model over the test loader and display the returned results dict
# (run metadata plus global and class-wise metrics). The same results remain
# available afterwards as `evaluator.results`.
evaluator = Evaluator()
evaluator.evaluate(model, data_loader)

  valid_indices = [i for i, p in enumerate(self.y_pred) if p is not "Unknown"]



Evaluating qwen2-audio-instruct on iemocap


Inference:   0%|                                                                                                                                                   | 0/272 [00:00<?, ?it/s]

Inference:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 235/272 [01:45<00:14,  2.56it/s]Could not confidently parse response: "Fearful"
Inference: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272/272 [02:01<00:00,  2.24it/s]


{'timestamp': '2025-12-16 10:10:10',
 'dataset': 'iemocap',
 'model_name': 'qwen2-audio-instruct',
 'fold': None,
 'num_samples': 1085,
 'valid_rate': 1.0,
 'class_labels': ['Happy', 'Angry', 'Sad', 'Neutral'],
 'metrics': {'global': {'f1_macro': 0.5625,
   'f1_weighted': 0.6979,
   'accuracy_unweighted': 0.712,
   'accuracy_weighted': 0.6949,
   'precision_macro': 0.5699,
   'precision_weighted': 0.7161,
   'recall_macro': 0.5696,
   'recall_weighted': 0.6949},
  'classwise': {'accuracy': {'Angry': 0.9336,
    'Happy': 0.8618,
    'Neutral': 0.7594,
    'Sad': 0.8359},
   'false_positive_rate': {'Angry': 0.0397,
    'Happy': 0.0508,
    'Neutral': 0.174,
    'Sad': 0.1493},
   'false_negative_rate': {'Angry': 0.1659,
    'Happy': 0.3921,
    'Neutral': 0.362,
    'Sad': 0.232},
   'true_positive_rate': {'Angry': 0.8341,
    'Happy': 0.6079,
    'Neutral': 0.638,
    'Sad': 0.768},
   'true_negative_rate': {'Angry': 0.9603,
    'Happy': 0.9492,
    'Neutral': 0.826,
    'Sad': 0.8507},

In [15]:
# Show only the dataset-level (global) metrics from the last evaluation.
evaluator.results['metrics']['global']

{'f1_macro': 0.5625,
 'f1_weighted': 0.6979,
 'accuracy_unweighted': 0.712,
 'accuracy_weighted': 0.6949,
 'precision_macro': 0.5699,
 'precision_weighted': 0.7161,
 'recall_macro': 0.5696,
 'recall_weighted': 0.6949}

In [None]:
from mllm_emotion_classifier.utils import flatten_dict
# Flatten the nested metrics dict into a single level; judging by the saved
# output, nested keys are joined with "_" (e.g. "global_f1_macro").
# NOTE(review): the saved output of this cell lists classes (Disgust, Fear,
# Surprise) that are not in the iemocap label map, so it looks stale —
# re-run this cell after the evaluation cell.
metrics = flatten_dict(evaluator.results['metrics'])
metrics

{'global_f1_macro': 0.2274,
 'global_f1_weighted': 0.2685,
 'global_accuracy_unweighted': 0.3222,
 'global_accuracy_weighted': 0.3173,
 'global_precision_macro': 0.3274,
 'global_precision_weighted': 0.4028,
 'global_recall_macro': 0.2819,
 'global_recall_weighted': 0.3173,
 'classwise_accuracy_Angry': 0.8462,
 'classwise_accuracy_Disgust': 0.9327,
 'classwise_accuracy_Fear': 0.8846,
 'classwise_accuracy_Happy': 0.875,
 'classwise_accuracy_Neutral': 0.8558,
 'classwise_accuracy_Sad': 0.4038,
 'classwise_accuracy_Surprise': 0.8558,
 'classwise_false_positive_rate_Angry': 0.0,
 'classwise_false_positive_rate_Disgust': 0.0114,
 'classwise_false_positive_rate_Fear': 0.0,
 'classwise_false_positive_rate_Happy': 0.0899,
 'classwise_false_positive_rate_Neutral': 0.0,
 'classwise_false_positive_rate_Sad': 0.6593,
 'classwise_false_positive_rate_Surprise': 0.0,
 'classwise_false_negative_rate_Angry': 1.0,
 'classwise_false_negative_rate_Disgust': 0.375,
 'classwise_false_negative_rate_Fear': 1.

In [None]:
# Show the per-class metrics from the last evaluation.
# NOTE(review): the saved output of this cell does not match the class-wise
# numbers printed by the evaluation cell, so it looks stale — re-run to refresh.
evaluator.results['metrics']['classwise']

{'accuracy': {'Angry': 0.86, 'Happy': 0.86, 'Neutral': 0.73, 'Sad': 0.73},
 'false_positive_rate': {'Angry': 0.0,
  'Happy': 0.1443,
  'Neutral': 0.0,
  'Sad': 0.3103},
 'false_negative_rate': {'Angry': 0.3684,
  'Happy': 0.0,
  'Neutral': 0.587,
  'Sad': 0.0},
 'true_positive_rate': {'Angry': 0.6316,
  'Happy': 1.0,
  'Neutral': 0.413,
  'Sad': 1.0},
 'true_negative_rate': {'Angry': 1.0,
  'Happy': 0.8557,
  'Neutral': 1.0,
  'Sad': 0.6897},
 'positive_predictive_value': {'Angry': 1.0,
  'Happy': 0.1765,
  'Neutral': 1.0,
  'Sad': 0.325},
 'negative_predictive_value': {'Angry': 0.8158,
  'Happy': 1.0,
  'Neutral': 0.6667,
  'Sad': 1.0},
 'f1_score': {'Angry': 0.7742, 'Happy': 0.3, 'Neutral': 0.5846, 'Sad': 0.4906}}