In [1]:
import os
from pathlib import Path

# If the current working directory is named "notebooks", switch to its parent
# so the relative paths used further down (e.g. "EmoBox/data/") resolve from there.
cwd = Path.cwd()
launched_from_notebooks = cwd.name == "notebooks"
if launched_from_notebooks:
    os.chdir(cwd.parent)

#### Load Dataset

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
# Dataset selection and the locations EmoDataset is pointed at.
dataset = "iemocap"
fold = 1  # the number of folds differs per dataset; the available folds can be found in data/
user_data_dir = "./" # path to EmoBox (directory name is case-sensitive: "EmoBox", not "Emobox")
meta_data_dir = "EmoBox/data/" # path to data folder (again "EmoBox", not "Emobox")
# You may need to define a label-to-index mapping for your own training,
# see `data/iemocap/label_map.json`. Not referenced by the other cells visible here.
label2idx = {'hap':0, 'sad':1, 'ang':2, 'neu':3} # raw label -> class index

# Instantiate the train and test splits of the selected fold.
train = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="train")
test = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="test")

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 4446 samples, only 4446 exists in data dir EmoBox/data/
load in 1085 samples, only 1085 exists in data dir EmoBox/data/
Num. training samples 4446
Num. valid samples 0
Num. test samples 1085
Using label_map {'neu': 'Neutral', 'hap': 'Happy', 'ang': 'Angry', 'sad': 'Sad', 'exc': 'Happy'}


In [4]:
import numpy as np

# Sanity check: fetching the same test item twice should yield identical audio.
# The index is called `sample_idx` rather than `id`, because assigning to `id`
# would shadow the builtin `id()` for the rest of the kernel session.
sample_idx = 102
audio_1 = test[sample_idx]['audio']
audio_2 = test[sample_idx]['audio']
np.array_equal(audio_1, audio_2)

True

In [5]:
# Display names that the dataset's label_map maps raw labels to (values may repeat).
test.label_map.values()

dict_values(['Neutral', 'Happy', 'Angry', 'Sad', 'Happy'])

In [6]:
from collections import Counter

# Tally how often each label occurs across the test split.
labels = [sample['label'] for sample in test]
Counter(labels)

Counter({'Neutral': 384, 'Happy': 278, 'Angry': 229, 'Sad': 194})

In [7]:
# Number of items in the test split.
len(test)

1085

#### Load Model

In [8]:
# Prompt skeleton; `{labels}` is filled with the initial -> label mapping built below.
AUDIO_PROMPT_TEMPLATE = (
    "<|audio_bos|><|AUDIO|><|audio_eos|>"
    "Classify the speaker’s tone in the audio. "
    "Select one of: {labels}. "
    "Answer:"
)

class_labels = test.label_map.values()
# label_map values can repeat (several raw labels may share one display name),
# so drop repeats while keeping first-seen order.
unique_labels = list(dict.fromkeys(class_labels))
# Key each label by its first character.
label_dict = {label[0]: label for label in unique_labels}
# Two distinct labels sharing an initial would silently overwrite each other in
# the dict above and disappear from the prompt; fail loudly instead.
if len(label_dict) != len(unique_labels):
    raise ValueError(
        f"Class labels do not have unique initials: {unique_labels}"
    )
# The name is rebound to the filled-in prompt (the dict repr is what gets embedded).
AUDIO_PROMPT_TEMPLATE = AUDIO_PROMPT_TEMPLATE.format(labels=label_dict)
AUDIO_PROMPT_TEMPLATE

"<|audio_bos|><|AUDIO|><|audio_eos|>Classify the speaker’s tone in the audio. Select one of: {'N': 'Neutral', 'H': 'Happy', 'A': 'Angry', 'S': 'Sad'}. Answer:"

In [9]:
import torch
from mllm_emotion_classifier.models import ModelFactory

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ModelFactory.create(
    name="qwen2-audio-instruct",
    # checkpoint="Qwen/Qwen2-Audio-7B",
    # checkpoint="Qwen/Qwen2-Audio-7B-Instruct",
    # Deduplicate the display labels, then sort them: iterating a bare set of
    # strings can yield a different order in every Python process (string
    # hashing is randomized), so anything derived from that order would not be
    # reproducible across kernel restarts.
    # NOTE(review): assumes ModelFactory.create accepts any iterable of label
    # strings rather than specifically a set — confirm against its signature.
    class_labels=sorted(set(train.label_map.values())),
    do_sample=False,
    device=device,
    prompt_name="user_labels"
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.52it/s]


In [10]:
# Batch the test split for inference; batches are assembled by the model's own collate_fn.
loader_options = {
    "batch_size": 4,
    "num_workers": 4,
    "pin_memory": True,
    "drop_last": False,
    "collate_fn": model.collate_fn,
}
data_loader = torch.utils.data.DataLoader(test, **loader_options)

In [11]:
from tqdm import tqdm
import pandas as pd

# Quick smoke run: stop once at least this many predictions have been collected
# instead of iterating over the whole loader.
MAX_PREDICTIONS = 20

predictions, labels = [], []
for inputs, lbl in tqdm(data_loader, total=len(data_loader)):
    # Move every entry of the batch dict to the model's device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    preds = model.predict(inputs)
    predictions.extend(preds)
    labels.extend(lbl)
    if len(predictions) >= MAX_PREDICTIONS:
        break

# --- Disabled: persist this run's predictions as a new "<id>_prediction" column ---
# Appends to the CSV when it already exists, otherwise creates it with ID 0.
# new_data = pd.DataFrame({
#     "label": labels,
#     "prediction": predictions,
# })
# csv_path = "notebooks/qwen2-audio-iemocap-fold1-predictions-json.csv"

# if os.path.exists(csv_path):
#     df = pd.read_csv(csv_path)
#     NOTE(review): `df.columns[-1][0]` reads only the FIRST CHARACTER of the last
#     column name, so with a last column of "10_prediction" next_id becomes 2 and
#     an existing column is overwritten. Parse the full prefix instead
#     (e.g. `df.columns[-1].split("_")[0]`) before re-enabling this block.
#     last_digit = df.columns[-1][0]
#     next_id = int(last_digit) + 1
#     df[f"{next_id}_prediction"] = new_data["prediction"]
#     print(f"Added columns with ID {next_id} to existing CSV")
# else:
#     df = pd.DataFrame({
#         "label": new_data["label"],
#         "0_prediction": new_data["prediction"],
#     })
#     print(f"Created new CSV with ID 0")

# df.to_csv(csv_path, index=False)
# print(f"Saved to {csv_path}")
# print(f"Shape: {df.shape}")

  0%|                                                                                                                                                         | 0/272 [00:00<?, ?it/s]

Keyword argument `audios` is not a valid argument for this processor and will be ignored.
Keyword argument `audios` is not a valid argument for this processor and will be ignored.
Keyword argument `audios` is not a valid argument for this processor and will be ignored.
Keyword argument `audios` is not a valid argument for this processor and will be ignored.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  1%|██▏                                                                                                                                              | 4/272 [00:01<01:20,  3.34it/s]


In [12]:
# Show the predictions gathered by the inference loop above.
predictions

['Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral',
 'Neutral']

In [1]:
import pandas as pd
from sklearn.metrics import f1_score

csv_path = "notebooks/qwen2-audio-iemocap-fold1-predictions-subset.csv"
df = pd.read_csv(csv_path)
y_true = df["label"]
# Every "<id>_prediction" column holds the predictions of one run.
prediction_cols = [name for name in df.columns if name.endswith("_prediction")]


def _f1_summary(column):
    """Return the run ID plus macro/weighted F1 for one prediction column.

    Rows where the prediction is missing are excluded from both the
    predictions and the ground-truth labels before scoring.
    """
    has_prediction = df[column].notna()
    truth = y_true[has_prediction]
    predicted = df[column][has_prediction]
    return {
        'ID': column.split("_")[0],
        'F1 Macro': round(f1_score(truth, predicted, average='macro'), 4),
        'F1 Weighted': round(f1_score(truth, predicted, average='weighted'), 4)
    }


results = [_f1_summary(column) for column in prediction_cols]

results

[{'ID': '0', 'F1 Macro': 0.7043, 'F1 Weighted': 0.7183},
 {'ID': '1', 'F1 Macro': 0.6647, 'F1 Weighted': 0.717},
 {'ID': '2', 'F1 Macro': 0.766, 'F1 Weighted': 0.7892}]

#### Evaluation

In [12]:
from mllm_emotion_classifier.evaluate import Evaluator

# Run the project's Evaluator with the model over the test data loader.
# NOTE(review): in the recorded run this cell completed inference (272/272 batches)
# and then raised "TypeError: '<' not supported between instances of 'str' and
# 'NoneType'". Presumably a None prediction or label reached a sort/comparison
# inside Evaluator — confirm and handle it there.
evaluator = Evaluator()
evaluator.evaluate(model, data_loader)


Evaluating Qwen2-Audio-7B-Instruct on iemocap


Inference:   0%|                                                                                                         | 0/272 [00:00<?, ?it/s]

Inference:  45%|██████████████████████████████████████████▉                                                    | 123/272 [00:53<01:11,  2.08it/s]



Inference: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 272/272 [01:55<00:00,  2.36it/s]


TypeError: '<' not supported between instances of 'str' and 'NoneType'

In [13]:
# Aggregate metrics stored by the evaluator under results['metrics']['global'].
evaluator.results['metrics']['global']

{'accuracy': 0.7926,
 'false_positive_rate': 0.1296,
 'false_negative_rate': 0.3504,
 'true_positive_rate': 0.6496,
 'true_negative_rate': 0.8704,
 'positive_predictive_value': 0.7376,
 'negative_predictive_value': 0.8779,
 'f1_score': 0.6141}

In [14]:
# Per-class metrics stored by the evaluator under results['metrics']['classwise'].
evaluator.results['metrics']['classwise']

{'accuracy': {'Angry': 0.9364,
  'Happy': 0.8756,
  'Neutral': 0.718,
  'Sad': 0.6406},
 'false_positive_rate': {'Angry': 0.0035,
  'Happy': 0.0322,
  'Neutral': 0.0471,
  'Sad': 0.4355},
 'false_negative_rate': {'Angry': 0.2882,
  'Happy': 0.3921,
  'Neutral': 0.7109,
  'Sad': 0.0103},
 'true_positive_rate': {'Angry': 0.7118,
  'Happy': 0.6079,
  'Neutral': 0.2891,
  'Sad': 0.9897},
 'true_negative_rate': {'Angry': 0.9965,
  'Happy': 0.9678,
  'Neutral': 0.9529,
  'Sad': 0.5645},
 'positive_predictive_value': {'Angry': 0.9819,
  'Happy': 0.8667,
  'Neutral': 0.7708,
  'Sad': 0.331},
 'negative_predictive_value': {'Angry': 0.9282,
  'Happy': 0.8775,
  'Neutral': 0.7099,
  'Sad': 0.996},
 'f1_score': {'Angry': 0.8253,
  'Happy': 0.7146,
  'Neutral': 0.4205,
  'Sad': 0.4961}}