In [70]:
import os
from pathlib import Path

# When the kernel starts inside a directory named "notebooks", move one level
# up so the relative paths used in later cells resolve against the parent.
cwd = Path.cwd()
started_in_notebooks_dir = (cwd.name == "notebooks")
if started_in_notebooks_dir:
    os.chdir(cwd.parent)

In [71]:
import pandas as pd
from mllm_emotion_classifier.utils import add_fairness_metrics_to_df
from EmoBox.EmoBox import EmoDataset

In [72]:
# Dataset name -> list of sensitive attributes analysed for that dataset.
# The entry for the selected dataset becomes `sensitive_attrs` in the
# configuration cell.
sensitive_attr_dict = dict(
    iemocap=["gender"],
    cremad=["gender", "age", "ethnicity", "race"],
    emovdb=["gender"],
    tess=["agegroup"],
    ravdess=["gender"],
    esd=["gender"],
)

In [73]:
# --- Run configuration ---------------------------------------------------
# Hyperparameter whose sweep is analysed; it selects the results directory
# below.
hparam = 'temperature' # or 'top_p'
assert hparam in ['temperature', 'top_p'], "hparam must be either 'temperature' or 'top_p'"

dataset = 'tess' # must be a key of sensitive_attr_dict (e.g. iemocap, cremad, tess)
# Fail with a readable message rather than a bare KeyError on the lookup below.
assert dataset in sensitive_attr_dict, (
    f"dataset must be one of {sorted(sensitive_attr_dict)}, got {dataset!r}"
)
fold = None # Set to an integer fold number if needed, else None to aggregate all folds
sensitive_attrs = sensitive_attr_dict[dataset] # gender, age, ethnicity, race
model = 'qwen2-audio-instruct'

# --- Paths ---------------------------------------------------------------
metadata_dir = Path('EmoBox/data/')
dataset_path = metadata_dir / dataset
# The number of folds is the number of "fold_*" sub-directories of the
# dataset's metadata directory.
n_folds = len([d for d in dataset_path.iterdir() if d.is_dir() and d.name.startswith("fold_")])
out_dir = Path('outputs-2') / ("temperature_runs" if hparam == 'temperature' else "topp_runs")

# The test split of fold 1 is loaded here; its label_map provides the set of
# emotion labels.
test = EmoDataset(dataset, './', metadata_dir, fold=1, split="test")
emotions = set(test.label_map.values())

# --- Load results --------------------------------------------------------
if fold is None:
    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list, so report the actual cause when no fold directory was found.
    if n_folds == 0:
        raise FileNotFoundError(f"No 'fold_*' directories found in {dataset_path}")
    dfs = []
    # NOTE(review): assumes fold directories are numbered 1..n_folds - confirm.
    for f in range(1, n_folds + 1):
        results_csv = out_dir / model / dataset / f'fold_{f}.csv'
        df_fold = pd.read_csv(results_csv)
        dfs.append(df_fold)
    df = pd.concat(dfs, ignore_index=True)
else:
    results_csv = out_dir / model / dataset / f'fold_{fold}.csv'
    df = pd.read_csv(results_csv)

print(len(df), "rows")
df.head(5)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 2100 samples, only 2100 exists in data dir EmoBox/data
load in 700 samples, only 700 exists in data dir EmoBox/data
Num. training samples 2100
Num. valid samples 0
Num. test samples 700
Using label_map {'fear': 'Fear', 'disgust': 'Disgust', 'angry': 'Angry', 'sad': 'Sad', 'ps': 'Surprise', 'neutral': 'Neutral', 'happy': 'Happy'}
60 rows


Unnamed: 0,run,dataset,fold,model,prompt,temperature,valid_rate,global_f1_macro,global_f1_weighted,global_accuracy_unweighted,...,language_statistical_parity,language_Angry_equal_opportunity,language_Disgust_equal_opportunity,language_Fear_equal_opportunity,language_Happy_equal_opportunity,language_Neutral_equal_opportunity,language_Sad_equal_opportunity,language_Surprise_equal_opportunity,language_equal_opportunity,language_overall_accuracy_equality
0,0,tess,1,qwen2-audio-instruct,user_labels,0.0001,1.0,0.6737,0.6737,0.7086,...,0.0,,,,,,,,,
1,0,tess,1,qwen2-audio-instruct,user_labels,0.3,1.0,0.7494,0.7494,0.7886,...,0.0,,,,,,,,,
2,0,tess,1,qwen2-audio-instruct,user_labels,0.7,1.0,0.692,0.692,0.7186,...,0.0,,,,,,,,,
3,0,tess,1,qwen2-audio-instruct,user_labels,1.0,1.0,0.6133,0.7009,0.7129,...,0.0,,,,,,,,,
4,0,tess,1,qwen2-audio-instruct,user_labels,1.2,1.0,0.5736,0.6555,0.6643,...,0.0,,,,,,,,,


In [74]:
# Inspect the first item of the test split loaded above to see which fields
# a sample carries.
test[0]

{'key': 'tess-OAF-should-fear',
 'audio': array([-2.2856926e-05, -7.4530079e-05, -5.1498064e-05, ...,
        -1.9920158e-04, -2.2834234e-04,  0.0000000e+00],
       shape=(23805,), dtype=float32),
 'label': 'Fear',
 'agegroup': '64',
 'gender': 'Female',
 'language': 'English'}

In [75]:
# Columns for the summary: the swept hyperparameter, two global scores, and
# three fairness metrics for every sensitive attribute of the dataset.
cols = [hparam, 'global_f1_macro', 'global_accuracy_unweighted'] + \
       [f"{attr}_{metric}" for attr in sensitive_attrs
        for metric in ['statistical_parity', 'equal_opportunity', 'overall_accuracy_equality']]

# Mean and standard deviation of every metric over the rows of df that share
# the same hyperparameter value.
grouped_stats = df[cols].groupby([hparam]).agg(['mean', 'std']).reset_index()

# Human-readable table: one "mean ± std" string per metric, in percent.
# A fixed two-decimal format is used instead of astype(str), which drops
# trailing zeros and produced mixed precision such as "7.0 ± 2.07".
two_decimals = '{:.2f}'.format
grouped = grouped_stats[[hparam]].copy()
for col in cols:
    if col == hparam:
        continue
    mean_vals = (grouped_stats[(col, 'mean')] * 100).round(2)
    std_vals = (grouped_stats[(col, 'std')] * 100).round(2)
    grouped[col] = mean_vals.map(two_decimals) + ' ± ' + std_vals.map(two_decimals)

# The best row is chosen on the numeric mean of the macro F1 (not on the
# formatted strings) and then looked up in the formatted table.
best_idx = (grouped_stats[('global_f1_macro', 'mean')]).idxmax()
best_row = grouped.loc[best_idx]

display(best_row)

temperature                                    0.0001
global_f1_macro                          73.59 ± 4.59
global_accuracy_unweighted               76.34 ± 3.95
agegroup_statistical_parity               2.58 ± 0.44
agegroup_equal_opportunity                 7.0 ± 2.07
agegroup_overall_accuracy_equality      25.23 ± 18.38
Name: 0, dtype: object