In [65]:
import os
from pathlib import Path

# Make relative paths resolve from the repository root: when the kernel was
# started inside a directory named "notebooks", step up to its parent.
cwd = Path.cwd()
started_in_notebooks_dir = (cwd.name == "notebooks")
if started_in_notebooks_dir:
    os.chdir(cwd.parent)

In [None]:
import pandas as pd
from mllm_emotion_classifier.utils import add_fairness_metrics_to_df
from EmoBox.EmoBox import EmoDataset

In [None]:
# ---- Experiment selection ----
# Which decoding hyper-parameter sweep to analyse.
hparam = 'temperature' # or 'top_p'
assert hparam in ['temperature', 'top_p'], "hparam must be either 'temperature' or 'top_p'"

dataset = 'tess' # iemocap, meld, cremad, ravdess, emovdb
fold = None # Set to an integer fold number if needed, else None to aggregate all folds
sensitive_attr = 'gender' # gender, age, ethnicity, race
model = 'qwen2-audio-instruct'

# ---- Paths ----
metadata_dir = Path('EmoBox/data/')
dataset_path = metadata_dir / dataset
# Number of sub-directories of the dataset folder whose name starts with "fold_".
n_folds = sum(
    1
    for entry in dataset_path.iterdir()
    if entry.is_dir() and entry.name.startswith("fold_")
)
runs_subdir = "temperature_runs" if hparam == 'temperature' else "topp_runs"
out_dir = Path('outputs') / runs_subdir

# ---- Label set ----
# Fold 1 of the test split is always loaded here, independent of `fold`;
# in this cell only its label_map is read, to collect the emotion labels.
test = EmoDataset(dataset, './', metadata_dir, fold=1, split="test")
emotions = set(test.label_map.values())

# ---- Load per-fold result tables ----
fold_results_dir = out_dir / model / dataset
if fold is not None:
    # A single fold was requested: read just that CSV.
    results_csv = fold_results_dir / f'fold_{fold}.csv'
    df = pd.read_csv(results_csv)
else:
    # No fold requested: stack the CSVs of folds 1..n_folds into one frame.
    dfs = []
    for f in range(1, n_folds + 1):
        results_csv = fold_results_dir / f'fold_{f}.csv'
        dfs.append(pd.read_csv(results_csv))
    df = pd.concat(dfs, ignore_index=True)

print(len(df), "rows")
df.head(5)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 5168 samples, only 5168 exists in data dir EmoBox/data
load in 1719 samples, only 1719 exists in data dir EmoBox/data
Num. training samples 5168
Num. valid samples 0
Num. test samples 1719
Using label_map {'Amused': 'Amused', 'Sleepy': 'Sleepy', 'Angry': 'Angry', 'Disgust': 'Disgust', 'Neutral': 'Neutral'}
60 rows


Unnamed: 0,run,dataset,fold,model,prompt,temperature,valid_rate,global_f1_macro,global_f1_weighted,global_accuracy_unweighted,...,language_English_classwise_negative_predictive_value_Amused,language_English_classwise_negative_predictive_value_Angry,language_English_classwise_negative_predictive_value_Disgust,language_English_classwise_negative_predictive_value_Neutral,language_English_classwise_negative_predictive_value_Sleepy,language_English_classwise_f1_score_Amused,language_English_classwise_f1_score_Angry,language_English_classwise_f1_score_Disgust,language_English_classwise_f1_score_Neutral,language_English_classwise_f1_score_Sleepy
0,0,emovdb,1,qwen2-audio-instruct,user_labels,0.0001,1.0,0.7077,0.8485,0.8549,...,0.9574,0.9928,0.9891,0.9526,0.9108,0.8414,0.9577,0.7927,0.8389,0.8155
1,0,emovdb,1,qwen2-audio-instruct,user_labels,0.3,1.0,0.7118,0.8587,0.8631,...,0.9643,0.9942,0.9859,0.9239,0.9498,0.858,0.9462,0.7556,0.8191,0.8919
2,0,emovdb,1,qwen2-audio-instruct,user_labels,0.7,1.0,0.6502,0.7835,0.788,...,0.9446,0.9858,0.9918,0.8641,0.941,0.8212,0.9442,0.6515,0.6321,0.8523
3,0,emovdb,1,qwen2-audio-instruct,user_labels,1.0,1.0,0.6933,0.8302,0.8231,...,0.931,0.9935,0.9724,0.9608,0.9179,0.7774,0.9374,0.8199,0.7944,0.8305
4,0,emovdb,1,qwen2-audio-instruct,user_labels,1.2,1.0,0.648,0.7784,0.775,...,0.942,0.9985,0.9575,0.9241,0.8907,0.7785,0.8603,0.7188,0.7649,0.7658


In [68]:
# Peek at the first item of the test split to see which fields each sample carries.
first_test_sample = test[0]
first_test_sample

{'key': 'emovdb-sam-Amused-0384',
 'audio': array([-0.00119863,  0.00034247,  0.0015411 , ...,  0.00291096,
         0.00308219,  0.00308219], shape=(131361,), dtype=float32),
 'label': 'Amused',
 'gender': 'Male',
 'language': 'English'}

In [69]:
# Fairness criteria to attach to the results frame, one helper call per name.
# The previous version of this cell requested 'negative_predictive_parity'
# twice in a row (copy-paste slip); every criterion is now listed exactly once.
# NOTE(review): if the duplicated line was meant to be a different criterion,
# add its name to this list.
FAIRNESS_METRICS = [
    'statistical_parity',
    'equal_opportunity',
    'equal_non_opportunity',
    'predictive_parity',
    'negative_predictive_parity',
    'overall_accuracy_equality',
]

# Forwarded unchanged to the helper as `run=`.
# NOTE(review): presumably None means "do not restrict to one run" - confirm in
# mllm_emotion_classifier.utils.add_fairness_metrics_to_df.
run = None

# `df` is rebound on each iteration so the result of one call feeds the next,
# exactly as the original chain of assignments did.
for fairness_name in FAIRNESS_METRICS:
    df = add_fairness_metrics_to_df(
        df,
        emotions,
        sensitive_attr=sensitive_attr,
        fairness_name=fairness_name,
        run=run,
    )

In [70]:
# Metrics summarised per hyper-parameter value: two global scores followed by
# the fairness columns.
metric_cols = [
    'global_f1_macro', 'global_accuracy_unweighted',
    'statistical_parity', 'equal_opportunity', 'equal_non_opportunity',
    'predictive_parity', 'negative_predictive_parity', 'overall_accuracy_equality',
]

# Mean of every metric for each distinct value of the swept hyper-parameter.
grouped = df.groupby([hparam])[metric_cols].mean().reset_index()

# Express the metrics as percentages rounded to two decimals; the
# hyper-parameter column itself is left untouched.
grouped[metric_cols] = (grouped[metric_cols] * 100).round(2)

grouped

Unnamed: 0,temperature,global_f1_macro,global_accuracy_unweighted,statistical_parity,equal_opportunity,equal_non_opportunity,predictive_parity,negative_predictive_parity,overall_accuracy_equality
0,0.0001,68.3,82.61,2.89,7.26,1.78,6.48,1.83,5.98
1,0.3,70.45,84.73,2.18,6.54,1.49,5.52,1.38,4.46
2,0.7,67.33,80.74,2.38,8.39,1.79,5.92,1.5,4.9
3,1.0,68.42,81.86,2.65,7.14,1.64,7.51,1.78,5.85
4,1.2,66.75,79.11,2.67,7.99,1.72,8.87,1.85,6.15
5,1.5,48.16,58.59,2.54,6.64,1.7,4.07,2.32,2.07


In [71]:
# Select the hyper-parameter setting with the highest mean macro-F1.
best_idx = grouped['global_f1_macro'].idxmax()
best_row = grouped.loc[best_idx]
best_row

temperature                    0.30
global_f1_macro               70.45
global_accuracy_unweighted    84.73
statistical_parity             2.18
equal_opportunity              6.54
equal_non_opportunity          1.49
predictive_parity              5.52
negative_predictive_parity     1.38
overall_accuracy_equality      4.46
Name: 1, dtype: float64