In [1]:
import os
from pathlib import Path

# Paths used later in the notebook are relative, so when the kernel was
# started inside a "notebooks" folder, step up to its parent directory.
cwd = Path.cwd()
started_in_notebooks_dir = cwd.name == "notebooks"
if started_in_notebooks_dir:
    os.chdir(cwd.parent)

In [2]:
import pandas as pd
from mllm_emotion_classifier.utils import add_fairness_metrics_to_df
from EmoBox.EmoBox import EmoDataset

In [None]:
# ---- Experiment selection ----
hparam = 'temperature'  # decoding hyperparameter to analyse: 'temperature' or 'top_p'
assert hparam in ['temperature', 'top_p'], "hparam must be either 'temperature' or 'top_p'"

dataset = 'cremad'  # iemocap, meld, cremad, ravdess, emovdb, tess (agegroup only)
fold = None  # integer fold number, or None to aggregate all folds
sensitive_attr = 'gender'  # gender, age, ethnicity, race
model = 'qwen2-audio-instruct'

# ---- Locations ----
metadata_dir = Path('EmoBox/data/')
dataset_path = metadata_dir / dataset
# The fold count is the number of "fold_*" sub-directories of the dataset folder.
n_folds = sum(1 for entry in dataset_path.iterdir() if entry.is_dir() and entry.name.startswith("fold_"))
runs_subdir = "temperature_runs" if hparam == 'temperature' else "topp_runs"
out_dir = Path('outputs') / runs_subdir

# ---- Label set ----
# The fold-1 test split is loaded here; its label_map values are the emotion names.
test = EmoDataset(dataset, './', metadata_dir, fold=1, split="test")
emotions = set(test.label_map.values())

# ---- Result tables ----
# Per-fold results live at <out_dir>/<model>/<dataset>/fold_<k>.csv.
results_dir = out_dir / model / dataset
if fold is not None:
    # A single fold was requested: read just that file.
    results_csv = results_dir / f'fold_{fold}.csv'
    df = pd.read_csv(results_csv)
else:
    # Aggregate folds 1..n_folds into one frame with a fresh integer index.
    fold_csvs = [results_dir / f'fold_{k}.csv' for k in range(1, n_folds + 1)]
    df = pd.concat([pd.read_csv(csv_path) for csv_path in fold_csvs], ignore_index=True)

print(len(df), "rows")
df.head(5)

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 2100 samples, only 2100 exists in data dir EmoBox/data
load in 700 samples, only 700 exists in data dir EmoBox/data
Num. training samples 2100
Num. valid samples 0
Num. test samples 700
Using label_map {'fear': 'Fear', 'disgust': 'Disgust', 'angry': 'Angry', 'sad': 'Sad', 'ps': 'Surprise', 'neutral': 'Neutral', 'happy': 'Happy'}
60 rows


Unnamed: 0,run,dataset,fold,model,prompt,temperature,valid_rate,global_f1_macro,global_f1_weighted,global_accuracy_unweighted,...,language_English_classwise_negative_predictive_value_Neutral,language_English_classwise_negative_predictive_value_Sad,language_English_classwise_negative_predictive_value_Surprise,language_English_classwise_f1_score_Angry,language_English_classwise_f1_score_Disgust,language_English_classwise_f1_score_Fear,language_English_classwise_f1_score_Happy,language_English_classwise_f1_score_Neutral,language_English_classwise_f1_score_Sad,language_English_classwise_f1_score_Surprise
0,0,tess,1,qwen2-audio-instruct,user_labels,0.0001,1.0,0.7272,0.7272,0.7671,...,0.8571,0.9979,0.974,0.9362,0.8291,0.8439,0.9458,0.0,0.6226,0.913
1,0,tess,1,qwen2-audio-instruct,user_labels,0.3,1.0,0.7365,0.7365,0.76,...,0.8671,1.0,0.9524,0.9515,0.8229,0.8768,0.936,0.1481,0.597,0.8235
2,0,tess,1,qwen2-audio-instruct,user_labels,0.7,1.0,0.7147,0.7147,0.7314,...,0.8759,1.0,0.939,0.9278,0.7394,0.7823,0.9215,0.2609,0.6135,0.7578
3,0,tess,1,qwen2-audio-instruct,user_labels,1.0,1.0,0.6301,0.7201,0.7243,...,0.8811,1.0,0.9724,0.8603,0.7073,0.7627,0.884,0.3193,0.6098,0.8973
4,0,tess,1,qwen2-audio-instruct,user_labels,1.2,1.0,0.6024,0.6885,0.6914,...,0.8876,1.0,0.9419,0.8663,0.6081,0.6824,0.8984,0.3871,0.6042,0.773


In [4]:
# Peek at the first item of the test split to see which fields each sample
# carries (e.g. the label and any sensitive-attribute entries).
test[0]

{'key': 'tess-OAF-should-fear',
 'audio': array([-2.2856926e-05, -7.4530079e-05, -5.1498064e-05, ...,
        -1.9920158e-04, -2.2834234e-04,  0.0000000e+00],
       shape=(23805,), dtype=float32),
 'label': 'Fear',
 'agegroup': '64',
 'gender': 'Female',
 'language': 'English'}

In [4]:
# Forwarded unchanged as the `run` argument of add_fairness_metrics_to_df.
# NOTE(review): None presumably means "do not restrict to one run" — confirm
# against the helper's documentation.
run = None

# Fairness criteria to compute, in order. Each name is passed as
# `fairness_name`; the summary cell later selects columns with these same
# names from `df`. Each criterion is listed exactly once (the previous
# version computed 'negative_predictive_parity' twice).
fairness_names = [
    'statistical_parity',
    'equal_opportunity',
    'equal_non_opportunity',
    'predictive_parity',
    'negative_predictive_parity',
    'overall_accuracy_equality',
]

for fairness_name in fairness_names:
    df = add_fairness_metrics_to_df(
        df,
        emotions,
        sensitive_attr=sensitive_attr,
        fairness_name=fairness_name,
        run=run,
    )

In [5]:
# Metrics to summarise: two global performance scores followed by the
# fairness columns.
metric_cols = [
    'global_f1_macro',
    'global_accuracy_unweighted',
    'statistical_parity',
    'equal_opportunity',
    'equal_non_opportunity',
    'predictive_parity',
    'negative_predictive_parity',
    'overall_accuracy_equality',
]

# Mean of every metric for each value of the swept hyperparameter.
grouped = df[[hparam] + metric_cols].groupby([hparam]).mean().reset_index()

# Express the metrics as percentages with two decimals; the hyperparameter
# column itself is left untouched.
grouped[metric_cols] = (grouped[metric_cols] * 100).round(2)

grouped

Unnamed: 0,temperature,global_f1_macro,global_accuracy_unweighted,statistical_parity,equal_opportunity,equal_non_opportunity,predictive_parity,negative_predictive_parity,overall_accuracy_equality
0,0.0001,73.88,76.99,1.26,5.8,0.82,5.97,0.93,2.51
1,0.3,72.69,75.52,1.38,6.37,0.81,4.42,0.99,3.63
2,0.7,67.88,74.79,1.42,7.67,1.17,3.82,1.11,2.93
3,1.0,65.96,72.09,1.93,9.08,1.6,4.42,1.29,4.96
4,1.2,61.26,70.79,2.02,10.28,1.93,5.11,1.56,4.78
5,1.5,57.07,65.74,1.85,10.48,2.06,4.82,1.62,3.74


In [6]:
# Pick the hyperparameter setting whose mean macro-F1 is highest.
best_idx = grouped['global_f1_macro'].idxmax()
best_row = grouped.loc[best_idx]
best_row

temperature                    0.0001
global_f1_macro               73.8800
global_accuracy_unweighted    76.9900
statistical_parity             1.2600
equal_opportunity              5.8000
equal_non_opportunity          0.8200
predictive_parity              5.9700
negative_predictive_parity     0.9300
overall_accuracy_equality      2.5100
Name: 0, dtype: float64