In [1]:
import os
from pathlib import Path

cwd = Path.cwd()
if cwd.name == "notebooks":
    os.chdir(cwd.parent)

In [2]:
from EmoBox.EmoBox import EmoDataset, EmoEval

In [3]:
dataset = "cremad"
fold = 1  # different datasets have different number of folds, which can be find in data/
user_data_dir = "./" # path to EmoBox - FIXED: Changed from "Emobox" to "EmoBox"
meta_data_dir = "EmoBox/data/" # path to data folder - FIXED: Changed from "Emobox" to "EmoBox"

train = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="train")
test = EmoDataset(dataset, user_data_dir, meta_data_dir, fold=fold, split="test")

since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 5586 samples, only 5586 exists in data dir EmoBox/data/
load in 1856 samples, only 1856 exists in data dir EmoBox/data/
Num. training samples 5586
Num. valid samples 0
Num. test samples 1856
Using label_map {'ANG': 'Angry', 'DIS': 'Disgust', 'FEA': 'Fear', 'HAP': 'Happy', 'NEU': 'Neutral', 'SAD': 'Sad'}
since there is no official valid data, use random split for train valid split, with a ratio of [80, 20]
load in 5586 samples, only 5586 exists in data dir EmoBox/data/
load in 1856 samples, only 1856 exists in data dir EmoBox/data/
Num. training samples 5586
Num. valid samples 0
Num. test samples 1856
Using label_map {'ANG': 'Angry', 'DIS': 'Disgust', 'FEA': 'Fear', 'HAP': 'Happy', 'NEU': 'Neutral', 'SAD': 'Sad'}


In [20]:
test[0]

{'key': 'cremad-1052-WSI-ANG-XX',
 'audio': array([-0.0055542 , -0.00469971, -0.00601196, ...,  0.        ,
         0.        ,  0.        ], shape=(36837,), dtype=float32),
 'label': 'Angry',
 'gender': 'Female',
 'age': 33,
 'race': 'Caucasian',
 'ethnicity': 'Not Hispanic',
 'language': 'English'}

In [19]:
from collections import Counter

genders, ethnicities, ages, races, languages = [], [], [], [], []
if 'gender' in test[0]:
    genders = [data['gender'] for data in test]
if 'ethnicity' in test[0]:
    ethnicities = [data['ethnicity'] for data in test]
if 'age' in test[0]:
    ages = [data['age'] for data in test]
if 'race' in test[0]:
    races = [data['race'] for data in test]
if 'language' in test[0]:
    languages = [data['language'] for data in test]

if genders:
    print("Gender distribution:", Counter(genders))
if ethnicities:
    print("Ethnicity distribution:", Counter(ethnicities))
if ages:
    print("Age distribution:", Counter(ages))
if races:
    print("Race distribution:", Counter(races))
if languages:
    print("Language distribution:", Counter(languages))

Gender distribution: Counter({'Male': 1006, 'Female': 850})
Ethnicity distribution: Counter({'Not Hispanic': 1659, 'Hispanic': 197})
Age distribution: Counter({21: 144, 25: 138, 33: 121, 22: 112, 42: 106, 24: 84, 31: 72, 30: 64, 38: 61, 29: 57, 51: 56, 34: 54, 57: 53, 62: 47, 36: 46, 48: 46, 56: 46, 23: 42, 20: 41, 45: 41, 27: 40, 32: 38, 40: 34, 61: 31, 46: 29, 49: 27, 35: 22, 37: 21, 74: 20, 52: 20, 66: 20, 28: 19, 59: 19, 58: 19, 41: 18, 53: 18, 50: 17, 44: 13})
Race distribution: Counter({'Caucasian': 1235, 'African American': 466, 'Asian': 137, 'Unknown': 18})
Language distribution: Counter({'English': 1856})


In [4]:
import numpy as np
from collections import Counter

# Get ages and sort by key
ages = []
if 'age' in test[0]:
    ages = [data['age'] for data in test]

age_counts = Counter(ages)
age_counts_sorted = dict(sorted(age_counts.items()))

print("Age Distribution (sorted):")
# for age, count in age_counts_sorted.items():
#     print(f"  {age}: {count}")

ages_array = np.array(ages)
print(f"\nAge Statistics:")
print(f"  Min: {ages_array.min()}")
print(f"  Max: {ages_array.max()}")
print(f"  Mean: {ages_array.mean():.2f}")
print(f"  Median: {np.median(ages_array):.2f}")
print(f"  Std: {ages_array.std():.2f}")

# Suggest binning strategies
print("\n--- Binning Strategies ---")

# 1. Equal-width bins
print("\n1. Equal-width bins (3 bins):")
bins_equal = [ages_array.min(), 30, 50, ages_array.max()]
labels_equal = ['Young (<30)', 'Middle (30-50)', 'Senior (>50)']
binned_equal = np.digitize(ages_array, bins_equal[:-1]) - 1
for i, label in enumerate(labels_equal):
    count = np.sum(binned_equal == i)
    print(f"  {label}: {count} ({count/len(ages)*100:.1f}%)")

# 2. Quantile-based bins (balanced)
print("\n2. Quantile-based bins (3 bins, balanced):")
quantiles = np.percentile(ages_array, [33.33, 66.67])
bins_quantile = [ages_array.min()] + list(quantiles) + [ages_array.max()]
print(f"  Bin edges: {[f'{b:.1f}' for b in bins_quantile]}")
binned_quantile = np.digitize(ages_array, bins_quantile[:-1]) - 1
labels_quantile = [f'Q1 (<{quantiles[0]:.0f})', 
                   f'Q2 ({quantiles[0]:.0f}-{quantiles[1]:.0f})', 
                   f'Q3 (>{quantiles[1]:.0f})']
for i, label in enumerate(labels_quantile):
    count = np.sum(binned_quantile == i)
    print(f"  {label}: {count} ({count/len(ages)*100:.1f}%)")

# 2. Quantile-based bins (balanced)
print("\n2. Quantile-based bins (4 bins, balanced):")
quantiles = np.percentile(ages_array, [25, 50, 75])
bins_quantile = [ages_array.min()] + list(quantiles) + [ages_array.max()]
print(f"  Bin edges: {[f'{b:.1f}' for b in bins_quantile]}")
binned_quantile = np.digitize(ages_array, bins_quantile[:-1]) - 1
labels_quantile = [f'Q1 (<{quantiles[0]:.0f})', 
                   f'Q2 ({quantiles[0]:.0f}-{quantiles[1]:.0f})', 
                   f'Q3 ({quantiles[1]:.0f}-{quantiles[2]:.0f})',
                   f'Q4 (>{quantiles[2]:.0f})']
for i, label in enumerate(labels_quantile):
    count = np.sum(binned_quantile == i)
    print(f"  {label}: {count} ({count/len(ages)*100:.1f}%)")

# 3. Domain-specific bins (common age groups)
print("\n3. Domain-specific bins:")
bins_domain = [0, 25, 35, 55, 100]
labels_domain = ['0-25', '25-35', '35-55', '55+']
binned_domain = np.digitize(ages_array, bins_domain[:-1]) - 1
for i, label in enumerate(labels_domain):
    count = np.sum(binned_domain == i)
    if count > 0:
        print(f"  {label}: {count} ({count/len(ages)*100:.1f}%)")

# 4. Binary split at median
print("\n4. Binary split at median:")
median_age = np.median(ages_array)
young = np.sum(ages_array <= median_age)
old = np.sum(ages_array > median_age)
print(f"  Young (≤{median_age:.0f}): {young} ({young/len(ages)*100:.1f}%)")
print(f"  Old (>{median_age:.0f}): {old} ({old/len(ages)*100:.1f}%)")

print("\n--- Recommendation ---")
print("Choose based on:")
print("  - Equal-width: Simple interpretation")
print("  - Quantile: Balanced sample sizes (best for fairness analysis)")
print("  - Domain-specific: Meaningful age groups")
print("  - Binary: Maximum statistical power, simplest analysis")

Age Distribution (sorted):

Age Statistics:
  Min: 20
  Max: 74
  Mean: 36.47
  Median: 33.00
  Std: 13.18

--- Binning Strategies ---

1. Equal-width bins (3 bins):
  Young (<30): 677 (36.5%)
  Middle (30-50): 813 (43.8%)
  Senior (>50): 366 (19.7%)

2. Quantile-based bins (3 bins, balanced):
  Bin edges: ['20.0', '28.0', '42.0', '74.0']
  Q1 (<28): 601 (32.4%)
  Q2 (28-42): 627 (33.8%)
  Q3 (>42): 628 (33.8%)

2. Quantile-based bins (4 bins, balanced):
  Bin edges: ['20.0', '25.0', '33.0', '46.0', '74.0']
  Q1 (<25): 423 (22.8%)
  Q2 (25-33): 428 (23.1%)
  Q3 (33-46): 537 (28.9%)
  Q4 (>46): 468 (25.2%)

3. Domain-specific bins:
  0-25: 423 (22.8%)
  25-35: 603 (32.5%)
  35-55: 575 (31.0%)
  55+: 255 (13.7%)

4. Binary split at median:
  Young (≤33): 972 (52.4%)
  Old (>33): 884 (47.6%)

--- Recommendation ---
Choose based on:
  - Equal-width: Simple interpretation
  - Quantile: Balanced sample sizes (best for fairness analysis)
  - Domain-specific: Meaningful age groups
  - Binary: 

In [None]:
def quantile_binning(attr_values: np.ndarray, n_bins: int = 4):
    quantiles = np.percentile(attr_values, [q * 100 / n_bins for q in range(1, n_bins)])
    bins_quantile = [attr_values.min()] + list(quantiles) + [attr_values.max()]
    binned_quantile = np.digitize(attr_values, bins_quantile[:-1]) - 1
    labels_quantile = []
    for i in range(n_bins):
        if i == 0:
            labels_quantile.append(f'{bins_quantile[0]:.0f}-{quantiles[0]:.0f}')
        elif i == n_bins - 1:
            labels_quantile.append(f'{quantiles[-1]:.0f}-{bins_quantile[-1]:.0f}')
        else:
            labels_quantile.append(f'{quantiles[i-1]:.0f}-{quantiles[i]:.0f}')
    
    for i, label in enumerate(labels_quantile):
        count = np.sum(binned_quantile == i)
        print(f"  {label}: {count} ({count/len(attr_values)*100:.1f}%)")

    binned_labels = np.array([labels_quantile[i] for i in binned_quantile])
    return binned_labels

quantile_binning(ages_array, n_bins=4)

  20-25: 423 (22.8%)
  25-33: 428 (23.1%)
  33-46: 537 (28.9%)
  46-74: 468 (25.2%)


array(['33-46', '20-25', '33-46', ..., '25-33', '46-74', '46-74'],
      shape=(1856,), dtype='<U5')