# Similarity Modeling 2 - Audio Domain

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import pandas as pd

from utils import audio_tools as audioTools
from utils import gt_and_modeling_dfs as prepare_df

import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import HistGradientBoostingClassifier

from utils import evaluation_tools as eval

In [2]:
# Frames per second; handed to the audio tooling as `fps` further down.
FPS_TO_SAVE = 25

# Per-episode inputs, keyed by episode name:
#   path                  -> raw video file
#   train_split_timestamp -> timestamp used to split the episode into train/test
#                            (looks like MM:SS — confirm against the split helper)
#   ground_truth_path     -> ground-truth spreadsheet for that episode
EPISODES = {
    "Muppets-02-01-01": dict(
        path="../data/raw/Muppets-02-01-01.avi",
        train_split_timestamp="19:30",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_01.xlsx",
    ),
    "Muppets-02-04-04": dict(
        path="../data/raw/Muppets-02-04-04.avi",
        train_split_timestamp="19:52",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_04.xlsx",
    ),
    "Muppets-03-04-03": dict(
        path="../data/raw/Muppets-03-04-03.avi",
        train_split_timestamp="19:54",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_03.xlsx",
    ),
}

# Episode name -> numeric id stored in the "Video" column of the data frames.
EPISODE_NAME_TO_VIDEO_ID = dict([
    ("Muppets-02-01-01", 211),
    ("Muppets-02-04-04", 244),
    ("Muppets-03-04-03", 343),
])

# Label columns (one binary target per character) modelled in this notebook.
SIM2_CHARACTER_LABEL_COLS = [
    "Pigs",
    "Miss Piggy",
    "Cook",
]

# Pre-processed ground truth covering all episodes; passed as `gt_df` when the
# audio feature space is built.
GROUND_TRUTH = pd.read_csv(filepath_or_buffer="../data/processed/all_ep_gt.csv")

# Build Feature Space

In [7]:
# Audio framing configuration; `fps` follows the notebook-wide FPS_TO_SAVE.
cfg = audioTools.AudioFrameConfig(sr=22050, fps=FPS_TO_SAVE)

# All arguments for the SIM2 audio feature-space builder, gathered in one place
# so the call itself stays short.
# NOTE: `frames_base_dir="../data/processed/video"` is deliberately not passed;
# it was already disabled in the original call.
_audio_sim2_kwargs = dict(
    EPISODES=EPISODES,
    EPISODE_NAME_TO_VIDEO_ID=EPISODE_NAME_TO_VIDEO_ID,
    gt_df=GROUND_TRUTH,
    character_cols=SIM2_CHARACTER_LABEL_COLS,
    out_csv_path="../data/processed/feature_spaces/audio_sim2.csv",
    audio_cache_dir="../data/raw/_audio_cache/",
    sr=cfg.sr,
    fps=cfg.fps,
    # presumably the f0 search range in Hz — TODO confirm against the helper
    minf0=50,
    maxf0=500,
)

# Builds the feature frame; the recorded output shows it is also saved to
# `out_csv_path`, which the split cell reads back.
audio_sim2 = prepare_df.build_audio_feature_space_df_sim2(**_audio_sim2_kwargs)

display(audio_sim2.head())

Muppets-02-01-01 | audio: 100%|████████████████████████████| 38670/38670 [00:01<00:00, 25535.34it/s]
Muppets-02-04-04 | audio: 100%|████████████████████████████| 38694/38694 [00:01<00:00, 26136.72it/s]
Muppets-03-04-03 | audio: 100%|████████████████████████████| 38485/38485 [00:01<00:00, 27460.00it/s]


[SIM2 audio feature space] saved (115849, 10) -> ../data/processed/feature_spaces/audio_sim2.csv


Unnamed: 0,Video,Frame_number,Timestamp,f0,f0_voiced,onset_strength,rhythm_strength,Pigs,Miss Piggy,Cook
0,211,0,00:00.00,0.0,0,0.0,0.0,0,0,0
1,211,1,00:00.04,0.0,0,0.0,0.0,0,0,0
2,211,2,00:00.08,0.0,0,0.0,0.0,0,0,0
3,211,3,00:00.12,0.0,0,0.0,0.0,0,0,0
4,211,4,00:00.16,0.0,0,0.0,0.0,0,0,0


# Split Feature Space for Modeling

In [8]:
# --- Config ---
# Identifier columns that describe a row but are not used as model inputs.
META_COLS = ["Video", "Frame_number", "Timestamp"]

# Read the feature space back from disk rather than reusing the in-memory frame.
audio_df = pd.read_csv("../data/processed/feature_spaces/audio_sim2.csv")

# Safety: restrict the label list to characters that exist as CSV columns.
SIM2_CHARACTER_LABEL_COLS = [
    label for label in SIM2_CHARACTER_LABEL_COLS if label in audio_df.columns
]
assert SIM2_CHARACTER_LABEL_COLS, "No character label columns found in audio_df."

# --- 1) Split (same logic as visual) ---
train_df, test_df = prepare_df.split_feature_space_df(
    feature_df=audio_df,
    EPISODES=EPISODES,
    EPISODE_NAME_TO_VIDEO_ID=EPISODE_NAME_TO_VIDEO_ID,
)

# --- 2) Build X/y: whatever is neither a label nor metadata is a feature ---
DROP_COLS = [*SIM2_CHARACTER_LABEL_COLS, *META_COLS]
X_train_df = train_df.drop(columns=DROP_COLS)
# Select by the training columns so both frames share one column order.
X_test_df = test_df.drop(columns=DROP_COLS)[X_train_df.columns]

col_names = list(X_train_df.columns)
print("Training features:", col_names)

# --- 3) Impute (f0 can be NaN) and 4) standardize ---
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split, so no test statistics leak into training.
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_df))
X_test = scaler.transform(imputer.transform(X_test_df))

# --- 5) Ground truth ---
y_true_df = test_df.loc[:, SIM2_CHARACTER_LABEL_COLS].copy()

# Empty frame aligned to the test rows; will be filled by models.
y_pred_df = pd.DataFrame(index=test_df.index)


[split] Muppets-02-01-01 | Video=211 | train=29251, test=9419
[split] Muppets-02-04-04 | Video=244 | train=29801, test=8893
[split] Muppets-03-04-03 | Video=343 | train=29851, test=8634
[FINAL SPLIT] train=(88903, 10), test=(26946, 10)
Training features: ['f0', 'f0_voiced', 'onset_strength', 'rhythm_strength']


# Model Training

## SVM

In [9]:
print("=== SVM (Linear) ===")

for char in SIM2_CHARACTER_LABEL_COLS:
    print(f"\n[TRAINING] Character: {char}")

    y_train = train_df[char].values
    y_test  = test_df[char].values

    clf = LinearSVC(
        C=1.0,
        class_weight="balanced",
        max_iter=5000
    )

    clf.fit(X_train, y_train)

    # hard predictions
    y_pred = clf.predict(X_test)

    # decision scores (for ROC / PR)
    y_score = clf.decision_function(X_test)

    # store
    y_pred_df[f"{char}_present"] = y_pred
    y_pred_df[f"{char}_score"] = y_score

    print(f"[OK] {char} | positives in test: {y_test.sum()} / {len(y_test)}")

=== SVM (Linear) ===

[TRAINING] Character: Pigs
[OK] Pigs | positives in test: 5695 / 26946

[TRAINING] Character: Miss Piggy
[OK] Miss Piggy | positives in test: 1677 / 26946

[TRAINING] Character: Cook
[OK] Cook | positives in test: 1315 / 26946


## GradBoost

In [10]:
print("=== Gradient Boosting ===")

for char in SIM2_CHARACTER_LABEL_COLS:
    print(f"\n[TRAINING] Character: {char}")

    y_train = train_df[char].values
    y_test  = test_df[char].values

    clf = HistGradientBoostingClassifier(
        max_depth=5,
        learning_rate=0.05,
        max_iter=200,
        class_weight="balanced",
        random_state=497
    )

    clf.fit(X_train, y_train)

    # probability for positive class
    y_score = clf.predict_proba(X_test)[:, 1]
    y_pred  = (y_score >= 0.5).astype(int)

    # store
    y_pred_df[f"{char}_present"] = y_pred
    y_pred_df[f"{char}_score"] = y_score

    print(f"[OK] {char} | positives in test: {y_test.sum()} / {len(y_test)}")

=== Gradient Boosting ===

[TRAINING] Character: Pigs
[OK] Pigs | positives in test: 5695 / 26946

[TRAINING] Character: Miss Piggy
[OK] Miss Piggy | positives in test: 1677 / 26946

[TRAINING] Character: Cook
[OK] Cook | positives in test: 1315 / 26946


# Evaluation

## SVM

In [11]:
svm_metrics, svm_overall_map = eval.evaluate_multiclass(
    y_true_df=y_true_df,     # ground truth labels
    y_pred_df=y_pred_df,     # contains *_present and *_score
    characters=SIM2_CHARACTER_LABEL_COLS
)

Mean Average Precision (MAP) per character:
Pigs: MAP=0.322
Miss Piggy: MAP=0.088
Cook: MAP=0.028

Overall MAP (all characters): 0.146


## GradBoost

In [12]:
gb_metrics, gb_overall_map = eval.evaluate_multiclass(
    y_true_df=y_true_df,
    y_pred_df=y_pred_df,
    characters=SIM2_CHARACTER_LABEL_COLS
)

Mean Average Precision (MAP) per character:
Pigs: MAP=0.322
Miss Piggy: MAP=0.088
Cook: MAP=0.028

Overall MAP (all characters): 0.146
