# Similarity Modeling 2 - Video and Audio Fused

In [24]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import utils.gt_and_modeling_dfs as prepare_df
import utils.evaluation_tools as eval

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Frame-rate constant. It is not referenced by the cells visible in this
# notebook (presumably consumed by the frame-extraction step -- TODO confirm).
FPS_TO_SAVE = 25

# Per-episode settings keyed by episode name. The whole mapping is passed to
# prepare_df.split_feature_space_df when the train/test split is built.
#   path                  -> raw video file
#   train_split_timestamp -> split point given to the splitter
#                            (presumably "mm:ss" -- TODO confirm)
#   ground_truth_path     -> ground-truth annotation workbook
EPISODES = {
    "Muppets-02-01-01": dict(
        path="../data/raw/Muppets-02-01-01.avi",
        train_split_timestamp="19:30",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_01.xlsx",
    ),
    "Muppets-02-04-04": dict(
        path="../data/raw/Muppets-02-04-04.avi",
        train_split_timestamp="19:52",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_04.xlsx",
    ),
    "Muppets-03-04-03": dict(
        path="../data/raw/Muppets-03-04-03.avi",
        train_split_timestamp="19:54",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_03.xlsx",
    ),
}

# Episode name -> numeric id; also passed to the splitter, whose log lines
# report it as "Video=<id>".
EPISODE_NAME_TO_VIDEO_ID = dict(
    [
        ("Muppets-02-01-01", 211),
        ("Muppets-02-04-04", 244),
        ("Muppets-03-04-03", 343),
    ]
)

# Label columns, one per character to detect.
SIM2_CHARACTER_LABEL_COLS = [
    "Pigs",
    "Miss Piggy",
    "Cook",
]

# Frame-identifying columns; dropped before the feature matrix is built.
META_COLS = [
    "Video",
    "Frame_number",
    "Timestamp",
]

# Merge Feature Spaces

In [None]:
# Load the two per-modality feature tables that are fused below.
audio_df  = pd.read_csv("../data/processed/feature_spaces/audio_sim2.csv")
visual_df = pd.read_csv("../data/processed/feature_spaces/visual_sim2.csv")

# --- 0) Safety: keep only labels that exist ---
# NOTE: this narrows the config-cell constant; every later cell reads the
# narrowed list.
SIM2_CHARACTER_LABEL_COLS = [
    c for c in SIM2_CHARACTER_LABEL_COLS if c in audio_df.columns
]
assert len(SIM2_CHARACTER_LABEL_COLS) > 0, "No character label columns found."

# --- 1) Merge feature spaces (early fusion) ---
# Join on the shared frame identifiers from the config cell (META_COLS) rather
# than repeating the literal list here.
# validate="one_to_one" makes pandas raise MergeError if a key is duplicated on
# either side; without it, duplicated frames would silently multiply rows.
fused_df = audio_df.merge(
    visual_df,
    on=META_COLS,
    how="inner",
    validate="one_to_one",
)

# An inner join drops every frame missing from either modality, so report the
# row counts and refuse to write an empty table.
print(f"[merge] audio={len(audio_df)}, visual={len(visual_df)}, fused={len(fused_df)}")
assert len(fused_df) > 0, "Audio/visual merge produced no rows; check the key columns."

fused_df.to_csv("../data/processed/feature_spaces/fused_sim2.csv", index=False)

## Prepare splits

In [26]:
# Reload the fused table from disk so this cell does not depend on the merge
# cell having been run in the current kernel session.
FUSED_DF_SIM2 = pd.read_csv("../data/processed/feature_spaces/fused_sim2.csv")
# Second set of character label columns present in the fused table; they are
# dropped from the features together with SIM2_CHARACTER_LABEL_COLS.
RENAMED_CHAR_COLS = ['Piggy', 'OtherPigs', 'Chef']

# --- 2) Split ---
train_df, test_df = prepare_df.split_feature_space_df(
    feature_df=FUSED_DF_SIM2,
    EPISODES=EPISODES,
    EPISODE_NAME_TO_VIDEO_ID=EPISODE_NAME_TO_VIDEO_ID
)

# --- 3) Build X/y ---
# Everything that is a label or a frame identifier must stay out of X.
DROP_COLS = SIM2_CHARACTER_LABEL_COLS + RENAMED_CHAR_COLS + META_COLS

# numeric safety (important): decide the feature set from TRAIN only ...
X_train_df = train_df.drop(columns=DROP_COLS).select_dtypes(include="number")

# ... then give TEST exactly the same columns in the same order. Filtering both
# frames independently could yield different column sets if a column's dtype
# differs between the two splits.
X_test_df = test_df.drop(columns=DROP_COLS)[X_train_df.columns]

# Fail with a readable message if a training feature is non-numeric in test.
non_numeric_in_test = X_test_df.columns.difference(
    X_test_df.select_dtypes(include="number").columns
)
assert non_numeric_in_test.empty, (
    f"Non-numeric test columns for numeric training features: {list(non_numeric_in_test)}"
)

print("Training features:", X_train_df.columns.tolist())

# --- 4) Impute + scale ---
# Both transformers are fitted on the training split only and then applied to
# the test split.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_df)
X_test  = imputer.transform(X_test_df)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# --- 5) Ground truth ---
y_true_df = test_df[SIM2_CHARACTER_LABEL_COLS].copy()

# will be filled by models; shares the test index so rows line up with y_true_df
y_pred_df = pd.DataFrame(index=test_df.index)

[split] Muppets-02-01-01 | Video=211 | train=29251, test=9419
[split] Muppets-02-04-04 | Video=244 | train=29801, test=8893
[split] Muppets-03-04-03 | Video=343 | train=29851, test=8634
[FINAL SPLIT] train=(88903, 51), test=(26946, 51)
Training features: ['f0', 'f0_voiced', 'onset_strength', 'rhythm_strength', 'lbp_0', 'lbp_1', 'lbp_2', 'lbp_3', 'lbp_4', 'lbp_5', 'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9', 'lbp_10', 'lbp_11', 'lbp_12', 'lbp_13', 'lbp_14', 'lbp_15', 'lbp_16', 'lbp_17', 'lbp_18', 'lbp_19', 'lbp_20', 'lbp_21', 'lbp_22', 'lbp_23', 'lbp_24', 'lbp_25', 'lbp_26', 'lbp_27', 'lbp_28', 'lbp_29', 'lbp_30', 'lbp_31', 'hog_mean', 'hog_std', 'flow_mag_mean', 'flow_mag_std', 'flow_horiz_ratio']


# Model Training

In [27]:
# One independent binary classifier per character, keyed by label column name.
models = {}

for ch in SIM2_CHARACTER_LABEL_COLS:
    print(f"\n[TRAINING] Character: {ch}")

    y_train = train_df[ch].values

    clf = HistGradientBoostingClassifier(
        loss="log_loss",
        learning_rate=0.05,
        max_depth=6,
        max_iter=300,
        l2_regularization=1.0,
        early_stopping=True,
        random_state=497
    )

    clf.fit(X_train, y_train)
    models[ch] = clf

    # hard 0/1 prediction
    y_pred = clf.predict(X_test)
    y_pred_df[f"{ch}_present"] = y_pred

    # Continuous confidence for ranking metrics such as average precision.
    # Column 1 of predict_proba is the probability of the positive class
    # (assumes binary 0/1 labels, so clf.classes_ is [0, 1]).
    # NOTE(review): the "<character>_score" name follows the evaluation cell's
    # comment; confirm that eval.evaluate_multiclass reads this column.
    y_score = clf.predict_proba(X_test)[:, 1]
    y_pred_df[f"{ch}_score"] = y_score


[TRAINING] Character: Pigs

[TRAINING] Character: Miss Piggy

[TRAINING] Character: Cook


## Evaluation

In [28]:
# --- Evaluate HistGradientBoostingClassifier predictions ---
# The helper returns a pair: the first element is kept as hgb_metrics, the
# second is the overall MAP printed below.
hgb_metrics, hgb_overall_map = eval.evaluate_multiclass(
    characters=SIM2_CHARACTER_LABEL_COLS,  # one entry per evaluated character
    y_pred_df=y_pred_df,                   # per-character prediction columns from the training loop
    y_true_df=y_true_df,                   # ground truth labels of the test split
)

print(f"Overall MAP (HistGB): {hgb_overall_map!s}")

Mean Average Precision (MAP) per character:
Pigs: MAP=0.237
Miss Piggy: MAP=0.076
Cook: MAP=0.638

Overall MAP (all characters): 0.317
Overall MAP (HistGB): 0.31710478940093056


# Multimodal Fusion: Discussion and Conclusions


# Final Remarks

