# Similarity Modeling 1 - Audio and Video Fused

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import utils.gt_and_modeling_dfs as prepare_df
import utils.evaluation_tools as eval

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


In [10]:
# Frame-rate constant for this notebook's configuration.
FPS_TO_SAVE = 25

# Per-episode settings: raw video file, the timestamp at which the episode is
# split into train / test portions, and the ground-truth annotation workbook.
EPISODES = {
    "Muppets-02-01-01": dict(
        path="../data/raw/Muppets-02-01-01.avi",
        train_split_timestamp="19:30",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_01.xlsx",
    ),
    "Muppets-02-04-04": dict(
        path="../data/raw/Muppets-02-04-04.avi",
        train_split_timestamp="19:52",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_04.xlsx",
    ),
    "Muppets-03-04-03": dict(
        path="../data/raw/Muppets-03-04-03.avi",
        train_split_timestamp="19:54",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_03.xlsx",
    ),
}

# Episode name -> numeric id used in the "Video" column of the feature tables.
# Built by pairing the episode names (in declaration order) with their ids.
EPISODE_NAME_TO_VIDEO_ID = dict(zip(EPISODES, (211, 244, 343)))

# Binary target columns, one per character modelled in this notebook.
SIM1_CHARACTER_LABEL_COLS = [
    "Kermit",
    "StatlerWaldorf",
    "Fozzie Bear",
]

# Bookkeeping columns that identify a frame and must never be used as features.
META_COLS = [
    "Video",
    "Frame_number",
    "Timestamp",
    "frame",
]

# Merge Feature Spaces

In [11]:
# Load the audio and visual feature tables that are fused below.
audio_df  = pd.read_csv("../data/processed/feature_spaces/audio_sim1.csv")
visual_df = pd.read_csv("../data/processed/feature_spaces/visual_sim1.csv")

# The character labels are taken from audio_df only. Remove any copies carried
# by visual_df so the merge does not emit suffixed duplicates (<col>_x / <col>_y).
# errors="ignore": a label column missing from the visual table is not an error
# here -- there is simply nothing to remove.
visual_df = visual_df.drop(columns=SIM1_CHARACTER_LABEL_COLS, errors="ignore")

# --- 0) Safety: keep only labels that exist ---
SIM1_CHARACTER_LABEL_COLS = [
    c for c in SIM1_CHARACTER_LABEL_COLS if c in audio_df.columns
]
assert len(SIM1_CHARACTER_LABEL_COLS) > 0, "No character label columns found."

# --- 1) Merge feature spaces (early fusion) ---
# validate="one_to_one" makes pandas raise a MergeError if the key triple is
# not unique on either side; without it duplicated keys would silently
# multiply rows and distort every downstream metric.
fused_df = audio_df.merge(
    visual_df,
    on=["Video", "Frame_number", "Timestamp"],
    how="inner",
    validate="one_to_one",
)

# An inner join drops unmatched rows without warning; surface the row counts
# and fail fast if nothing matched at all.
print(f"[merge] audio={len(audio_df)}, visual={len(visual_df)}, fused={len(fused_df)}")
assert len(fused_df) > 0, "Merging audio and visual features produced no rows."

fused_df.to_csv("../data/processed/feature_spaces/fused_sim1.csv", index=False)

## Prepare splits

In [12]:
# Re-read the fused feature table from disk so this cell can run on its own
# once the merge cell has produced the CSV.
FUSED_DF_SIM1 = pd.read_csv("../data/processed/feature_spaces/fused_sim1.csv")

# --- 2) Split ---
# Project helper; the captured output shows it reporting per-episode
# train/test row counts.
train_df, test_df = prepare_df.split_feature_space_df(
    feature_df=FUSED_DF_SIM1,
    EPISODES=EPISODES,
    EPISODE_NAME_TO_VIDEO_ID=EPISODE_NAME_TO_VIDEO_ID
)

# --- 3) Build X/y ---
# Everything that is neither a label nor a bookkeeping column is a feature.
DROP_COLS = SIM1_CHARACTER_LABEL_COLS + META_COLS

# numeric safety (important): decide the feature set ONCE, on the training
# split, so that train and test cannot end up with different columns.
X_train_df = train_df.drop(columns=DROP_COLS).select_dtypes(include="number")

# ensure identical columns and identical order. Selecting by the training
# columns (instead of running select_dtypes on the test split independently)
# keeps the two matrices aligned even if a column's inferred dtype differs
# between the splits.
X_test_df = test_df.drop(columns=DROP_COLS)[X_train_df.columns]

print("Training features:", X_train_df.columns.tolist())

# --- 4) Impute + scale ---
# Both transformers are fitted on the training split only and then applied to
# the test split, so no test statistics leak into preprocessing.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_df)
X_test  = imputer.transform(X_test_df)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# --- 5) Ground truth ---
y_true_df = test_df[SIM1_CHARACTER_LABEL_COLS].copy()

# will be filled by models; shares the test index so rows line up with y_true_df
y_pred_df = pd.DataFrame(index=test_df.index)

[split] Muppets-02-01-01 | Video=211 | train=29251, test=9421
[split] Muppets-02-04-04 | Video=244 | train=29801, test=8895
[split] Muppets-03-04-03 | Video=343 | train=29851, test=8636
[FINAL SPLIT] train=(88903, 57), test=(26952, 57)
Training features: ['mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_d1_0', 'mfcc_d1_1', 'mfcc_d1_2', 'mfcc_d1_3', 'mfcc_d1_4', 'mfcc_d1_5', 'mfcc_d1_6', 'mfcc_d1_7', 'mfcc_d1_8', 'mfcc_d1_9', 'mfcc_d1_10', 'mfcc_d1_11', 'mfcc_d1_12', 'mfcc_d2_0', 'mfcc_d2_1', 'mfcc_d2_2', 'mfcc_d2_3', 'mfcc_d2_4', 'mfcc_d2_5', 'mfcc_d2_6', 'mfcc_d2_7', 'mfcc_d2_8', 'mfcc_d2_9', 'mfcc_d2_10', 'mfcc_d2_11', 'mfcc_d2_12', 'spectral_centroid', 'f0', 'dom_H', 'dom_S', 'dom_V', 'green_frac', 'edge_mean', 'eye_blob_count', 'eye_horizontal_align', 'eye_pupil_contrast', 'brown_rhythm']


# Model Training

In [14]:
# One independent binary KNN classifier per character, keyed by label column.
models = {}

for ch in SIM1_CHARACTER_LABEL_COLS:
    print(f"\n[TRAINING] Character: {ch}")

    y_train = train_df[ch].values

    knn = KNeighborsClassifier(
        n_neighbors=5,
        weights="distance",  # closer neighbours get a larger vote (inverse distance)
    )

    knn.fit(X_train, y_train)
    models[ch] = knn

    # hard 0/1 prediction
    y_pred_df[f"{ch}_present"] = knn.predict(X_test)

    # Continuous score for ranking metrics such as MAP: the weighted vote share
    # of the positive class. The evaluation cell documents y_pred_df as holding
    # both *_present and *_score columns.
    # Assumes the positive label is 1 (per the "hard 0/1 prediction" convention
    # above) -- TODO confirm against the ground-truth encoding.
    class_labels = list(knn.classes_)
    if 1 in class_labels:
        y_pred_df[f"{ch}_score"] = knn.predict_proba(X_test)[:, class_labels.index(1)]
    else:
        # The positive class never occurs in the training split, so the model
        # can never predict it; give every test row a score of zero.
        y_pred_df[f"{ch}_score"] = 0.0


[TRAINING] Character: Kermit

[TRAINING] Character: StatlerWaldorf

[TRAINING] Character: Fozzie Bear


## Evaluation

In [15]:
# --- Evaluate the per-character KNN predictions ---
# `eval` is the project module utils.evaluation_tools imported at the top of
# the notebook (the alias shadows Python's built-in eval).
# The call's return value is unpacked into two names; the second is printed
# below as the overall MAP.
knn_metrics, knn_overall_map = eval.evaluate_multiclass(
    y_true_df=y_true_df,     # ground-truth label columns for the test split
    y_pred_df=y_pred_df,     # predictions written by the training cell (<character>_present columns)
    characters=SIM1_CHARACTER_LABEL_COLS
)

print("Overall MAP:", knn_overall_map)

Mean Average Precision (MAP) per character:
Kermit: MAP=0.600
StatlerWaldorf: MAP=0.120
Fozzie Bear: MAP=0.300

Overall MAP (all characters): 0.340
Overall MAP: 0.3400793111471246
