# Similarity Modeling 2 - Video and Audio Combined

In [None]:
import sys
from pathlib import Path

# Make the project root importable from this notebook.
# ".." is resolved against the current working directory, so this assumes the
# notebook is executed from a directory one level below the project root.
# The membership check keeps re-running the cell from adding duplicate entries.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local evaluation helpers; presumably resolved through the sys.path
# entry added above, which is why this import comes after the path setup.
import utils.evaluation_tools as evaluation_tools

import pandas as pd
import numpy as np


In [None]:
# Install the praat-parselmouth package into the environment of the running kernel.
# NOTE(review): no cell visible in this notebook imports parselmouth - confirm
# that this install is still required here.
%pip install praat-parselmouth

In [None]:
# --- Configuration ------------------------------------------------------------
# One label column per character, and the columns that identify a single frame.
CHAR_COLS = ["Pigs", "Miss Piggy", "Cook"]
KEY_COLS = ["Video", "Frame_number"]

# A fused score at or above this value marks the character as present.
PRESENCE_THRESHOLD = 0.5

# Per-character fusion weights, given as (audio weight, visual weight).
weights = {
    "Pigs": (0.6, 0.4),
    "Miss Piggy": (0.2, 0.8),
    "Cook": (0.8, 0.2),
}

# --- Load the per-modality predictions and align them frame by frame ----------
audio_pred = pd.read_csv("../data/processed/preds/audio_sim2_pred.csv")
visual_pred = pd.read_csv("../data/processed/preds/visual_sim2_pred.csv")

# Inner join: only frames present in both prediction files are kept. Columns
# that exist in both inputs are disambiguated with _audio / _visual suffixes.
df = pd.merge(
    audio_pred,
    visual_pred,
    how="inner",
    on=KEY_COLS,
    suffixes=("_audio", "_visual"),
)

# --- Late fusion: weighted sum of the two modality scores ---------------------
for character in CHAR_COLS:
    w_audio, w_visual = weights[character]
    audio_part = w_audio * df[f"{character}_score_audio"]
    visual_part = w_visual * df[f"{character}_score_visual"]
    fused = audio_part + visual_part

    df[f"{character}_score"] = fused
    # Binarise the fused score into a 0/1 presence flag.
    df[f"{character}_present"] = fused.ge(PRESENCE_THRESHOLD).astype(int)

# --- Attach ground-truth labels -----------------------------------------------
score_cols = [f"{name}_score" for name in CHAR_COLS]
present_cols = [f"{name}_present" for name in CHAR_COLS]

# The label columns are taken from the visual feature-space file.
labels = pd.read_csv("../data/processed/feature_spaces/visual_sim2.csv")
gt = pd.merge(
    labels[KEY_COLS + CHAR_COLS],
    df[KEY_COLS + score_cols + present_cols],
    how="inner",
    on=KEY_COLS,
)

# --- Evaluate -----------------------------------------------------------------
# `gt` carries both the label columns and the fused prediction columns, so it is
# passed as the prediction frame while its CHAR_COLS slice serves as the truth.
# NOTE(review): which prediction columns evaluate_multiclass reads is defined in
# utils.evaluation_tools - confirm against that helper.
metrics_fused, overall_fused = evaluation_tools.evaluate_multiclass(
    y_true_df=gt[CHAR_COLS],
    y_pred_df=gt,
    characters=CHAR_COLS,
)

print("Overall MAP (Fused):", overall_fused)

In [None]:
# Persist the fused predictions: the frame keys followed by the fused score and
# the 0/1 presence flag of every character. Ground-truth columns are left out.
df_out = gt[
    KEY_COLS
    + [f"{c}_score" for c in CHAR_COLS]
    + [f"{c}_present" for c in CHAR_COLS]
]

# This notebook fuses the *sim2* audio/visual predictions, so the result is
# written under the sim2 name. The previous target, fused_sim1_pred.csv, did not
# match the sim2 inputs and would have clobbered any sim1 output of that name.
df_out.to_csv("../data/processed/preds/fused_sim2_pred.csv", index=False)

# Multimodal Fusion: Discussion and Conclusions


# Final Remarks

