In [7]:
import json
import shutil
import traceback
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import src.api.controllers.generate_embeddings as generate_embeddings
import torch
import torchvision.transforms.functional as F
from sqlalchemy.orm import Session
from src.config import GAZE_FOVEA_FOV, TOBII_FOV_X
from src.db import engine
from src.db.models import Recording, SimRoomClass
from src.api.controllers.gaze_segmentation import (
    get_gaze_points,
    match_frames_to_gaze,
    parse_gazedata_file,
    mask_was_viewed
)
from src.utils import cv2_video_fps, cv2_video_frame_count, cv2_video_resolution
from torchvision.ops import masks_to_boxes
from torchvision.transforms import InterpolationMode
from ultralytics import FastSAM
from tqdm import tqdm
import faiss
from typing import Any
import re

In [2]:
with open("experiment_metadata.json") as file:
    experiment_metadata = json.load(file)
    trial_recordings_metadata = experiment_metadata["trial_recordings_metadata"]
    trial_recording_uuids = list(trial_recordings_metadata.keys())
    labeling_same_background_uuid = experiment_metadata["labeling_same_background_uuid"]
    labeling_diff_background_uuid = experiment_metadata["labeling_diff_background_uuid"]

with Session(engine) as session:
    trial_recordings = (
        session.query(Recording).filter(Recording.uuid.in_(trial_recording_uuids)).all()
    )

In [4]:
GROUND_TRUTH_PATH = Path("data/ground_truth.csv")
ground_truth_df = pd.read_csv(GROUND_TRUTH_PATH)
ground_truth_df.head()

Unnamed: 0,recording_uuid,frame_idx,class_id,mask_area
0,67b71a70-da64-467a-9fb6-91bc29265fd1,223,1,5473
1,67b71a70-da64-467a-9fb6-91bc29265fd1,42,1,1449
2,67b71a70-da64-467a-9fb6-91bc29265fd1,86,1,1405
3,67b71a70-da64-467a-9fb6-91bc29265fd1,242,1,8207
4,67b71a70-da64-467a-9fb6-91bc29265fd1,21,1,1479


In [23]:
import pandas as pd

def naive_frame_predictions(candidate_df: pd.DataFrame, confidence_threshold: float) -> pd.DataFrame:
    """
    For each frame and object, selects the candidate class with the minimum min_distance.
    Then, for each object id (i.e. track), performs majority voting across frames using only confident predictions.
    If an object never has a confident prediction (min_distance <= confidence_threshold), its prediction is -1.
    Finally, returns a DataFrame with separate rows for each combination of frame_idx, object_id, predicted class, and the corresponding min_distance.
    
    Parameters:
        candidate_df (pd.DataFrame): DataFrame with columns:
            "frame_idx", "object_id", "class_id", "min_distance", (and optionally others)
        confidence_threshold (float): The maximum min_distance for a prediction to be considered confident.
        
    Returns:
        pd.DataFrame: DataFrame with columns:
            "frame_idx", "object_id", "predicted_class", "min_distance"
    """
    # Step 1: For each (frame_idx, object_id) pair, select the candidate with the lowest min_distance.
    frame_level_preds = candidate_df.loc[
        candidate_df.groupby(["frame_idx", "object_id"])["min_distance"].idxmin()
    ].copy()
    
    # Mark each prediction as confident if its min_distance is below or equal to the threshold.
    frame_level_preds["confident"] = frame_level_preds["min_distance"] <= confidence_threshold
    
    # Step 2: For each object_id (across frames), perform majority voting using only confident predictions.
    def majority_vote_or_unknown(group: pd.DataFrame):
        confident_votes = group[group["confident"]]
        if confident_votes.empty:
            return -1
        else:
            # mode()[0] returns one candidate in case of tie.
            return confident_votes["class_id"].mode()[0]
    
    object_predictions = frame_level_preds.groupby("object_id").apply(majority_vote_or_unknown).reset_index()
    object_predictions.columns = ["object_id", "final_predicted_class"]
    
    # Step 3: Merge the object-level predictions back into the frame-level DataFrame.
    # This retains the min_distance value for each frame's candidate.
    frame_predictions_with_obj = frame_level_preds.merge(
        object_predictions, on="object_id", how="left"
    )
    
    # Select and rename the columns for the final output.
    final_df = frame_predictions_with_obj[["frame_idx", "object_id", "final_predicted_class", "min_distance"]].copy()
    final_df.rename(columns={"final_predicted_class": "predicted_class"}, inplace=True)
    
    # Optionally, sort for readability.
    final_df.sort_values(["frame_idx", "object_id"], inplace=True)
    
    return final_df

In [31]:
def compare_predictions_to_ground_truth(predictions_df: pd.DataFrame, 
                                          gt_df: pd.DataFrame, 
                                          ignore_unknown: bool = True) -> pd.DataFrame:
    """
    Compares predictions_df to gt_df on a frame-level basis and computes extended metrics.
    
    Parameters:
        predictions_df (pd.DataFrame): DataFrame containing columns:
            "frame_idx", "object_id", "predicted_class", "min_distance", etc.
        gt_df (pd.DataFrame): Ground truth DataFrame containing columns:
            "frame_idx", "class_id". Ground truth does not contain unknown predictions.
        ignore_unknown (bool): If True, unknown predictions (i.e., predicted_class == -1)
            are removed before computing metrics.
    
    Returns:
        merged_df (pd.DataFrame): A DataFrame merging ground truth with predictions. It includes:
            - frame_idx, class_id (ground truth)
            - predicted_class_set (set of predicted classes for that frame)
            - correct (True if ground truth is in predicted_class_set, False otherwise)
            - TP, FP, FN for each frame
            - per-frame precision (purity) and recall
        metrics (dict): A dictionary containing overall metrics: frame-level accuracy, 
            micro precision, micro recall, F1 score, and average purity.
    """
    # Group predictions by frame_idx to form a set of predicted classes for each frame.
    frame_preds = predictions_df.groupby("frame_idx")["predicted_class"].agg(lambda x: set(x)).reset_index()
    frame_preds.rename(columns={"predicted_class": "predicted_class_set"}, inplace=True)
    
    # Merge with ground truth on frame_idx.
    merged_df = gt_df.merge(frame_preds, on="frame_idx", how="left")
    
    # Define a function to compute per-frame metrics.
    def compute_metrics(row):
        # Start with the predicted set; if missing, use empty set.
        pred_set = row.get("predicted_class_set", set())
        if not isinstance(pred_set, set):
            pred_set = set()
        if ignore_unknown:
            pred_set = {p for p in pred_set if p != -1}
        gt = row["class_id"]
        
        # True positive: 1 if ground truth is in pred_set.
        TP = 1 if gt in pred_set else 0
        # False negative: 1 if ground truth is not predicted.
        FN = 0 if TP == 1 else 1
        # False positives: any extra predictions besides the ground truth.
        FP = (len(pred_set) - 1) if TP == 1 else len(pred_set)
        
        # Per-frame precision (purity): defined only if there's at least one prediction.
        if len(pred_set) > 0:
            precision = TP / (TP + FP)
        else:
            precision = np.nan  # or 0, depending on your preference
        # Per-frame recall: here it's either 1 (if predicted) or 0 (if not).
        recall = TP  # since FN is 1 when TP==0 and denominator is always 1.
        correct = True if TP == 1 else False
        
        return pd.Series({
            "predicted_class_set": pred_set,
            "correct": correct,
            "TP": TP,
            "FP": FP,
            "FN": FN,
            "precision": precision,
            "recall": recall
        })
    
    # Apply the per-row metric computation.
    metrics_df = merged_df.apply(compute_metrics, axis=1)
    merged_df = pd.concat([merged_df, metrics_df], axis=1)
    
    # Compute overall (micro-average) metrics.
    total_TP = merged_df["TP"].sum()
    total_FP = merged_df["FP"].sum()
    total_FN = merged_df["FN"].sum()
    
    micro_precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else np.nan
    micro_recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else np.nan
    micro_f1 = (2 * micro_precision * micro_recall / (micro_precision + micro_recall)
                if (micro_precision + micro_recall) > 0 else np.nan)
    
    # Frame-level accuracy: proportion of frames correctly predicted.
    frame_accuracy = merged_df["correct"].mean()
    
    # Average purity (average per-frame precision, ignoring frames with no predictions).
    avg_purity = merged_df["precision"].dropna().mean()
    
    metrics = {
        "frame_accuracy": frame_accuracy,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
        "average_purity": avg_purity
    }
        
    return merged_df, metrics

In [33]:

target_trial_recording = trial_recordings[0]
GROUNDING_DATASETS_PATH = Path("data/grounding_datasets")
FINAL_PREDICTIONS_PATH = Path("data/final_predictions")

pattern = r"grounding_dataset_k=(\d+)_samples=(\d+)"
for grounding_dataset_path in (GROUNDING_DATASETS_PATH / target_trial_recording.uuid).iterdir():
    match = re.search(pattern, grounding_dataset_path.stem)

    if match:
        k = int(match.group(1))
        num_samples = int(match.group(2))
    else:
        raise ValueError("Filename does not match the expected pattern.")
    
    grounding_df = pd.read_csv(grounding_dataset_path)
    predictions_df = naive_frame_predictions(grounding_df, confidence_threshold=0.5)

    predictions_path = FINAL_PREDICTIONS_PATH / target_trial_recording.uuid
    if predictions_path.exists():
        shutil.rmtree(predictions_path)
    predictions_path.mkdir(parents=True, exist_ok=True)

    predictions_df.to_csv(predictions_path / f"predictions_k={k}_samples={num_samples}.csv", index=False)

    gt_df = ground_truth_df[
        ground_truth_df["recording_uuid"] == target_trial_recording.uuid
    ].copy()

    merged_df, metrics = compare_predictions_to_ground_truth(
        predictions_df, gt_df, ignore_unknown=True
    )

    print(f"Metrics for {grounding_dataset_path.stem}:")
    for key, value in metrics.items():
        print(f"{key}: {value}")

    break

Metrics for grounding_dataset_k=100_samples=400:
frame_accuracy: 0.7994791666666666
micro_precision: 0.9109792284866469
micro_recall: 0.7994791666666666
micro_f1: 0.8515950069348127
average_purity: 0.9341692789968652


  object_predictions = frame_level_preds.groupby("object_id").apply(majority_vote_or_unknown).reset_index()


In [19]:
target_trial_recording.uuid

'2fe01600-c057-40ee-8434-4e9e0688ca2d'