In [None]:
import json
import shutil
import traceback
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import src.api.controllers.generate_embeddings as generate_embeddings
import torch
import torchvision.transforms.functional as F
from sqlalchemy.orm import Session
from src.config import GAZE_FOVEA_FOV, TOBII_FOV_X
from src.db import engine
from src.db.models import Recording, SimRoomClass
from src.api.controllers.gaze_segmentation import (
    get_gaze_points,
    match_frames_to_gaze,
    parse_gazedata_file,
    mask_was_viewed
)
from src.utils import cv2_video_fps, cv2_video_frame_count, cv2_video_resolution
from torchvision.ops import masks_to_boxes
from torchvision.transforms import InterpolationMode
from ultralytics import FastSAM
from tqdm import tqdm
from src.config import CHECKPOINTS_PATH

2025-04-06 22:35:30.821770: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 22:35:30.829524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743971730.839448  281740 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743971730.842884  281740 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 22:35:30.854622: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Read the experiment description; the lookups below only need the parsed dict,
# so the file handle is held just for the duration of the JSON parsing.
with open("experiment_metadata.json") as file:
    experiment_metadata = json.load(file)

# The keys of the per-trial metadata are used as the recording UUIDs to query for.
trial_recordings_metadata = experiment_metadata["trial_recordings_metadata"]
trial_recording_uuids = list(trial_recordings_metadata.keys())

# UUIDs of the two labeling recordings (same / different background).
labeling_same_background_uuid = experiment_metadata["labeling_same_background_uuid"]
labeling_diff_background_uuid = experiment_metadata["labeling_diff_background_uuid"]

# Fetch the Recording rows whose UUID is one of the trial recording UUIDs.
with Session(engine) as session:
    trial_recordings_query = session.query(Recording).filter(
        Recording.uuid.in_(trial_recording_uuids)
    )
    trial_recordings = trial_recordings_query.all()

# Segmenting and Tracking based on Gaze Data, and grounding based on previously built Vector Index

There are a few considerations that might be interesting in an experimental context:
1. Selection of `k` in top-k results from the database?
2. Segmentation quality (IoU? Confidence?)
3. Adding padding to the bounding boxes?
4. Indexing and search parameters? (Which ones exist?)
5. Merging of same-frame ROIs or not?
6. Importance of metrics (average, min, max, variance, ?)

In [3]:
class GazeSegmentationJob:
    """
    Gaze-driven segmentation and tracking job for a single recording.

    The job streams the video through FastSAM in tracking mode. For every frame
    that has a gaze point it keeps the tracked masks that are not too large and
    that ``mask_was_viewed`` reports as looked at. The kept bounding boxes,
    cropped RGB regions of interest (ROIs) and track ids are written to one
    ``<frame_idx>.npz`` file per frame inside ``results_path``.
    """

    def __init__(
        self,
        video_path: Path,
        gaze_data_path: Path,
        results_path: Path,
        fovea_fov: float = GAZE_FOVEA_FOV,
        fov_x: float = TOBII_FOV_X,
        checkpoint_path: str | Path = "checkpoints/FastSAM-x.pt",
        output_video_path: Path | None = None,
        max_mask_area_ratio: float = 0.1,
    ):
        """
        Prepare everything needed to process one recording.

        Args:
            video_path: Path to the recording's video file.
            gaze_data_path: Path to the gaze data file belonging to the video.
            results_path: Directory for the per-frame ``.npz`` results. Any
                existing directory at this path is deleted and recreated.
            fovea_fov: Field of view of the fovea, in the same unit as ``fov_x``.
            fov_x: Horizontal field of view of the camera.
            checkpoint_path: Location of the FastSAM checkpoint to load.
            output_video_path: Optional path of an output video. A writer is
                opened for it, but this class does not write frames to it; the
                writer is only released at the end of ``run``.
            max_mask_area_ratio: Fraction of the frame area at or above which a
                mask is considered too large (see ``mask_too_large``).
        """
        self.video_path = video_path
        self.gaze_data_path = gaze_data_path
        self.fovea_fov = fovea_fov
        self.fov_x = fov_x
        self.max_mask_area_ratio = max_mask_area_ratio

        # Start from an empty results directory so stale frames from a previous
        # run cannot be mixed with the new ones.
        self.results_path = results_path
        if self.results_path.exists():
            shutil.rmtree(self.results_path, ignore_errors=True)
        self.results_path.mkdir(parents=True, exist_ok=True)

        # Load the FastSAM model.
        self.model = FastSAM(checkpoint_path)

        # Video properties. ``resolution`` is indexed as (height, width):
        # index 1 is used as the width everywhere below.
        self.resolution = cv2_video_resolution(self.video_path)
        self.aspect_ratio = self.resolution[1] / self.resolution[0]  # W / H
        self.fps = cv2_video_fps(self.video_path)
        # Share of the horizontal FOV covered by the fovea, scaled to pixels.
        self.viewed_radius = int((self.fovea_fov / self.fov_x) * self.resolution[1])
        self.frame_count = cv2_video_frame_count(self.video_path)

        # Set up the output video (cv2 expects the frame size as (width, height)).
        if output_video_path is not None:
            self.video_result = cv2.VideoWriter(
                str(output_video_path),
                cv2.VideoWriter_fourcc(*"mp4v"),
                self.fps,
                (self.resolution[1], self.resolution[0]),
            )
        else:
            self.video_result = None

        # Parse gaze data.
        self.gaze_data = parse_gazedata_file(self.gaze_data_path)
        self.gaze_points = get_gaze_points(self.gaze_data, self.resolution)

        # Map frame indexes to gaze points.
        self.frame_gaze_mapping = match_frames_to_gaze(
            self.frame_count, self.gaze_points, self.fps
        )

    def get_gaze_position(self, frame_idx: int) -> tuple[int, int] | None:
        """
        Get the gaze position for a frame index.

        Args:
            frame_idx: Index of the frame in the video.

        Returns:
            The position of the first gaze point mapped to the frame, or None
            when no gaze point is mapped to it.
        """
        gaze_points = self.frame_gaze_mapping[frame_idx]
        if len(gaze_points) == 0:
            return None
        return gaze_points[0].position

    def mask_too_large(self, mask: torch.Tensor) -> bool:
        """
        Check whether a mask covers too large a share of the frame.

        Args:
            mask: A tensor containing a single mask of shape (H, W)

        Returns:
            bool: True if the mask's area is greater than or equal to
            ``max_mask_area_ratio`` (10% by default) of the frame area,
            False otherwise.
        """
        height, width = mask.shape
        frame_area = height * width
        max_mask_area = self.max_mask_area_ratio * frame_area

        # ``mask.sum()`` is a 0-dim tensor; convert the comparison so callers
        # get the plain bool promised by the signature.
        return bool(mask.sum() >= max_mask_area)

    def run(self):
        """
        Process the whole video and save the viewed objects of every frame.

        Frames without a gaze point or without tracker ids are skipped. An error
        while processing a frame is printed and the job continues with the next
        frame. Saving is offloaded to a thread pool; failed saves are reported
        once all of them have finished. The optional output video writer is
        released when the job ends, whether it succeeded or not.
        """
        # (frame_idx, future) pairs, kept so that save errors are not lost.
        pending_saves = []
        try:
            with ThreadPoolExecutor() as executor:
                frame_results = self.model.track(
                    source=str(self.video_path), imgsz=640, stream=True
                )
                for frame_idx, results in enumerate(frame_results):
                    try:
                        gaze_position = self.get_gaze_position(frame_idx)
                        if gaze_position is None:
                            continue

                        # ``boxes.id`` is None when no track ids are available
                        # for this frame. No object id could be stored then, so
                        # skip the frame instead of failing on ``id[0]`` below.
                        if results.boxes is None or results.boxes.id is None:
                            continue

                        # The original frame is shared by all detections.
                        frame = results.orig_img

                        boxes = []
                        rois = []
                        object_ids = []
                        for result in results:
                            mask = F.resize(
                                result.masks[0].data,
                                self.resolution,
                                interpolation=InterpolationMode.NEAREST,
                            ).squeeze()

                            if self.mask_too_large(mask) or not mask_was_viewed(
                                mask, gaze_position
                            ):
                                continue

                            box = masks_to_boxes(mask.unsqueeze(0)).int().cpu().numpy()[0]
                            x1, y1, x2, y2 = box
                            # masks_to_boxes returns the max x/y index of the
                            # mask, i.e. inclusive corners: add 1 so the crop
                            # keeps the last row/column and is never empty.
                            roi = frame[y1 : y2 + 1, x1 : x2 + 1, :]
                            roi = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
                            boxes.append(box)
                            rois.append(roi)
                            object_ids.append(int(result.boxes.id[0]))

                        if len(boxes) > 0:
                            # ROIs differ in shape, so store them in an object
                            # array filled element by element.
                            rois_array = np.empty(len(rois), dtype=object)
                            for i, roi in enumerate(rois):
                                rois_array[i] = roi

                            # Offload saving with thread pool (asynchronously)
                            future = executor.submit(
                                np.savez_compressed,
                                self.results_path / f"{frame_idx}.npz",
                                boxes=boxes,
                                rois=rois_array,
                                object_ids=object_ids,
                                frame_idx=frame_idx,
                                gaze_position=gaze_position,
                            )
                            pending_saves.append((frame_idx, future))

                    except Exception as e:
                        print(f"Error processing frame {frame_idx}: {e}")
                        traceback.print_exc()

            # Leaving the executor context waited for every save to finish;
            # surface the failures that would otherwise be silently discarded.
            for frame_idx, future in pending_saves:
                error = future.exception()
                if error is not None:
                    print(f"Error saving results for frame {frame_idx}: {error}")
        finally:
            # Release the writer so the output file handle is not leaked.
            if self.video_result is not None:
                self.video_result.release()

In [None]:
# Root folder for the gaze segmentation results; created up front if missing.
GAZE_SEGMENTATION_RESULTS_PATH = Path("data/gaze_segmentation_results")
GAZE_SEGMENTATION_RESULTS_PATH.mkdir(exist_ok=True, parents=True)

def process_recording(recording: Recording) -> None:
    """
    Run the gaze segmentation job for a single recording.

    Results are written to ``GAZE_SEGMENTATION_RESULTS_PATH / <recording uuid>``.
    ``GazeSegmentationJob`` deletes and recreates its results directory when it
    is constructed, so the directory is not reset a second time here.

    Args:
        recording: Recording whose ``uuid``, ``video_path`` and
            ``gaze_data_path`` attributes are read.
    """
    # ``str`` keeps the path join valid even if the uuid is not a plain string.
    results_path = GAZE_SEGMENTATION_RESULTS_PATH / str(recording.uuid)

    job = GazeSegmentationJob(
        video_path=Path(recording.video_path),
        gaze_data_path=Path(recording.gaze_data_path),
        results_path=results_path,
        fovea_fov=GAZE_FOVEA_FOV,
        fov_x=TOBII_FOV_X,
        checkpoint_path=CHECKPOINTS_PATH / "FastSAM-x.pt",
    )
    job.run()

# Process the trial recordings one after another, with an overall progress bar.
recordings_progress = tqdm(trial_recordings, desc="Processing recordings")
for recording in recordings_progress:
    process_recording(recording)

Processing recordings:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/FastSAM-x.pt to 'checkpoints/FastSAM-x.pt'...


100%|██████████| 138M/138M [00:04<00:00, 29.4MB/s]



video 1/1 (frame 1/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/2fe01600-c057-40ee-8434-4e9e0688ca2d.mp4: 384x640 92 objects, 41.8ms
video 1/1 (frame 2/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/2fe01600-c057-40ee-8434-4e9e0688ca2d.mp4: 384x640 91 objects, 11.5ms
video 1/1 (frame 3/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/2fe01600-c057-40ee-8434-4e9e0688ca2d.mp4: 384x640 90 objects, 11.2ms
video 1/1 (frame 4/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/2fe01600-c057-40ee-8434-4e9e0688ca2d.mp4: 384x640 90 objects, 11.7ms
video 1/1 (frame 5/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/2fe01600-c057-40ee-8434-4e9e0688ca2d.mp4: 384x640 89 objects, 11.2ms
video 1/1 (frame 6/2039) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data

Processing recordings:   7%|▋         | 1/14 [02:37<34:06, 157.43s/it]


video 1/1 (frame 1/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/32f02db7-adc0-4556-a2da-ed2ba60a58c9.mp4: 384x640 87 objects, 12.1ms
video 1/1 (frame 2/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/32f02db7-adc0-4556-a2da-ed2ba60a58c9.mp4: 384x640 85 objects, 11.2ms
video 1/1 (frame 3/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/32f02db7-adc0-4556-a2da-ed2ba60a58c9.mp4: 384x640 90 objects, 10.6ms
video 1/1 (frame 4/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/32f02db7-adc0-4556-a2da-ed2ba60a58c9.mp4: 384x640 92 objects, 11.4ms
video 1/1 (frame 5/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/32f02db7-adc0-4556-a2da-ed2ba60a58c9.mp4: 384x640 93 objects, 11.6ms
video 1/1 (frame 6/1365) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data

Processing recordings:  14%|█▍        | 2/14 [04:11<24:04, 120.36s/it]


video 1/1 (frame 1/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/5235be94-da01-43b5-8827-92a51d32ce30.mp4: 384x640 80 objects, 11.7ms
video 1/1 (frame 2/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/5235be94-da01-43b5-8827-92a51d32ce30.mp4: 384x640 80 objects, 12.4ms
video 1/1 (frame 3/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/5235be94-da01-43b5-8827-92a51d32ce30.mp4: 384x640 83 objects, 11.1ms
video 1/1 (frame 4/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/5235be94-da01-43b5-8827-92a51d32ce30.mp4: 384x640 85 objects, 14.3ms
video 1/1 (frame 5/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/5235be94-da01-43b5-8827-92a51d32ce30.mp4: 384x640 84 objects, 13.3ms
video 1/1 (frame 6/1368) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data

Processing recordings:  21%|██▏       | 3/14 [05:50<20:16, 110.57s/it]


video 1/1 (frame 1/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67823ccd-a1f0-4cde-b954-3b9e5fe160c1.mp4: 384x640 82 objects, 13.2ms
video 1/1 (frame 2/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67823ccd-a1f0-4cde-b954-3b9e5fe160c1.mp4: 384x640 81 objects, 12.1ms
video 1/1 (frame 3/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67823ccd-a1f0-4cde-b954-3b9e5fe160c1.mp4: 384x640 84 objects, 13.0ms
video 1/1 (frame 4/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67823ccd-a1f0-4cde-b954-3b9e5fe160c1.mp4: 384x640 86 objects, 12.6ms
video 1/1 (frame 5/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67823ccd-a1f0-4cde-b954-3b9e5fe160c1.mp4: 384x640 87 objects, 11.8ms
video 1/1 (frame 6/1552) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data

Processing recordings:  29%|██▊       | 4/14 [07:33<17:53, 107.31s/it]


video 1/1 (frame 1/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67b71a70-da64-467a-9fb6-91bc29265fd1.mp4: 384x640 93 objects, 10.7ms
video 1/1 (frame 2/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67b71a70-da64-467a-9fb6-91bc29265fd1.mp4: 384x640 92 objects, 10.0ms
video 1/1 (frame 3/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67b71a70-da64-467a-9fb6-91bc29265fd1.mp4: 384x640 93 objects, 10.0ms
video 1/1 (frame 4/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67b71a70-da64-467a-9fb6-91bc29265fd1.mp4: 384x640 91 objects, 15.7ms
video 1/1 (frame 5/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data/recordings/67b71a70-da64-467a-9fb6-91bc29265fd1.mp4: 384x640 91 objects, 10.5ms
video 1/1 (frame 6/2065) /home/zilian/projects/bachelorproef/experiments/controlled_experiment/data