In [1]:
%cd ../..

/home/zilian/projects/bachelorproef


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from pathlib import Path
import numpy as np
from src.db import engine
from sqlalchemy.orm import Session
from src.db.models import Recording, SimRoomClass
import matplotlib.pyplot as plt
from src.utils import extract_frames_to_dir
import src.api.jobs.generate_embeddings as generate_embeddings 
import time

2025-03-18 00:22:37.708666: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 00:22:37.715977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742253757.724485   89245 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742253757.727024   89245 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-18 00:22:37.736529: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
with Session(engine) as session:
    recording = Recording.get("39f5164f-873d-4d6b-be6b-e1d5db79c02a")
    if recording is None:
        raise ValueError("Recording not found")

    video_path = recording.video_path
    gaze_data_path = recording.gaze_data_path
    
class_id_to_name = SimRoomClass.get_id_to_name_map()

In [4]:
import concurrent.futures
import os
import shutil
import tempfile
import time
from pathlib import Path

import cv2
import numpy as np
import torch
import torchvision.transforms.functional as F
from src.config import GAZE_FOVEA_FOV, TOBII_FOV_X
from src.logic.glasses.gaze import (
    get_gaze_points,
    match_frames_to_gaze,
    parse_gazedata_file,
)
from src.utils import cv2_video_fps, cv2_video_resolution, cv2_video_frame_count
from torchvision.ops import masks_to_boxes
from tqdm import tqdm
from ultralytics import FastSAM
from ultralytics.engine.results import Results
from torchvision.transforms import InterpolationMode
import traceback
from src.aliases import UInt8Array


class GazeSegmentationJob:
    def __init__(
        self,
        video_path: Path,
        gaze_data_path: Path,
        results_path: Path,
        batch_size: int = 50,
        fovea_fov: float = GAZE_FOVEA_FOV,
        fov_x: float = TOBII_FOV_X,
        checkpoint_path: str = "checkpoints/FastSAM-x.pt",
        output_video_path: Path | None = None
    ):
        self.video_path = video_path
        self.gaze_data_path = gaze_data_path
        self.batch_size = batch_size
        self.fovea_fov = fovea_fov
        self.fov_x = fov_x

        # Set up the results directory.
        self.results_path = results_path
        if self.results_path.exists():
            shutil.rmtree(self.results_path, ignore_errors=True)
            self.results_path.mkdir(parents=True, exist_ok=True)
        self.results_path.mkdir(parents=True, exist_ok=True)

        # Load the FastSAM model.
        self.model = FastSAM(checkpoint_path)

        # Video properties.
        self.resolution = cv2_video_resolution(self.video_path)
        self.aspect_ratio = self.resolution[1] / self.resolution[0] # W / H
        self.fps = cv2_video_fps(self.video_path)
        self.viewed_radius = int((self.fovea_fov / self.fov_x) * self.resolution[1])
        self.frame_count = cv2_video_frame_count(self.video_path)

        # Set up the output video.
        if output_video_path is not None:
            self.video_result = cv2.VideoWriter(
                str(output_video_path),
                cv2.VideoWriter_fourcc(*"mp4v"),
                self.fps,
                (self.resolution[1], self.resolution[0]),
            )
        else:
            self.video_result = None
                

        # Parse gaze data.
        self.gaze_data = parse_gazedata_file(self.gaze_data_path)
        self.gaze_points = get_gaze_points(self.gaze_data, self.resolution)

        # Map frame indexes to gaze points.
        self.frame_gaze_mapping = match_frames_to_gaze(
            self.frame_count, self.gaze_points, self.fps
        )

    def get_gaze_position(self, frame_idx: int) -> tuple[int, int] | None:
        """
        Get the gaze position for a frame index.
        """
        gaze_points = self.frame_gaze_mapping[frame_idx]
        if len(gaze_points) == 0:
            return None
        return gaze_points[0].position

    def filter_large_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """
        Filter out masks with area greater than 30% of the frame area

        Args:
            masks: tensor containing masks of shape (N, H, W)
        """
        if len(masks) == 0:
            return masks

        _, height, width = masks.shape
        frame_area = height * width
        max_mask_area = 0.3 * frame_area

        mask_areas = masks.sum(dim=(1, 2))
        filtered_masks = masks[mask_areas <= max_mask_area]
        return filtered_masks

    def filter_viewed_masks(self, masks: torch.Tensor, gaze_position: tuple[float, float]) -> torch.Tensor:
        """
        Filter out masks that are not within the viewed radius of the gaze point.
        Masks should be resized to the size of the original frame before calling this function.

        Args:
            masks: tensor containing masks of shape (N, H, W)
            gaze_position: tuple containing the gaze position (x, y)
        """
        if len(masks) == 0:
            return masks

        sample_mask = masks[0]
        height, width = sample_mask.shape
        device = sample_mask.device

        # Create a circular mask centered at the gaze point.
        y = torch.arange(0, height, device=device).view(-1, 1).repeat(1, width)
        x = torch.arange(0, width, device=device).view(1, -1).repeat(height, 1)
        dist_sq = (x - gaze_position[0]) ** 2 + (y - gaze_position[1]) ** 2
        circular_mask = (
            (dist_sq <= self.viewed_radius**2).float().unsqueeze(0)
        )  # (1, H, W)

        # Apply the circular mask.
        masked_masks = masks * circular_mask
        mask_areas = masked_masks.sum(dim=(1, 2))
        return masks[mask_areas > 0]

    def run(self):
        for frame_idx, results in enumerate(self.model.track(source=str(self.video_path), imgsz=1056, stream=True)):
            continue
            # try:
            #     gaze_position = self.get_gaze_position(frame_idx)
            #     frame = results[0].orig_img

            #     if gaze_position is None:
            #         if self.video_result is not None:
            #             self.write_video_result(frame)
            #         continue
                
            #     masks = torch.stack([result.masks.data[0] for result in results])
            #     masks = F.resize(masks.data, self.resolution, interpolation=InterpolationMode.NEAREST)
            #     filtered_masks = self.filter_large_masks(masks)
            #     viewed_masks = self.filter_viewed_masks(filtered_masks, gaze_position)

            #     boxes = masks_to_boxes(viewed_masks).int().cpu().numpy()

            #     # Extract regions of interest (ROI) for each bounding box.
            #     rois = np.empty(len(boxes), dtype=object)
            #     for i, box in enumerate(boxes):
            #         x1, y1, x2, y2 = box
            #         roi = frame[y1:y2, x1:x2, :]
            #         rois[i] = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)

            #     if self.video_result is not None:
            #         self.write_video_result(frame, viewed_masks, boxes, gaze_position)

            #     np.savez_compressed(
            #         self.results_path / f"{frame_idx}.npz",
            #         boxes=boxes,
            #         rois=rois,
            #     )

            #     # After saving or processing the frame:
            #     # del masks, filtered_masks, viewed_masks, boxes, frame
            #     # del results  # if no longer needed
            #     torch.cuda.empty_cache()
            # except Exception as e:
            #     print(f"Error processing frame {frame_idx}: {e}")
            #     traceback.print_exc()  

        if self.video_result is not None:
            self.video_result.release()

        
    
    def write_video_result(
        self, 
        frame: UInt8Array, 
        masks: torch.Tensor | None = None,
        boxes: torch.Tensor | None = None,
        gaze_position: tuple[int, int] | None = None
    ):
        """
        Write the frame to the output video with the masks and bounding boxes overlayed.
        """
        if masks is not None:
            masks = masks.cpu().numpy()
            masks = (masks * 255).astype(np.uint8)

            for mask in masks:
                mask = cv2.cvtColor(mask.squeeze(), cv2.COLOR_GRAY2BGR)
                frame = cv2.addWeighted(frame, 1, mask, 0.1, 0)

        if boxes is not None:
            for box in boxes:
                x1, y1, x2, y2 = box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        if gaze_position is not None:
            cv2.circle(frame, gaze_position, 10, (255, 0, 0), -1)

        self.video_result.write(frame)
        


In [5]:
RESULTS_PATH = Path("notebooks/analysis-pipeline/data/fastsam_gaze_segmentation_results")
RESULTS_PATH.mkdir(exist_ok=True)

# if len(list(RESULTS_PATH.iterdir())) == 0:
gaze_sam_job = GazeSegmentationJob(
    video_path=video_path,
    gaze_data_path=gaze_data_path,
    results_path=RESULTS_PATH,
)

gaze_sam_job.run()

# RESULT_VIDEO_PATH = Path("notebooks/analysis-pipeline/data/gaze_output.mp4")
# if not RESULT_VIDEO_PATH.exists():
#     gaze_sam_job.create_video_from_results(RESULT_VIDEO_PATH)


video 1/1 (frame 1/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 135 objects, 41.4ms
video 1/1 (frame 2/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 134 objects, 15.8ms
video 1/1 (frame 3/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 137 objects, 15.3ms
video 1/1 (frame 4/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 138 objects, 15.4ms
video 1/1 (frame 5/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 141 objects, 15.6ms
video 1/1 (frame 6/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c02a.mp4: 608x1056 142 objects, 15.2ms
video 1/1 (frame 7/2544) /home/zilian/projects/bachelorproef/data/recordings/39f5164f-873d-4d6b-be6b-e1d5db79c0

In [6]:
gaze_segmentation_results = list(RESULTS_PATH.iterdir())
gaze_segmentation_results.sort(key=lambda x: int(x.stem))

rois_per_frame = []
boxes_per_frame = []

for result_path in gaze_segmentation_results:
    result = np.load(result_path, allow_pickle=True)
    rois_per_frame.append(result["rois"])
    boxes_per_frame.append(result["boxes"])

In [7]:
frame_idx = 1300
for i, rois in enumerate(rois_per_frame):
    if i != frame_idx:
        continue
    plt.imshow(rois[0])
    plt.axis("off")
    plt.title(f"Frame {i}")
    plt.show()
    break

In [8]:
dinov2 = generate_embeddings.load_model()
index = generate_embeddings.read_index(Path("notebooks/analysis-pipeline/data/embeddings.index"))

In [9]:
import numpy as np
import matplotlib.pyplot as plt

rois = rois_per_frame[frame_idx]
embeddings, _, _ = list(generate_embeddings.get_embeddings(dinov2, rois))[0]
distances, indices = generate_embeddings.search_index(index, embeddings, k=50)

for i, (roi, embedding) in enumerate(zip(rois, embeddings)):
    D, I = distances[i], indices[i]
    # Map class IDs to names
    class_names = [class_id_to_name[idx] for idx in I]
    
    # Group distances by class name
    class_to_distances = {}
    for class_idx, distance in zip(I, D):
        class_name = class_id_to_name[class_idx]
        class_to_distances.setdefault(class_name, []).append(distance)
    
    # Calculate metrics for each class
    avg_distances = {}
    min_distances = {}
    max_distances = {}
    var_distances = {}
    
    for cls, dists in class_to_distances.items():
        avg_distances[cls] = np.mean(dists)
        min_distances[cls] = np.min(dists)
        max_distances[cls] = np.max(dists)
        var_distances[cls] = np.var(dists)

    print(var_distances)
    
    # Determine the top class based on average distance
    sorted_classes = sorted(avg_distances.items(), key=lambda x: x[1])
    
    # Create one figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))
    
    # Left subplot: Display the ROI with the top class in the title
    ax1.imshow(roi)
    ax1.set_title(f"ROI {i} - Top class: {sorted_classes[0][0]} (avg: {sorted_classes[0][1]:.4f})")
    ax1.axis('off')
    
    # Right subplot: Grouped bar chart for various distance metrics per class
    classes = list(avg_distances.keys())
    x = np.arange(len(classes))
    width = 0.2  # width of each bar
    
    # Extract metric values for each class
    avg_vals = [avg_distances[cls] for cls in classes]
    min_vals = [min_distances[cls] for cls in classes]
    max_vals = [max_distances[cls] for cls in classes]
    var_vals = [var_distances[cls] for cls in classes]
    
    # Create grouped bars
    ax2.bar(x - 1.5 * width, avg_vals, width, label='Avg')
    ax2.bar(x - 0.5 * width, min_vals, width, label='Min')
    ax2.bar(x + 0.5 * width, max_vals, width, label='Max')
    ax2.bar(x + 1.5 * width, var_vals, width, label='Variance')
    
    ax2.set_xticks(x)
    ax2.set_xticklabels(classes, rotation=45)
    ax2.set_xlabel("Class Name")
    ax2.set_ylabel("Distance")
    ax2.set_title(f"ROI {i} - Distance Metrics per Class")
    ax2.legend()
    
    plt.tight_layout()
    plt.show()


IndexError: list index out of range