In [22]:
# Colab-only: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import sys
# Put the project folder on the import path so modules stored there can be imported.
sys.path.append('/content/drive/MyDrive/Deep_Learning/Final Project')

In [24]:
# Define the root directory path for the project in Google Drive
# Update this path to match your own Google Drive directory structure
# NOTE(review): the same directory is also written out literally in the sys.path.append
# and %cd cells; keep all three in sync when changing it.
PROJECT_PATH = "/content/drive/MyDrive/Deep_Learning/Final Project"

In [25]:
!pip install motmetrics -q
!pip install ipynb



In [26]:
# Work from the project folder so that relative paths used later (e.g. './out/') resolve inside it.
%cd /content/drive/MyDrive/Deep_Learning/Final Project/

/content/drive/MyDrive/Deep_Learning/Final Project


In [27]:
# Import required libraries for deep learning, computer vision, and tracking
import configparser
import glob
import os
import random
from pathlib import Path
from random import randint

import cv2
import matplotlib.pyplot as plt
import motmetrics as mm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.io import read_image, write_jpeg, write_video
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.utils import draw_bounding_boxes
from tqdm import tqdm

# NumPy 2.x compatibility shim: the motmetrics library still needs np.asfarray,
# which is no longer available there. Install a float-casting stand-in only when
# the attribute is missing, so older NumPy versions keep their own implementation.
if not hasattr(np, "asfarray"):
    def _asfarray_fallback(a):
        """Return `a` converted to a float ndarray."""
        return np.asarray(a, dtype=float)

    np.asfarray = _asfarray_fallback

In [28]:
"""
Siamese Network for person re-identification and similarity computation.
This network takes pairs of images and generates embeddings that can be compared
using cosine similarity to determine if they represent the same person.
Input: RGB images of size 3x128x64 (CxHxW)
"""
class Siamese_Network(torch.nn.Module):
    """
    Convolutional embedding network used for person re-identification.

    Three conv + max-pool stages reduce a 3x128x64 (CxHxW) image to a 128x14x6
    feature map, which two fully connected layers map to a 256-dimensional embedding.
    Accepts either a single image [3, 128, 64] or a batch [B, 3, 128, 64].
    """
    def __init__(self):
        super(Siamese_Network, self).__init__()
        # Convolutional layers: progressively increase channels while reducing spatial dimensions
        self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3)
        self.conv2 = torch.nn.Conv2d(64, 128, kernel_size=3)
        self.conv3 = torch.nn.Conv2d(128, 128, kernel_size=3)
        # Fully connected layers: map flattened features (128 channels x 14 x 6) to a 256-d embedding
        self.fc1 = torch.nn.Linear(128*14*6, 256)
        self.fc2 = torch.nn.Linear(256, 256)

    def forward_one(self, x):
        """
        Process one image (or one batch of images) through the network.

        Args:
            x: Tensor of shape [3, 128, 64] or [B, 3, 128, 64]

        Returns:
            Embedding of shape [256] for a single image, or [B, 256] for a batch
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        # Flatten only the trailing (C, H, W) dims. A plain torch.flatten(x) would also
        # merge the batch dimension and break fc1 for batched input.
        x = torch.flatten(x, start_dim=-3)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def forward(self, x1, x2):
        """Process a pair of images (or batches) and return their two embeddings"""
        return self.forward_one(x1), self.forward_one(x2)

In [29]:
"""
Gallery Dataset for Siamese Network Training
Creates positive pairs (same person) and negative pairs (different persons) from a gallery of images.
Images are expected to be named with format: {person_id}_{other_info}.jpg
"""
class Gallery(torch.utils.data.Dataset):
    """
    Pair dataset for Siamese training built from a flat directory of '.jpg' person crops.

    The person ID is the part of the file name before the first underscore.
    Each item is (img1, img2, label) with label 1.0 for the same person and 0.0 otherwise.
    """
    # Class-wide counter of pairs that failed to load (shared across all instances)
    errorCount = 0

    def __init__(self, path, transform=None, max_pairs_per_id=50, max_neg_pairs_per_id=50):
        """
        Args:
            path: Directory containing person images
            transform: Optional callable applied to each image tensor of a pair
            max_pairs_per_id: Maximum positive pairs to generate per person ID
            max_neg_pairs_per_id: Maximum negative pairs to generate per person ID
        """
        self.path = path
        self.imgs = sorted([x for x in os.listdir(path) if x.endswith('.jpg')])
        self.transform = transform
        self.max_pairs_per_id = max_pairs_per_id
        self.max_neg_pairs_per_id = max_neg_pairs_per_id
        self.pairs = []
        self.labels = []
        self._create_pairs()

    def _create_pairs(self):
        """Generate positive and negative image pairs and store them in self.pairs / self.labels"""
        # Group images by person ID (extracted from filename prefix)
        person_images = {}
        for img_name in self.imgs:
            person_images.setdefault(img_name.split('_')[0], []).append(img_name)

        all_person_ids = list(person_images.keys())

        for person_id, images in person_images.items():
            # Positive pairs: all unordered combinations of images from the same person,
            # randomly subsampled to limit dataset size
            positive_pairs = [(images[i], images[j]) for i in range(len(images)) for j in range(i + 1, len(images))]
            positive_pairs = random.sample(positive_pairs, min(len(positive_pairs), self.max_pairs_per_id))
            self.pairs.extend(positive_pairs)
            self.labels.extend([1] * len(positive_pairs))

            # Negative pairs: this person's first image against the first image of
            # a random subset of the other persons
            other_person_ids = [pid for pid in all_person_ids if pid != person_id]
            sampled_ids = random.sample(other_person_ids, min(len(other_person_ids), self.max_neg_pairs_per_id))
            negative_pairs = [(images[0], person_images[other_id][0]) for other_id in sampled_ids]
            self.pairs.extend(negative_pairs)
            self.labels.extend([0] * len(negative_pairs))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Load and return a pair of images with their label (1=same person, 0=different)"""
        img1_name, img2_name = self.pairs[idx]
        img1_path = os.path.join(self.path, img1_name)
        img2_path = os.path.join(self.path, img2_name)

        try:
            # Load images and normalize to [0, 1] range
            img1 = read_image(img1_path).float() / 255.0
            img2 = read_image(img2_path).float() / 255.0
        except (RuntimeError, OSError) as e:
            # Deliberate best-effort: a corrupted or missing image yields an all-zero pair
            # instead of aborting training. The placeholder is 3x128x64.
            # NOTE(review): assumes gallery crops are already 128x64 - confirm.
            Gallery.errorCount += 1
            print(f"Error#: {Gallery.errorCount}. Error loading {img1_path} or {img2_path}: {e}")
            img1 = torch.zeros(3, 128, 64)
            img2 = torch.zeros(3, 128, 64)

        # The transform was previously stored but never used; apply it to both images
        if self.transform is not None:
            img1 = self.transform(img1)
            img2 = self.transform(img2)

        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        return img1, img2, label

In [30]:
"""
MOT16 Training Dataset
Loads MOT16 training sequences with ground truth annotations for object detection fine-tuning.
Each sequence contains images and corresponding bounding boxes with person IDs.
"""
class MOT16TrainDataset(torch.utils.data.Dataset):
    """
    Detection dataset over all MOT16 training sequences found under `root`.

    Each item is (image, target), where target holds 'boxes' (x1, y1, x2, y2),
    'labels' (all 1 = person) and 'person_ids' for that frame.

    The column-index constants FRAME_COL, ID_COL, LEFT_COL, TOP_COL, WIDTH_COL,
    HEIGHT_COL and PERSON_COL are read from module scope and are not defined in
    this class. They must exist before the dataset is instantiated.
    """
    def __init__(self, root):
        """
        Args:
            root: Root directory containing MOT16 training sequences (MOT16-02, MOT16-04, etc.)
        """
        self.root = root
        self.imgs = []
        self.targets = []

        # sorted(): os.listdir order is arbitrary, so sort for a reproducible sample order
        for subdir in sorted(os.listdir(root)):
            img_dir = os.path.join(root, subdir, "img1")
            if not os.path.isdir(img_dir):
                # Skip stray files / folders that are not sequence directories
                continue

            # Collect all frame images from the sequence (paths relative to root)
            frame_files = sorted(os.listdir(img_dir))
            self.imgs += [os.path.join(subdir, "img1", filename) for filename in frame_files]

            # Load ground truth annotations (format: frame, id, x, y, w, h, conf, ...)
            gt = np.genfromtxt(os.path.join(root, subdir, "gt", "gt.txt"), delimiter=',', dtype=int)
            # A file with a single annotation row is parsed as 1-D; keep it 2-D for column indexing
            gt = np.atleast_2d(gt)

            # Filter to only include person detections (class ID = 1)
            gt = gt[gt[:, PERSON_COL] == 1, :]

            # Convert (left, top, width, height) to corner format (x1, y1, x2, y2).
            # Image coordinates grow downwards, so 'top' is the smaller y value (y1).
            x1 = gt[:, LEFT_COL]
            y1 = gt[:, TOP_COL]
            x2 = x1 + gt[:, WIDTH_COL]
            y2 = y1 + gt[:, HEIGHT_COL]
            boxes = np.column_stack((x1, y1, x2, y2))
            person_ids = gt[:, ID_COL]
            frame_numbers = gt[:, FRAME_COL]

            # Create one target dictionary per frame.
            # The i-th sorted image is taken to be frame i+1 (frames are 1-indexed).
            for i in range(len(frame_files)):
                mask = (frame_numbers == i + 1)
                frame_boxes = boxes[mask, :]
                d = {}
                d['boxes'] = torch.tensor(frame_boxes, dtype=torch.float)
                # One label per box in THIS frame. The previous code sized this by the
                # whole-sequence mask, so labels and boxes had different lengths.
                d['labels'] = torch.ones((len(frame_boxes),), dtype=torch.int64)  # All are person class
                d['person_ids'] = torch.tensor(person_ids[mask], dtype=torch.int64)
                self.targets.append(d)

        print('Length of dataset: ', len(self.imgs))

    def __getitem__(self, idx):
        """
        Load image and corresponding annotations for a given index.

        Raises:
            IndexError: if idx is outside the dataset. Previously an index beyond the
                end returned (None, None), which callers cannot batch.
        """
        if idx >= len(self.imgs) or idx < -len(self.imgs):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self.imgs)}")

        img_path = os.path.join(self.root, self.imgs[idx])
        # Normalize image to [0, 1] range
        img = torch.div(read_image(img_path).float(), 255.0)

        return img, self.targets[idx]

    def __len__(self):
        return len(self.imgs)

In [31]:
"""
MOT16 Test Dataset
Loads MOT16 test sequence images without ground truth annotations.
Used for inference and tracking evaluation.
"""
class MOT16TestDataset(torch.utils.data.Dataset):
    """
    Image-only dataset over the frames in `<root>/img1`, sorted by file name.
    Each item is a float image tensor scaled to [0, 1].
    """
    def __init__(self, root):
        """
        Args:
            root: Directory containing a single MOT16 test sequence (e.g., MOT16-03)
        """
        self.root = root
        self.imgs = list(sorted(os.listdir(os.path.join(root, "img1"))))

    def __getitem__(self, idx):
        """
        Load and return a single test image.

        Raises:
            IndexError: if idx is outside the dataset. Plain `for img in dataset`
                iteration relies on this to terminate. Previously an index beyond
                the end returned a (None, None) tuple instead of an image.
        """
        if idx >= len(self.imgs) or idx < -len(self.imgs):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self.imgs)}")

        img_path = os.path.join(self.root, "img1", self.imgs[idx])
        # Normalize image to [0, 1] range
        img = torch.div(read_image(img_path).float(), 255.0)

        return img

    def __len__(self):
        return len(self.imgs)

In [32]:
"""
Load and configure Faster R-CNN detector model for person detection.
The model is initialized with torchvision's default (COCO-pretrained) weights and then fine-tuned for person detection.
"""
def get_detector_model(load_weights=True):
    """
    Build the Faster R-CNN (ResNet-50 FPN) person detector.

    Reads the module-level NUM_CLASSES and model_param_path at call time.

    Args:
        load_weights: If True, load fine-tuned weights from model_param_path when that
            file exists. If it does not exist, a warning is printed and the model is
            returned with only the pretrained backbone and a freshly initialised head.

    Returns:
        Configured Faster R-CNN model ready for inference or training (on CPU)
    """
    # Start from torchvision's default pretrained weights (COCO-trained per the torchvision docs)
    model = fasterrcnn_resnet50_fpn(weights='DEFAULT')

    # Replace the classification head to match our number of classes (background + person)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)

    # Load fine-tuned weights if available
    if load_weights:
        if os.path.exists(model_param_path):
            # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine
            state_dict = torch.load(model_param_path, map_location='cpu', weights_only=True)
            model.load_state_dict(state_dict)
        else:
            # Keep the original best-effort behaviour, but no longer skip silently
            print(f"Warning: detector weights not found at {model_param_path}; "
                  "returning model without fine-tuned weights")

    return model

In [None]:
!unzip -q f"{PROJECT_PATH}/MOT16.zip" -d /content/MOT16_data/

In [33]:
# Configuration: Define file paths and tracking parameters
# All paths are derived from PROJECT_PATH, which must already be defined.
# Model paths
similarity_model_path = f'{PROJECT_PATH}/siamese_network.pth'  # Siamese network for person re-identification
# model_param_path is read as a global by get_detector_model() and by main()
model_param_path = f'{PROJECT_PATH}/models_FasterRCNN/bbox_detector.pth'  # Faster R-CNN detector weights

# Data paths
# NOTE(review): the unzip cell extracts the archive to /content/MOT16_data/, while
# sequence_dir points below PROJECT_PATH/MOT16 - confirm which copy is intended.
sequence_dir = f'{PROJECT_PATH}/MOT16/test/MOT16-03'  # MOT16 test sequence directory
gt_txt_path = os.path.join(sequence_dir, "gt/gt.txt")  # Ground truth annotations (if available)

# Output paths
output_txt_path = f'{PROJECT_PATH}/results_FasterRCNN/FasterRCNN_tracker_results.txt'  # Tracking results in MOT format
video_path = f'{PROJECT_PATH}/results_FasterRCNN/tracked_test_video.mp4'  # Visualized tracking video
csv_path = f'{PROJECT_PATH}/results_FasterRCNN/FasterRCNN_tracker_metrics.csv'  # Evaluation metrics

# Detection and tracking parameters
# BBOX_SCORE_THRESH is applied to detector scores in main()
BBOX_SCORE_THRESH = 0.7  # Minimum confidence threshold for bounding box detections
# NUM_CLASSES sizes the replacement box predictor in get_detector_model()
NUM_CLASSES = 2  # Number of classes: background (0) and person (1)

In [34]:
# Visualization Functions: Generate annotated video from tracking results

def get_sequence_info(sequence_dir):
    """
    Read frame rate and resolution from the sequence's seqinfo.ini.

    Args:
        sequence_dir: Path to MOT16 sequence directory

    Returns:
        fps: Frames per second, or None if the entry (or the file) is missing
        (width, height): Video resolution tuple; each entry is None when missing
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() ignores a missing file; every lookup then falls back to None
    parser.read(os.path.join(sequence_dir, "seqinfo.ini"))

    def _sequence_int(option):
        """Integer value of `option` in the [Sequence] section, or None when absent."""
        return parser.getint("Sequence", option, fallback=None)

    frame_rate = _sequence_int("frameRate")
    resolution = (_sequence_int("imWidth"), _sequence_int("imHeight"))
    return frame_rate, resolution

def visualize_sequence(sequence_dir, results_file, output_video_path, default_fps=30):
    """
    Create an annotated video showing tracked persons with colored bounding boxes and IDs.
    Each track ID gets its own color, identical across runs.

    Args:
        sequence_dir: Path to MOT16 sequence directory containing images
        results_file: Path to tracking results file (MOT format: frame,id,x,y,w,h,conf,...)
        output_video_path: Path where the output video will be saved
        default_fps: Frame rate used when seqinfo.ini does not provide one

    Raises:
        FileNotFoundError: if no '.jpg' frames are found under <sequence_dir>/img1
        ValueError: if the first frame cannot be decoded
    """
    result_columns = ["frame","id","x","y","w","h","conf","x3","y3","z3"]

    # Load tracking results; an empty file (no tracks at all) still yields a plain video
    try:
        results_df = pd.read_csv(results_file, header=None)
        results_df.columns = result_columns
    except pd.errors.EmptyDataError:
        results_df = pd.DataFrame(columns=result_columns)

    # Get all frame images from the sequence
    image_dir = os.path.join(sequence_dir, "img1")
    image_paths = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
    if not image_paths:
        raise FileNotFoundError(f"No .jpg frames found in {image_dir}")

    # Use the actual image dimensions for the writer (seqinfo.ini only supplies the fps)
    first_img = cv2.imread(image_paths[0])
    if first_img is None:
        raise ValueError(f"Could not read first frame: {image_paths[0]}")
    frame_size = (first_img.shape[1], first_img.shape[0])

    fps, _ = get_sequence_info(sequence_dir)
    if not fps:
        # get_sequence_info returns None when seqinfo.ini is missing or incomplete
        print(f"Warning: no frame rate found for {sequence_dir}; using {default_fps} fps")
        fps = default_fps

    # Fixed-seed color palette, one color per track ID. A private RandomState gives
    # the same values as seeding the global RNG with 42, without reseeding it.
    if results_df.empty:
        colors = np.empty((0, 3), dtype=np.uint8)
    else:
        max_id = int(results_df["id"].max()) + 1
        colors = np.random.RandomState(42).randint(0, 255, size=(max_id, 3), dtype=np.uint8)

    # Index the detections by frame once instead of scanning the table for every frame
    rows_by_frame = {int(frame): rows for frame, rows in results_df.groupby("frame")}

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
    try:
        # Annotate each frame with bounding boxes and track IDs
        for img_path in image_paths:
            # Frame number is the numeric file name (e.g. '000001.jpg' -> 1)
            frame_num = int(os.path.splitext(os.path.basename(img_path))[0])
            img = cv2.imread(img_path)
            if img is None:
                continue

            frame_rows = rows_by_frame.get(frame_num)
            if frame_rows is not None:
                for row in frame_rows.itertuples(index=False):
                    track_id = int(row.id)
                    color = tuple(map(int, colors[track_id]))
                    x, y, w_box, h_box = int(row.x), int(row.y), int(row.w), int(row.h)
                    # Draw bounding box
                    cv2.rectangle(img, (x, y), (x + w_box, y + h_box), color, 2)
                    # Draw track ID label above the box
                    cv2.putText(img, str(track_id), (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            video_writer.write(img)
    finally:
        # Always finalize the container, even if annotation fails midway
        video_writer.release()

    print(f"Video saved to {output_video_path}")

In [35]:
# Evaluation Metrics: Compute MOTChallenge standard metrics for tracking performance

def evaluate_mot_sequence(gt_txt_path, output_txt_path, csv_path, max_iou=0.5):
    """
    Evaluate tracking performance using MOTChallenge standard metrics.
    Computes metrics such as MOTA, MOTP, IDF1, and other tracking statistics,
    writes them to `csv_path` and prints them.

    Args:
        gt_txt_path: Path to ground truth tracking file (MOT format)
        output_txt_path: Path to tracker output file (MOT format)
        csv_path: Path to save evaluation metrics CSV
        max_iou: Maximum IoU *distance* (1 - IoU) at which a ground-truth box and a
            tracker box may still be paired (default: 0.5, i.e. IoU >= 0.5)

    Returns:
        pandas DataFrame with the MOTChallenge metrics summary
    """
    columns = ["frame","id","x","y","w","h","conf"]

    # Load both files and keep the standard MOT columns: frame, id, x, y, w, h, conf
    gt = pd.read_csv(gt_txt_path, header=None).iloc[:, :7]
    tr = pd.read_csv(output_txt_path, header=None).iloc[:, :7]
    gt.columns = columns
    tr.columns = columns

    # Filter to only include valid detections (confidence > 0)
    # In MOT format, conf > 0 indicates valid evaluation boxes
    gt = gt[gt["conf"] > 0].copy()
    tr = tr[tr["conf"] > 0].copy()

    # Initialize MOTAccumulator for computing tracking metrics
    acc = mm.MOTAccumulator(auto_id=True)

    # Visit every frame that has ground truth OR tracker output. Iterating over
    # ground-truth frames only would hide false positives on frames without annotations.
    frames = sorted(set(gt["frame"].unique()) | set(tr["frame"].unique()))
    for f in frames:
        gt_frame = gt[gt["frame"]==f]
        tr_frame = tr[tr["frame"]==f]

        gt_ids = gt_frame["id"].tolist()
        tr_ids = tr_frame["id"].tolist()

        gt_boxes = gt_frame[["x","y","w","h"]].values
        tr_boxes = tr_frame[["x","y","w","h"]].values

        # iou_matrix already returns the distance (1 - IoU), with NaN where the
        # distance exceeds max_iou. It must be passed on unchanged: subtracting it
        # from 1 again would turn the best overlaps into the largest "distances".
        distances = mm.distances.iou_matrix(gt_boxes, tr_boxes, max_iou=max_iou)
        acc.update(gt_ids, tr_ids, distances)

    # Compute MOTChallenge standard metrics
    mh = mm.metrics.create()
    summary = mh.compute(acc, metrics=mm.metrics.motchallenge_metrics)
    summary.to_csv(csv_path)
    print(f"[OK] Metrics saved to {csv_path}")
    print(summary)
    return summary

In [36]:
# Image Preprocessing: Resize cropped person images to fixed size for Siamese network

def resize_image(image):
    """
    Bring a person crop to the fixed Siamese-network input size via adaptive average pooling.
    Whatever the crop's original height and width, the result is always 128x64.

    Args:
        image: Input image tensor of shape [C, H, W] (typically a cropped person bounding box)

    Returns:
        Resized image tensor of shape [C, 128, 64]
    """
    target_height, target_width = 128, 64
    pooled = F.adaptive_avg_pool2d(image, (target_height, target_width))
    return pooled


In [37]:
from torchvision import transforms
from PIL import Image, ImageDraw, ImageFont

def main():
    """
    Main tracking pipeline: Multi-object tracking using Faster R-CNN detection and Siamese network re-identification.

    The pipeline processes video frames sequentially:
    1. Detect persons in each frame using Faster R-CNN
    2. Extract embeddings for each detection using Siamese network
    3. Associate detections to existing tracks using cosine similarity
    4. Update tracks or create new ones for unmatched detections
    5. Remove stale tracks that haven't been seen recently
    6. Save results in MOT format and generate visualization

    Reads the module-level configuration (model_param_path, similarity_model_path,
    sequence_dir, output_txt_path, video_path, csv_path, gt_txt_path, BBOX_SCORE_THRESH).
    Annotated frames are written to './out/' relative to the current working directory.
    """
    # Initialize device (GPU if available, else CPU)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Detector: build the architecture, then load the fine-tuned weights explicitly so
    # that a missing checkpoint fails loudly here instead of being skipped.
    bbox_detector = get_detector_model(load_weights=False)
    bbox_detector.load_state_dict(
        torch.load(model_param_path, map_location=device, weights_only=True)
    )
    bbox_detector.to(device)
    bbox_detector.eval()

    # Re-identification network, wrapped so single crops can be embedded
    similarity_model = Siamese_Network().to(device)
    similarity_model.load_state_dict(
        torch.load(similarity_model_path, map_location=device, weights_only=True)
    )
    similarity_model.eval()
    similarity_model = Siamese_Network_Extended(similarity_model)

    img_set = MOT16TestDataset(sequence_dir)

    # Tracking state management
    next_id = 0
    tracked_persons = {}  # {id: {"embeddings": [last N frame embeddings], "color": (R,G,B), "last_seen": frame_idx}}
    MAX_EMBEDDING_HISTORY = 5  # Keep embeddings from last N frames
    MAX_FRAMES_MISSING = 30    # Max frames allowed missing
    SIMILARITY_THRESHOLD = 0.6 # Threshold below which it is considered a new target

    # Make sure both output locations exist before anything is written
    results_dir = os.path.dirname(output_txt_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    os.makedirs('./out', exist_ok=True)

    # `with` closes the results file even if a frame fails.
    # no_grad(): pure inference - without it every stored embedding keeps its autograd
    # graph alive, which wastes memory and time.
    with open(output_txt_path, "w") as output_file, torch.no_grad():
        # Index explicitly rather than iterating the dataset, so the loop length does
        # not depend on the dataset raising IndexError past its end.
        for i in tqdm(range(len(img_set)), desc="Processing frames", unit="frame"):
            img = img_set[i]
            img_device = img.to(device)

            # Step 1: Object Detection - Detect all persons in the current frame
            bbox_pred = bbox_detector([img_device])[0]
            scores = bbox_pred["scores"]
            boxes = bbox_pred["boxes"]
            # Filter detections by confidence threshold to remove low-confidence detections
            mask = (scores >= BBOX_SCORE_THRESH)
            boxes = boxes[mask].type(torch.int32)
            scores = scores[mask]

            # Step 2: Extract embeddings for each detected person
            # For each detection, crop the person region, resize it, and compute embedding
            current_detections = []
            for box, score in zip(boxes, scores):
                x1, y1, x2, y2 = box.tolist()
                # Validate bounding box dimensions (must have positive width and height)
                if x2 - x1 > 0 and y2 - y1 > 0:
                    # Crop person region and resize to standard size (128x64)
                    cropped = resize_image(img_device[:, y1:y2, x1:x2])
                    # Generate embedding using Siamese network for person re-identification
                    embedding = similarity_model.get_embedding(cropped)
                    current_detections.append(
                        {
                            "box": box,
                            "embedding": embedding,
                            "score": float(score.item()),  # Detection confidence score
                        }
                    )

            # Step 3: Data Association - Match detections to existing tracks
            # Greedy, in track order: each track claims its most similar free detection.
            unmatched_detections = list(range(len(current_detections)))
            matched_tracks = {}

            for person_id, person_data in list(tracked_persons.items()):
                if not unmatched_detections:
                    # No detections left. The remaining tracks are simply not seen in this
                    # frame; their last_seen must stay untouched so that they can age out
                    # (the old code reset it to i - 1, which kept stale tracks alive).
                    break

                best_detection_idx = None
                best_similarity = -float('inf')

                # Use multiple recent embeddings (templates) for robust matching
                # This helps handle appearance changes over time
                templates = torch.stack(person_data["embeddings"])

                # Evaluate similarity between track and each unmatched detection
                for det_idx in unmatched_detections:
                    detection = current_detections[det_idx]

                    # Average cosine similarity to all templates, computed in one call
                    avg_similarity = F.cosine_similarity(
                        detection["embedding"].unsqueeze(0), templates, dim=1
                    ).mean().item()

                    # Track the best match
                    if avg_similarity > best_similarity:
                        best_similarity = avg_similarity
                        best_detection_idx = det_idx

                # Associate detection to track if similarity exceeds threshold
                if best_similarity > SIMILARITY_THRESHOLD and best_detection_idx is not None:
                    matched_tracks[person_id] = {
                        "detection_idx": best_detection_idx,
                        "similarity": best_similarity
                    }
                    unmatched_detections.remove(best_detection_idx)

            # Step 4: Update matched tracks with new detections
            for person_id, match_info in matched_tracks.items():
                detection = current_detections[match_info["detection_idx"]]
                person_data = tracked_persons[person_id]

                # Update embedding history: add new embedding, maintain sliding window
                person_data["embeddings"].append(detection["embedding"])
                if len(person_data["embeddings"]) > MAX_EMBEDDING_HISTORY:
                    person_data["embeddings"].pop(0)  # Remove oldest embedding

                # Update track state: current frame, position, and detection confidence
                person_data["last_seen"] = i
                person_data["current_box"] = detection["box"]
                person_data["score"] = detection.get("score", 1.0)

            # Step 5: Initialize new tracks for unmatched detections
            # Unmatched detections are assumed to be new persons entering the scene
            for det_idx in unmatched_detections:
                detection = current_detections[det_idx]
                tracked_persons[next_id] = {
                    "embeddings": [detection["embedding"]],  # Initialize with first embedding
                    "color": (randint(0, 255), randint(0, 255), randint(0, 255)),  # Random color for visualization
                    "last_seen": i,
                    "current_box": detection["box"],
                    "score": detection.get("score", 1.0),
                }
                next_id += 1

            # Step 6: Remove stale tracks that haven't been seen for too long
            # This handles cases where persons leave the scene or are occluded
            for person_id in list(tracked_persons.keys()):
                if i - tracked_persons[person_id]["last_seen"] > MAX_FRAMES_MISSING:
                    del tracked_persons[person_id]

            # Step 7: Save tracking results and visualize current frame
            annotations = []
            for person_id, person_data in tracked_persons.items():
                # Only process tracks that were seen in the current frame
                if person_data["last_seen"] == i:
                    box = person_data["current_box"]
                    x1, y1, x2, y2 = box.tolist()
                    w = x2 - x1
                    h = y2 - y1
                    conf = float(person_data.get("score", 1.0))

                    # Write tracking result in MOT format: frame,id,x,y,w,h,conf,x3,y3,z3
                    # Frame numbers are 1-indexed in MOT format (i+1)
                    frame_idx_for_txt = i + 1
                    output_file.write(
                        f"{frame_idx_for_txt},{person_id},{x1},{y1},{w},{h},{conf:.4f},1,1,1\n"
                    )

                    # Collect annotations for visualization
                    annotations.append((box, person_data["color"], str(person_id)))

            # Draw bounding boxes and track IDs on the frame
            if annotations:
                all_boxes = torch.stack([ann[0] for ann in annotations])
                all_colors = [ann[1] for ann in annotations]
                all_labels = [ann[2] for ann in annotations]

                annotated_img = draw_bounding_boxes_with_labels(
                    img,
                    all_boxes.cpu(),
                    labels=all_labels,
                    colors=all_colors,
                    width=4,
                )
            else:
                annotated_img = img

            # Save annotated frame image (scale [0, 1] floats back to 8-bit)
            write_jpeg(torch.mul(annotated_img, 255.0).to(torch.uint8), f'./out/{i:05d}.jpg')

    # The results file is closed here
    print(f"Tracking results saved to: {output_txt_path}")

    # Generate visualization video with tracked bounding boxes
    visualize_sequence(sequence_dir, output_txt_path, video_path)

    # Compute evaluation metrics if ground truth is available
    if os.path.exists(gt_txt_path):
        evaluate_mot_sequence(gt_txt_path, output_txt_path, csv_path, max_iou=0.5)
    else:
        print("No ground truth available - this is a test sequence without annotations")

# Visualization Utility: Draw bounding boxes with text labels on images

def draw_bounding_boxes_with_labels(image, boxes, labels=None, colors=None, width=1):
    """
    Enhanced bounding box drawing function that adds text labels above each box.
    This extends torchvision's draw_bounding_boxes to include track ID labels.

    Args:
        image: Input image tensor [C, H, W] in range [0, 1]
        boxes: Bounding box coordinates [N, 4] in (x1, y1, x2, y2) format
        labels: List of text labels [N] (typically track IDs)
        colors: List of RGB color tuples [N] for each bounding box
        width: Line width for bounding box borders

    Returns:
        Image tensor with bounding boxes and labels drawn. When labels are given the
        image passes through PIL and ToTensor; with labels=None the result of
        draw_bounding_boxes is returned directly.
    """
    image_with_boxes = draw_bounding_boxes(image, boxes, colors=colors, width=width)

    if labels is None:
        return image_with_boxes

    # Text is drawn with PIL, so convert to a PIL image and back at the end
    image_pil = transforms.ToPILImage()(image_with_boxes)
    draw = ImageDraw.Draw(image_pil)

    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        # Font file not available on this machine: fall back to PIL's built-in font
        font = ImageFont.load_default()

    for i, box in enumerate(boxes):
        x1, y1, x2, y2 = box.tolist()
        color = colors[i] if colors is not None else (255, 255, 255)
        text = labels[i]

        # Measure the label; textsize() is only used where textbbox() does not exist
        if hasattr(draw, 'textbbox'):
            bbox = draw.textbbox((0, 0), text, font=font)
            text_w = bbox[2] - bbox[0]
            text_h = bbox[3] - bbox[1]
        else:
            text_w, text_h = draw.textsize(text, font=font)

        # Place the label directly above the box, but never above the image's top edge.
        # Without the clamp, labels of boxes touching the top border are not visible.
        label_top = max(y1 - text_h - 2, 0)
        label_bottom = label_top + text_h + 2

        draw.rectangle([(x1, label_top), (x1 + text_w, label_bottom)], fill=color)
        draw.text((x1, label_top), text, fill=(0, 0, 0), font=font)

    return transforms.ToTensor()(image_pil)

# Hypothetical extension of Siamese network to add get_embedding()
# Siamese Network Extension: Wrapper to enable single-image embedding extraction
# The original Siamese network requires pairs of images, but for tracking we need
# to extract embeddings from individual detections. This wrapper enables that functionality.

class Siamese_Network_Extended(nn.Module):
    """
    Wrapper around a pair-based Siamese model that adds single-image embedding extraction.
    The wrapped model's pair interface stays available through forward().
    """
    def __init__(self, original_model):
        """
        Args:
            original_model: Module whose forward(x1, x2) returns a pair of embeddings.
                An optional `encoder` attribute or `forward_one` method is used for
                single-image embedding when present.
        """
        super(Siamese_Network_Extended, self).__init__()
        self.original_model = original_model
        # Direct encoder of the wrapped model, if it exposes one (None otherwise)
        self.encoder = getattr(original_model, 'encoder', None)

    def forward(self, x1, x2):
        """Standard forward pass with image pairs"""
        return self.original_model(x1, x2)

    def get_embedding(self, x):
        """
        Extract embedding for a single image.

        Tries, in order: the wrapped model's `encoder`, its `forward_one` method, and
        finally a pair forward pass with an all-zero dummy as second image.

        Args:
            x: Single image tensor [C, H, W]

        Returns:
            Embedding vector for the input image
        """
        # Compare against None explicitly: a module's truthiness says nothing about
        # whether it is present (e.g. an empty container module is falsy).
        if self.encoder is not None:
            return self.encoder(x)

        # Single-branch method available: same result as the pair call below, without
        # a second, wasted forward pass on a dummy image.
        forward_one = getattr(self.original_model, 'forward_one', None)
        if callable(forward_one):
            return forward_one(x)

        # Last resort: use dummy second image to satisfy network's pair requirement
        dummy = torch.zeros_like(x)
        out_x, _ = self.original_model(x, dummy)
        return out_x

# Main part

In [38]:
# Main Execution: Run the complete tracking pipeline
# This cell executes the full tracking workflow: detection, association, and evaluation

import os
# Create output directory for annotated frame images
# NOTE(review): main() writes its frames to './out/' relative to the working directory;
# that is this same folder only while the working directory is PROJECT_PATH (see the %cd cell).
os.makedirs(f"{PROJECT_PATH}/out/", exist_ok=True)

# Run the full pipeline: tracking, video rendering and (if ground truth exists) evaluation
if __name__ == "__main__":
    main()


Processing frames: 100%|██████████| 1500/1500 [42:54<00:00,  1.72s/frame]


Tracking results saved to: /content/drive/MyDrive/Deep_Learning/Final Project/results_FasterRCNN/FasterRCNN_tracker_results.txt
Video saved to /content/drive/MyDrive/Deep_Learning/Final Project/results_FasterRCNN/tracked_test_video.mp4
No ground truth, testing sequence
