<a href="https://colab.research.google.com/github/jewoolee0502/ComputerVision/blob/main/ECSE415_A5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ECSE 415: Introduction to Computer Vision
###### Jewoo Lee - 260910789
###### Anthony Bonta - 261053688

## Assignment 5: Video Analysis

### Library Requirements

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Installs
!pip install -q ultralytics
!pip install -q ultralytics deep-sort-realtime
!pip install -q kaggle

# Imports
import os
import cv2
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import csv

from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from scipy.optimize import linear_sum_assignment

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hCreating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


#### Path

In [2]:
path = '/content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/'

object_tracking_root = os.path.join(path, "Object_Tracking")

### 1. Data Preparation

In [3]:
task1_images_dir = os.path.join(object_tracking_root, "Task1", "images")
task1_video_path = os.path.join(object_tracking_root, "task1_input.mp4")

print("Task1 images dir:", task1_images_dir)
print("Output video path:", task1_video_path)

Task1 images dir: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task1/images
Output video path: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1_input.mp4


In [4]:
FPS = 14 # given fps value

def images_to_video(images_dir, output_path, fps):
  # all images are .jpg
  image_files = sorted(glob.glob(os.path.join(images_dir, "*.jpg")))
  print(f"Found {len(image_files)} images in {images_dir}")

  if len(image_files) == 0:
    raise RuntimeError(f"No .jpg image files found in {images_dir}")

  # read the first image and get its dimensions
  first_img = cv2.imread(image_files[0])
  height, width = first_img.shape[:2]
  frame_size = (width, height)
  print(f"Target frame size: {frame_size}")

  # set up the video writer
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
  writer = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

  if not writer.isOpened():
    raise RuntimeError(f"VideoWriter could not be opened for {output_path}")

  # write all images as frames
  for idx, img_path in enumerate(image_files):
    frame = cv2.imread(img_path)

    if frame is None:
      print(f"Skipping unreadable image: {img_path}")
      continue

    # resize every frame matches the first image's size
    frame = cv2.resize(frame, frame_size)
    writer.write(frame)

  writer.release()
  print(f"Video saved to: {output_path}")

images_to_video(task1_images_dir, task1_video_path, FPS)

Found 429 images in /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task1/images
Target frame size: (1920, 1080)
Video saved to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1_input.mp4


### 2. Model Implementation

In [5]:
output_video_path = os.path.join(object_tracking_root, "task1.mp4")
output_txt_path = os.path.join(object_tracking_root, "task1.txt")

print("Output video path:", output_video_path)
print("Output text path:", output_txt_path)

def draw_and_log_box(frame, frame_idx, track, txt_file, color=(0, 0, 255)):
  track_id = int(track.track_id)
  x1, y1, x2, y2 = track.to_ltrb()

  x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
  w = x2 - x1
  h = y2 - y1

  # draw bounding box
  cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

  # draw label slightly above the box
  label_pos = (x1, max(0, y1 - 8))
  cv2.putText(
      frame,
      f"ID: {track_id}",
      label_pos,
      cv2.FONT_HERSHEY_SIMPLEX,
      0.5,
      color,
      2
  )

  # write tracking line
  if txt_file is not None:
    txt_file.write(f"{frame_idx},{track_id},{x1},{y1},{w},{h}\n")

Output video path: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1.mp4
Output text path: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1.txt


In [6]:
conf_threshold = 0.3

yolo_model = YOLO("yolov8s.pt")
tracker = DeepSort(max_age=30, n_init=3, max_iou_distance=0.8)

cap = cv2.VideoCapture(task1_video_path) # open the input video
if not cap.isOpened():
  raise RuntimeError("Cannot open video")

# output video dimensions
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fps_val = cap.get(cv2.CAP_PROP_FPS)
if fps_val <= 0:
  fps_val = FPS

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(output_video_path, fourcc, fps_val, (w, h))

txt_file = open(output_txt_path, "w")

tracks_memory = []
frame_idx = 1

while True:
  ok, frame = cap.read()
  if not ok:
    break

  # run the YOLO model
  det = yolo_model(frame, conf=conf_threshold, verbose=False)[0]

  # convert YOLO detections into DeepSORT format
  det_list = []
  if det.boxes is not None:
    for b in det.boxes:
      cls_id = int(b.cls[0])
      conf = float(b.conf[0])

      if cls_id != 0:
        continue

      x1, y1, x2, y2 = b.xyxy[0].tolist()
      det_list.append(([x1, y1, x2 - x1, y2 - y1], conf, "person"))

  # track
  tracks = tracker.update_tracks(det_list, frame=frame)

  # draw & log only confirmed tracks
  for trk in tracks:
    if not trk.is_confirmed():
      continue

    draw_and_log_box(frame, frame_idx, trk, txt_file)

    # saving in memory
    x1, y1, x2, y2 = trk.to_ltrb()
    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
    tracks_memory.append((frame_idx, trk.track_id, x1, y1, x2 - x1, y2 - y1))

  writer.write(frame)

  if frame_idx % 50 == 0:
    print("Processed Frames:", frame_idx)

  frame_idx += 1

cap.release()
writer.release()
txt_file.close()

print("\nCompleted!")
print("Video saved to:", output_video_path)
print("Text file saved to:", output_txt_path)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt': 100% ━━━━━━━━━━━━ 21.5MB 311.2MB/s 0.1s
Processed Frames: 50
Processed Frames: 100
Processed Frames: 150
Processed Frames: 200
Processed Frames: 250
Processed Frames: 300
Processed Frames: 350
Processed Frames: 400

Completed!
Video saved to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1.mp4
Text file saved to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1.txt


### 3. Model Evaluation

In [7]:
def bbox_iou(box_a, box_b):
  ax, ay, aw, ah = box_a
  bx, by, bw, bh = box_b

  # convert to corner coordinates
  ax2, ay2 = ax + aw, ay + ah
  bx2, by2 = bx + bw, by + bh

  # find intersection rectangle coordinates
  ix1 = max(ax, bx)
  iy1 = max(ay, by)
  ix2 = min(ax2, bx2)
  iy2 = min(ay2, by2)

  # calculate intersection area
  iw = max(0.0, ix2 - ix1)
  ih = max(0.0, iy2 - iy1)
  inter = iw * ih

  if inter <= 0:
    return 0.0

  # calculate union area
  area_a = aw * ah
  area_b = bw * bh
  union = area_a + area_b - inter

  if union <= 0:
    return 0.0

  # iou ratio
  return inter / union

In [8]:
def load_tracks_file(file_path):
  tracks = {}

  with open(file_path, "r") as f:
    for line in f:
      parts = line.strip().split(",")
      if len(parts) < 6:
        continue

      # parse CSV fields
      frame_idx = int(float(parts[0]))
      obj_id = int(float(parts[1]))
      x = float(parts[2])
      y = float(parts[3])
      w = float(parts[4])
      h = float(parts[5])

      # store
      box = (x, y, w, h)
      if frame_idx not in tracks:
        tracks[frame_idx] = []

      tracks[frame_idx].append((obj_id, box))

  return tracks

In [9]:
def compute_mota(gt_tracks, pred_tracks, iou_threshold=0.5):
  # initialize
  total_fn = 0
  total_fp = 0
  total_idsw = 0
  total_gt = 0

  last_track_for_gt = {}   # gt_id -> pred_id

  all_frames = sorted(set(gt_tracks.keys()) | set(pred_tracks.keys()))

  for frame in all_frames:
    gt_list = gt_tracks.get(frame, [])
    pred_list = pred_tracks.get(frame, [])

    num_gt = len(gt_list)
    num_pred = len(pred_list)
    total_gt += num_gt

    # edge cases
    if num_gt == 0 and num_pred == 0:
      continue
    if num_gt == 0:
      total_fp += num_pred
      continue
    if num_pred == 0:
      total_fn += num_gt
      continue

    # iou matrix: rows = GT, cols = predictions
    iou_mat = np.zeros((num_gt, num_pred), dtype=float)
    for gi, (gt_id, gt_box) in enumerate(gt_list):
      for pj, (pr_id, pr_box) in enumerate(pred_list):
        iou_mat[gi, pj] = bbox_iou(gt_box, pr_box)

    # Hungarian algorithm to find optimal GT-prediction matching
    cost_mat = 1.0 - iou_mat
    row_ind, col_ind = linear_sum_assignment(cost_mat)

    matched_gt_idx = set()
    matched_pred_idx = set()
    gt_to_pred_idx = {}

    # accept only pairs with iou >= threshold
    for r, c in zip(row_ind, col_ind):
      if iou_mat[r, c] >= iou_threshold:
        matched_gt_idx.add(r)
        matched_pred_idx.add(c)
        gt_to_pred_idx[r] = c

    # false negatives: GT not matched
    total_fn += (num_gt - len(matched_gt_idx))

    # false positives: predictions not matched
    total_fp += (num_pred - len(matched_pred_idx))

    # identity switches
    for gi, pi in gt_to_pred_idx.items():
      gt_id, _ = gt_list[gi]
      pr_id, _ = pred_list[pi]

      prev_pr_id = last_track_for_gt.get(gt_id)
      if prev_pr_id is not None and prev_pr_id != pr_id:
        total_idsw += 1

      last_track_for_gt[gt_id] = pr_id

  # mota calculation
  if total_gt == 0:
    mota = 0.0
  else:
    mota = 1.0 - (total_fn + total_fp + total_idsw) / total_gt

  return mota, total_fn, total_fp, total_idsw, total_gt

In [10]:
# paths
gt_path = os.path.join(object_tracking_root, "Task1", "gt", "gt.txt")
pred_path = output_txt_path

print("GT file:", gt_path)
print("Pred file:", pred_path)

# load data
gt_data = load_tracks_file(gt_path)
pred_data = load_tracks_file(pred_path)

# compute MOTA
mota, FN, FP, IDSW, GT_total = compute_mota(gt_data, pred_data, iou_threshold=0.5)

print("\n=== MOTA evaluation for Task 1 ===")
print("Total GT objects:", GT_total)
print("False Negatives:", FN)
print("False Positives:", FP)
print("ID Switches:", IDSW)
print(f"MOTA: {mota:.4f}")

GT file: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task1/gt/gt.txt
Pred file: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task1.txt

=== MOTA evaluation for Task 1 ===
Total GT objects: 26647
False Negatives: 17228
False Positives: 2011
ID Switches: 242
MOTA: 0.2689


### 4. Prediction & Kaggle Competition

In [8]:
# Task2 image folder and input video path
task2_images_dir = os.path.join(object_tracking_root, "Task2", "images")
task2_input_video_path = os.path.join(object_tracking_root, "task2_input.mp4")

print("Task2 images dir :", task2_images_dir)
print("Task2 input video:", task2_input_video_path)

# use the same helper from Part 1
images_to_video(task2_images_dir, task2_input_video_path, FPS)

Task2 images dir : /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task2/images
Task2 input video: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2_input.mp4
Found 1050 images in /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task2/images
Target frame size: (1920, 1080)
Video saved to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2_input.mp4


In [12]:
# output paths for Task2
task2_output_video = os.path.join(object_tracking_root, "task2.mp4")
task2_counts_csv   = os.path.join(object_tracking_root, "group50_object_tracking.csv")

print("Task2 annotated video:", task2_output_video)
print("Task2 counts csv:", task2_counts_csv)

conf_threshold = 0.1

# if you want, you can reuse existing yolo_model / tracker,
# or re-create them here (same as Part 2):
yolo_model = YOLO("yolov8l.pt")
tracker = DeepSort(max_age=30, n_init=3, max_iou_distance=0.7)

cap2 = cv2.VideoCapture(task2_input_video_path)
if not cap2.isOpened():
    raise RuntimeError("Cannot open Task2 input video")

w2 = int(cap2.get(cv2.CAP_PROP_FRAME_WIDTH))
h2 = int(cap2.get(cv2.CAP_PROP_FRAME_HEIGHT))

fps2 = cap2.get(cv2.CAP_PROP_FPS)
if fps2 <= 0:
    fps2 = FPS

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer2 = cv2.VideoWriter(task2_output_video, fourcc, fps2, (w2, h2))

frame_idx = 1
frame_counts = []   # list of (frame_number, count)

while True:
    ok, frame = cap2.read()
    if not ok:
        break

    # YOLO detection
    det = yolo_model(frame, conf=conf_threshold, verbose=False)[0]

    det_list = []
    if det.boxes is not None:
        for b in det.boxes:
            cls_id = int(b.cls[0])
            conf   = float(b.conf[0])
            if cls_id != 0:     # keep only "person"
                continue

            x1, y1, x2, y2 = b.xyxy[0].tolist()
            det_list.append(([x1, y1, x2 - x1, y2 - y1], conf, "person"))

    # DeepSORT tracking
    tracks = tracker.update_tracks(det_list, frame=frame)

    # draw & count confirmed tracks
    people_in_frame = 0
    for trk in tracks:
        if not trk.is_confirmed():
            continue

        draw_and_log_box(frame, frame_idx, trk, txt_file=None)  # we won't log to txt here
        people_in_frame += 1

    # store count for this frame
    frame_counts.append((frame_idx, people_in_frame))

    writer2.write(frame)

    if frame_idx % 50 == 0:
        print(f"Task2 – processed frame {frame_idx}, count = {people_in_frame}")

    frame_idx += 1

cap2.release()
writer2.release()

print("Finished Task2 tracking.")
print("Annotated Task2 video saved at:", task2_output_video)

Task2 annotated video: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2.mp4
Task2 counts csv: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/group50_object_tracking.csv
Task2 – processed frame 50, count = 35
Task2 – processed frame 100, count = 34
Task2 – processed frame 150, count = 33
Task2 – processed frame 200, count = 28
Task2 – processed frame 250, count = 32
Task2 – processed frame 300, count = 34
Task2 – processed frame 350, count = 38
Task2 – processed frame 400, count = 35
Task2 – processed frame 450, count = 33
Task2 – processed frame 500, count = 33
Task2 – processed frame 550, count = 38
Task2 – processed frame 600, count = 34
Task2 – processed frame 650, count = 33
Task2 – processed frame 700, count = 28
Task2 – processed frame 750, count = 25
Task2 – processed frame 800, count = 24
Task2 – processed frame 850, count = 28
Task2 – processed frame 900, count = 31
Task2 – processed frame 950, count = 28
Task2 – processe

In [15]:
with open(task2_counts_csv, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Number", "Count"])
    for frame_num, cnt in frame_counts:
        writer.writerow([frame_num, cnt])

print("Saved Task2 pedestrian counts to:", task2_counts_csv)

Saved Task2 pedestrian counts to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/group50_object_tracking.csv


In [13]:
# # trial (scale added)

# # output paths for Task2
# task2_output_video = os.path.join(object_tracking_root, "task2.mp4")
# task2_counts_csv   = os.path.join(object_tracking_root, "group50_object_tracking.csv")

# print("Task2 annotated video:", task2_output_video)
# print("Task2 counts csv     :", task2_counts_csv)

# conf_threshold = 0.05          # slightly lower for crowded scenes

# # models
# yolo_model = YOLO("yolov8l.pt")
# tracker = DeepSort(max_age=30, n_init=3, max_iou_distance=0.7)

# cap2 = cv2.VideoCapture(task2_input_video_path)
# if not cap2.isOpened():
#     raise RuntimeError("Cannot open Task2 input video")

# w2 = int(cap2.get(cv2.CAP_PROP_FRAME_WIDTH))
# h2 = int(cap2.get(cv2.CAP_PROP_FRAME_HEIGHT))

# fps2 = cap2.get(cv2.CAP_PROP_FPS)
# if fps2 <= 0:
#     fps2 = FPS

# fourcc = cv2.VideoWriter_fourcc(*"mp4v")
# writer2 = cv2.VideoWriter(task2_output_video, fourcc, fps2, (w2, h2))

# frame_idx = 1
# frame_counts = []   # list of (frame_number, count)

# scale = 2  # upscale factor for YOLO

# while True:
#     ok, frame = cap2.read()
#     if not ok:
#         break

#     # ----------------------------------------------------
#     # 1) YOLO detection on an upscaled frame
#     # ----------------------------------------------------
#     big_frame = cv2.resize(frame, None, fx=scale, fy=scale)

#     det = yolo_model(
#         big_frame,
#         conf=conf_threshold,
#         imgsz=1280,
#         iou=0.45,
#         verbose=False
#     )[0]

#     det_list = []
#     person_det_count = 0

#     if det.boxes is not None:
#         for b in det.boxes:
#             cls_id = int(b.cls[0])
#             conf   = float(b.conf[0])
#             if cls_id != 0:      # keep only "person"
#                 continue

#             x1, y1, x2, y2 = b.xyxy[0].tolist()

#             # map back to original frame coordinates
#             x1 /= scale
#             y1 /= scale
#             x2 /= scale
#             y2 /= scale

#             det_list.append(([x1, y1, x2 - x1, y2 - y1], conf, "person"))
#             person_det_count += 1

#     # *** COUNT FOR KAGGLE: number of YOLO person detections ***
#     people_in_frame = person_det_count
#     frame_counts.append((frame_idx, people_in_frame))

#     # ----------------------------------------------------
#     # 2) DeepSORT tracking (for visualization only)
#     # ----------------------------------------------------
#     tracks = tracker.update_tracks(det_list, frame=frame)

#     for trk in tracks:
#         # draw only tracks that were updated this frame
#         if getattr(trk, "time_since_update", 0) != 0:
#             continue
#         draw_and_log_box(frame, frame_idx, trk, txt_file=None)

#     writer2.write(frame)

#     if frame_idx % 50 == 0:
#         print(f"Task2 – processed frame {frame_idx}, YOLO count = {people_in_frame}")

#     frame_idx += 1

# cap2.release()
# writer2.release()

# print("Finished Task2 tracking.")
# print("Annotated Task2 video saved at:", task2_output_video)

# # write CSV
# with open(task2_counts_csv, "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Number", "Count"])
#     for frame_num, cnt in frame_counts:
#         writer.writerow([frame_num, cnt])

# print("Saved Task2 pedestrian counts to:", task2_counts_csv)
# print("Total frames:", len(frame_counts))


In [18]:
# trial 2 (scale and division just for left corner)

# Task2 images and input video
task2_images_dir       = os.path.join(object_tracking_root, "Task2", "images")
task2_input_video_path = os.path.join(object_tracking_root, "task2_input.mp4")

print("Task2 images dir :", task2_images_dir)
print("Task2 input video:", task2_input_video_path)

# Build the Task2 input video from images (reuse your Part 1 helper)
images_to_video(task2_images_dir, task2_input_video_path, FPS)

# Output paths for Task2
task2_output_video = os.path.join(object_tracking_root, "task2.mp4")
task2_counts_csv   = os.path.join(object_tracking_root, "group50_object_tracking.csv")

print("Task2 annotated video:", task2_output_video)
print("Task2 counts csv     :", task2_counts_csv)

# ----------------------------------------------------------------------
# YOLO + DeepSORT setup
# ----------------------------------------------------------------------
conf_threshold = 0.3        # low for high recall; you can tweak to 0.07 or 0.1
tile_scale     = 2.0         # how much to zoom each tile

yolo_model = YOLO("yolov8l.pt")
tracker    = DeepSort(max_age=30, n_init=3, max_iou_distance=0.7)

# ----------------------------------------------------------------------
# NMS helpers (to merge detections from 4 tiles)
# ----------------------------------------------------------------------
def iou_xyxy(a, b):
    """
    IoU between two boxes in (x1, y1, x2, y2) format.
    """
    xa1, ya1, xa2, ya2 = a
    xb1, yb1, xb2, yb2 = b

    ix1 = max(xa1, xb1)
    iy1 = max(ya1, yb1)
    ix2 = min(xa2, xb2)
    iy2 = min(ya2, yb2)

    iw = max(0.0, ix2 - ix1)
    ih = max(0.0, iy2 - iy1)
    inter = iw * ih

    if inter <= 0:
        return 0.0

    area_a = max(0.0, xa2 - xa1) * max(0.0, ya2 - ya1)
    area_b = max(0.0, xb2 - xb1) * max(0.0, yb2 - yb1)
    return inter / (area_a + area_b - inter + 1e-6)

def nms_boxes(boxes, iou_thr=0.6):
    """
    Simple NMS over a list of boxes.

    boxes: list of (x1, y1, x2, y2, conf)
    returns: filtered list of same format
    """
    if not boxes:
        return []

    # sort by confidence (high to low)
    boxes_sorted = sorted(boxes, key=lambda b: b[4], reverse=True)
    kept = []

    for box in boxes_sorted:
        x1, y1, x2, y2, conf = box
        keep = True
        for kb in kept:
            if iou_xyxy((x1, y1, x2, y2), kb[:4]) > iou_thr:
                keep = False
                break
        if keep:
            kept.append(box)

    return kept

# ----------------------------------------------------------------------
# Video I/O
# ----------------------------------------------------------------------
cap2 = cv2.VideoCapture(task2_input_video_path)
if not cap2.isOpened():
    raise RuntimeError("Cannot open Task2 input video")

w2 = int(cap2.get(cv2.CAP_PROP_FRAME_WIDTH))
h2 = int(cap2.get(cv2.CAP_PROP_FRAME_HEIGHT))

fps2 = cap2.get(cv2.CAP_PROP_FPS)
if fps2 <= 0:
    fps2 = FPS

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer2 = cv2.VideoWriter(task2_output_video, fourcc, fps2, (w2, h2))

# ----------------------------------------------------------------------
# Main loop: tile → detect → merge → track → count
# ----------------------------------------------------------------------
frame_idx     = 1
frame_counts  = []   # list of (frame_number, count)

while True:
    ok, frame = cap2.read()
    if not ok:
        break

    H, W = frame.shape[:2]
    mid_x = W // 2
    mid_y = H // 2

    # 4 tiles: TL, TR, BL, BR
    tiles = [
        (0,      0,      mid_x, mid_y),  # top-left
        (mid_x,  0,      W,     mid_y),  # top-right
        (0,      mid_y,  mid_x, H),      # bottom-left
        (mid_x,  mid_y,  W,     H),      # bottom-right
    ]

    all_boxes = []   # will store (x1, y1, x2, y2, conf) in full-frame coords

    # 1) Run YOLO on each upscaled tile
    for (tx1, ty1, tx2, ty2) in tiles:
        tile = frame[ty1:ty2, tx1:tx2]
        if tile.size == 0:
            continue

        # upscale tile
        tile_big = cv2.resize(tile, None, fx=tile_scale, fy=tile_scale)

        # YOLO on tile
        det_tile = yolo_model(
            tile_big,
            conf=conf_threshold,
            imgsz=1280,
            iou=0.5,
            verbose=False
        )[0]

        if det_tile.boxes is None:
            continue

        for b in det_tile.boxes:
            cls_id = int(b.cls[0])
            conf   = float(b.conf[0])
            if cls_id != 0:       # keep only 'person'
                continue

            bx1, by1, bx2, by2 = b.xyxy[0].tolist()

            # scale back to original frame coordinates and add tile offset
            x1 = bx1 / tile_scale + tx1
            y1 = by1 / tile_scale + ty1
            x2 = bx2 / tile_scale + tx1
            y2 = by2 / tile_scale + ty1

            all_boxes.append((x1, y1, x2, y2, conf))

    # 2) NMS across tiles to avoid double-counting near boundaries
    final_boxes = nms_boxes(all_boxes, iou_thr=0.6)

    # 3) Build DeepSORT detections from merged boxes & count people
    det_list = []
    for (x1, y1, x2, y2, conf) in final_boxes:
        det_list.append(([x1, y1, x2 - x1, y2 - y1], conf, "person"))

    # Kaggle count = number of merged detections
    people_in_frame = len(final_boxes)
    frame_counts.append((frame_idx, people_in_frame))

    # 4) DeepSORT tracking for visualization only (IDs on video)
    tracks = tracker.update_tracks(det_list, frame=frame)

    for trk in tracks:
        if not trk.is_confirmed():
            continue
        # reuse your helper; no logging here, only drawing
        draw_and_log_box(frame, frame_idx, trk, txt_file=None)

    writer2.write(frame)

    if frame_idx % 50 == 0:
        print(f"Task2 – processed frame {frame_idx}, count = {people_in_frame}")

    frame_idx += 1

cap2.release()
writer2.release()

print("Finished Task2 tracking.")
print("Annotated Task2 video saved at:", task2_output_video)

# ----------------------------------------------------------------------
# Write Kaggle CSV: Number,Count
# ----------------------------------------------------------------------
with open(task2_counts_csv, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Number", "Count"])
    for frame_num, cnt in frame_counts:
        writer.writerow([frame_num, cnt])

print("Saved Task2 pedestrian counts to:", task2_counts_csv)
print("Total frames:", len(frame_counts))

Task2 images dir : /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task2/images
Task2 input video: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2_input.mp4
Found 1050 images in /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/Task2/images
Target frame size: (1920, 1080)
Video saved to: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2_input.mp4
Task2 annotated video: /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/task2.mp4
Task2 counts csv     : /content/drive/MyDrive/McGill/2025/Fall 2025/ECSE 415/A5/Object_Tracking/group50_object_tracking.csv
Task2 – processed frame 50, count = 38
Task2 – processed frame 100, count = 37
Task2 – processed frame 150, count = 36
Task2 – processed frame 200, count = 35
Task2 – processed frame 250, count = 42
Task2 – processed frame 300, count = 48
Task2 – processed frame 350, count = 51
Task2 – processed frame 400, count