In [2]:
import cv2
import os
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import torch

from pathlib import Path
import sklearn
import skimage
from skimage import __version__ as skimage_version
from sklearn import __version__ as sklearn_version


path = "/content/"
debug = True

from google.colab import drive
drive.mount(path+'/drive/')

if debug == True:
    A5_savepath = path+'/drive/MyDrive/03 McGill EE Semester 5 (Fall 2025)/ECSE 415 (Clark)/A5/'

print(f"Python version:           {os.sys.version.split()[0]}")
print(f"OpenCV version:           {cv2.__version__}")
print(f"NumPy version:            {np.__version__}")
print(f"Matplotlib version:       {matplotlib.__version__}")
print(f"PyTorch version:          {torch.__version__}")
print(f"scikit-image version:     {skimage_version}")
print(f"scikit-learn version:     {sklearn_version}")
print("Path: " + path)


ModuleNotFoundError: No module named 'google.colab'

In [None]:
if debug == False:
    os.chdir(path)
    !pip install kaggle
    !mkdir -p ~/.kaggle
    !cp /content/drive/MyDrive/Kaggle_API/kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle competitions download -c ecse-415-video-analysis
    !unzip -q ecse-415-video-analysis.zip -d .

In [None]:
# ===== PART 1: Data Preparation (10 points) =====
# Convert Task1 images to video at 14 FPS and save as task1_input.mp4

task1_dir = Path(path, "Tracking", "Task1")
csv1_path = Path(task1_dir, "gt", "gt.txt")
ground_truth_task1 = np.loadtxt(csv1_path, delimiter=",")

images1_path = Path(task1_dir, "images")

# Get all image files and sort them by filename to ensure correct order
image_files = sorted([f for f in os.listdir(images1_path) if f.endswith(('.jpg', '.png', '.jpeg'))])

print(f"Found {len(image_files)} images in {images1_path}")

if len(image_files) == 0:
    print("ERROR: No images found!")
else:
    # Read first image to get dimensions
    first_img_path = Path(images1_path, image_files[0])
    first_frame = cv2.imread(str(first_img_path))
    H, W, _ = first_frame.shape
    
    print(f"Video dimensions: {W}x{H}")
    
    # Set up VideoWriter
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec
    fps = 14.0  # Required FPS for Part 1
    frame_size = (W, H)
    
    # Set output path
    if debug == True:
        mp4_path = os.path.join(A5_savepath, 'task1_input.mp4')
    else:
        mp4_path = 'task1_input.mp4'
    
    writer = cv2.VideoWriter(mp4_path, fourcc, fps, frame_size)
    
    # Check if writer opened successfully
    if not writer.isOpened():
        print("ERROR: Could not open VideoWriter!")
    else:
        # Write all frames to video
        for img_file in image_files:
            img_path = Path(images1_path, img_file)
            img_bgr = cv2.imread(str(img_path))
            
            if img_bgr is not None:
                # cv2.VideoWriter expects BGR format
                writer.write(img_bgr)
            else:
                print(f"Warning: Could not read {img_file}")
        
        writer.release()
        print(f"✓ Part 1 Complete: Wrote video to {mp4_path}")
        print(f"  Total frames: {len(image_files)}")
        print(f"  FPS: {fps}")
        print(f"  Duration: {len(image_files)/fps:.2f} seconds")

In [None]:
# ===== PART 2: Model Implementation (40 points) =====
# Install required packages for YOLOv8 + DeepSORT

# Install ultralytics (YOLOv8)
!pip install ultralytics

# Install deep-sort-realtime
!pip install deep-sort-realtime

print("✓ Packages installed successfully!")

In [None]:
# ===== PART 2: YOLOv8 + DeepSORT Tracking Implementation =====

from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort

# Initialize YOLOv8 model
print("Loading YOLOv8 model...")
model = YOLO('yolov8n.pt')  # Using nano model for speed, can use yolov8s.pt, yolov8m.pt for better accuracy
print("✓ YOLOv8 model loaded")

# Initialize DeepSORT tracker
print("Initializing DeepSORT tracker...")
tracker = DeepSort(
    max_age=30,           # Maximum frames to keep track alive without detections
    n_init=3,             # Number of consecutive frames for track confirmation
    max_iou_distance=0.7, # Maximum IOU distance for matching
    embedder="mobilenet", # Feature extractor
    half=True,            # Use FP16 for speed
    embedder_gpu=torch.cuda.is_available()
)
print("✓ DeepSORT tracker initialized")

# Open input video
cap = cv2.VideoCapture(mp4_path)

# Get video properties
fps_input = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

print(f"\nInput Video Properties:")
print(f"  Resolution: {width}x{height}")
print(f"  FPS: {fps_input}")
print(f"  Total frames: {total_frames}")

# Set up output video writer
if debug == True:
    output_path = os.path.join(A5_savepath, 'task1.mp4')
    tracking_results_path = os.path.join(A5_savepath, 'task1_tracking.txt')
else:
    output_path = 'task1.mp4'
    tracking_results_path = 'task1_tracking.txt'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps_input, (width, height))

# Open file to save tracking results
tracking_file = open(tracking_results_path, 'w')

# Process video frame by frame
frame_idx = 0
tracking_data = []

print("\nProcessing video with YOLOv8 + DeepSORT...")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    frame_idx += 1
    
    # Run YOLOv8 detection (class 0 is 'person' in COCO dataset)
    results = model(frame, classes=[0], verbose=False)  # Only detect persons
    
    # Extract detections for DeepSORT
    detections = []
    for result in results:
        boxes = result.boxes
        for box in boxes:
            # Get bounding box coordinates
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
            conf = box.conf[0].cpu().numpy()
            
            # Convert to [left, top, width, height] format for DeepSORT
            bbox = [x1, y1, x2 - x1, y2 - y1]
            
            # DeepSORT expects: ([left, top, width, height], confidence, class_name)
            detections.append((bbox, conf, 'person'))
    
    # Update tracker with detections
    tracks = tracker.update_tracks(detections, frame=frame)
    
    # Draw bounding boxes and IDs on frame
    for track in tracks:
        if not track.is_confirmed():
            continue
        
        track_id = track.track_id
        ltrb = track.to_ltrb()  # Get [left, top, right, bottom]
        
        x1, y1, x2, y2 = map(int, ltrb)
        bb_left = x1
        bb_top = y1
        bb_width = x2 - x1
        bb_height = y2 - y1
        
        # Save tracking result: <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>
        tracking_file.write(f"{frame_idx},{track_id},{bb_left},{bb_top},{bb_width},{bb_height}\n")
        tracking_data.append([frame_idx, track_id, bb_left, bb_top, bb_width, bb_height])
        
        # Draw bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        
        # Draw tracking ID
        label = f'ID: {track_id}'
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        label_y = max(y1 - 10, label_size[1] + 10)
        
        # Draw background for text
        cv2.rectangle(frame, (x1, label_y - label_size[1] - 10), 
                     (x1 + label_size[0], label_y), (0, 255, 0), -1)
        
        # Draw text
        cv2.putText(frame, label, (x1, label_y - 5), 
                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
    
    # Add frame number to video
    cv2.putText(frame, f'Frame: {frame_idx}/{total_frames}', (10, 30), 
               cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
    
    # Write frame to output video
    out.write(frame)
    
    # Progress indicator
    if frame_idx % 50 == 0:
        print(f"  Processed frame {frame_idx}/{total_frames} ({frame_idx/total_frames*100:.1f}%)")

# Release resources
cap.release()
out.release()
tracking_file.close()

print(f"\n✓ Part 2 Complete: Object tracking finished!")
print(f"  Output video: {output_path}")
print(f"  Tracking results: {tracking_results_path}")
print(f"  Total frames processed: {frame_idx}")
print(f"  Total tracked detections: {len(tracking_data)}")

In [None]:
# ===== Display Sample Tracked Frame and Statistics =====

import pandas as pd

# Load tracking results
tracking_df = pd.DataFrame(tracking_data, columns=['frame', 'id', 'bb_left', 'bb_top', 'bb_width', 'bb_height'])

print("Tracking Statistics:")
print(f"  Total detections: {len(tracking_df)}")
print(f"  Unique track IDs: {tracking_df['id'].nunique()}")
print(f"  Frames with detections: {tracking_df['frame'].nunique()}")
print(f"  Average detections per frame: {len(tracking_df) / tracking_df['frame'].nunique():.2f}")

# Show distribution of track IDs
print("\nTrack ID distribution (top 10):")
print(tracking_df['id'].value_counts().head(10))

# Display a sample tracked frame
sample_frame_num = total_frames // 2  # Middle frame
cap_sample = cv2.VideoCapture(output_path)
cap_sample.set(cv2.CAP_PROP_POS_FRAMES, sample_frame_num)
ret, sample_frame = cap_sample.read()
cap_sample.release()

if ret:
    # Convert BGR to RGB for matplotlib
    sample_frame_rgb = cv2.cvtColor(sample_frame, cv2.COLOR_BGR2RGB)
    
    plt.figure(figsize=(12, 8))
    plt.imshow(sample_frame_rgb)
    plt.title(f'Sample Tracked Frame (Frame {sample_frame_num})')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    
print("\n✓ All outputs generated successfully!")