In [89]:
# Import packages

import torch
import torchvision
from torchvision import transforms as T
import torch.nn.functional as F

from PIL import Image
import cv2

import numpy as np
from ultralytics import YOLO

import math

from time import time
import matplotlib.pyplot as plt

import pandas as pd

In [3]:
# COCO class names

class_names = ["person" , "bicycle" , "car" , "motorcycle" , "airplane" , "bus" , "train" , "truck" , "boat" , "traffic light" , "fire hydrant" , "street sign" , "stop sign" , "parking meter" , "bench" , "bird" , "cat" , "dog" , "horse" , "sheep" , "cow" , "elephant" , "bear" , "zebra" , "giraffe" , "hat" , "backpack" , "umbrella" , "shoe" , "eye glasses" , "handbag" , "tie" , "suitcase" ,
"frisbee" , "skis" , "snowboard" , "sports ball" , "kite" , "baseball bat" ,
"baseball glove" , "skateboard" , "surfboard" , "tennis racket" , "bottle" ,
"plate" , "wine glass" , "cup" , "fork" , "knife" , "spoon" , "bowl" ,
"banana" , "apple" , "sandwich" , "orange" , "broccoli" , "carrot" , "hot dog" ,
"pizza" , "donut" , "cake" , "chair" , "couch" , "potted plant" , "bed" ,
"mirror" , "dining table" , "window" , "desk" , "toilet" , "door" , "tv" ,
"laptop" , "mouse" , "remote" , "keyboard" , "cell phone" , "microwave" ,
"oven" , "toaster" , "sink" , "refrigerator" , "blender" , "book" ,
"clock" , "vase" , "scissors" , "teddy bear" , "hair drier" , "toothbrush" , "hair brush"]

In [None]:
# Load faster rcnn
faster_rcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = True)

In [None]:
# Load single shot detector
ssd = torchvision.models.detection.ssd300_vgg16(pretrained = True)

In [6]:
# put to evaluation mode to not track gradients - just for inference
faster_rcnn.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [7]:
# put to evaluation mode to not track gradients - just for inference
ssd.eval()

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=

In [8]:
# load yolov8 large model
yolo = YOLO('../Yolo-Weights/yolov8l.pt')

In [105]:
# transform to tensors
transform = torchvision.transforms.ToTensor()

___

# **Detection on Video**

In [76]:
import cv2
import torch
from time import time

# Open the video file for processing
cap = cv2.VideoCapture("video.mp4")

# Initialize a dictionary to store results
results_fr = {
    'inference_time': [],  # To store the time taken for inference
    'conf_scores': [],     # To store confidence scores of detections
    'class_names': [],     # To store class names for each detection
    'bboxes': []           # To store bounding boxes of detections
}

while True:
    # Read a frame from the video
    success, img = cap.read()

    # Break the loop if no frame is captured (end of video)
    if not success:
        break

    # Apply transformations to the frame (e.g., resizing, normalization)
    transformed_img = transform(img)

    # Start timing the inference process
    start_time = time()
    
    # Perform object detection on the transformed image
    pred = faster_rcnn([transformed_img])
    
    # End timing the inference process
    end_time = time()

    # Extract bounding boxes, labels, and scores from the prediction
    bboxes, labels, scores = pred[0]['boxes'], pred[0]['labels'], pred[0]['scores']

    # Filter out predictions with scores below the threshold (0.8 in this case)
    num = torch.argwhere(scores > 0.8).shape[0]

    # Initialize lists to store results for the current frame
    classes = []
    boxes = []
    conf_scores = []

    for i in range(num):
        # Convert bounding box coordinates to integers
        x1, y1, x2, y2 = bboxes[i].detach().numpy().astype('int')
        boxes.append([x1, y1, x2, y2])
        
        # Draw the bounding box on the original image
        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 1)
        
        # Get the class name from the label
        class_name = class_names[labels.detach().numpy()[i] - 1]
        classes.append(class_name)
        
        # Calculate and round the confidence score
        conf_score = round(scores.detach().numpy()[i] * 100)
        conf_scores.append(conf_score)
        
        # Put text with the class name on the image
        cv2.putText(img, class_name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

    # Append results of the current frame to the results dictionary
    results_fr['bboxes'].append(boxes)
    results_fr['conf_scores'].append(conf_scores)
    results_fr['class_names'].append(classes)
    results_fr['inference_time'].append(end_time - start_time)

    # Display the frame with bounding boxes and labels
    cv2.imshow('Frame', img)

    # Break the loop if 'c' or 'C' is pressed
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break

# Release the video capture object and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()

In [83]:
cap = cv2.VideoCapture("video.mp4")

results_s = {'inference_time' : [],
           'conf_scores' : [],
           'class_names' : [],
           'bboxes' : []}

i = 0

while True:
  success, img = cap.read()

  if not success:
    break

  transformed_img = transform(img)

  start_time = time()
  pred = ssd([transformed_img])
  end_time = time()

  bboxes, labels, scores = pred[0]['boxes'], pred[0]['labels'], pred[0]['scores']

  num = torch.argwhere(scores > 0.8).shape[0]

  classes = []
  boxes = []
  conf_scores = []

  for i in range(num):
    x1, y1, x2, y2 = bboxes[i].detach().numpy().astype('int')
    boxes.append([x1, y1, x2, y2])
    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 1)
    class_name = class_names[labels.detach().numpy()[i] - 1]
    classes.append(class_name)
    conf_score = round(scores.detach().numpy()[i] * 100)
    conf_scores.append(conf_score)
    cv2.putText(img, class_name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

  results_s['bboxes'].append(boxes)
  results_s['conf_scores'].append(conf_scores)
  results_s['class_names'].append(classes)
  results_s['inference_time'].append(end_time - start_time)

  cv2.imshow('Frame', img)

  key = cv2.waitKey(1) & 0xFF

  if key == ord('q'):
    break

cap.release()
cv2.destroyAllWindows()



In [90]:
def yolo_transform(img):
    # Convert the image from BGR (OpenCV default) to RGB color space
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    # Convert the RGB image to a PyTorch tensor and normalize pixel values to the range [0, 1]
    img_tensor = torch.tensor(img_rgb).float() / 255.0
    
    # Rearrange the tensor dimensions from (H, W, C) to (C, H, W) as required by PyTorch models
    img_tensor = img_tensor.permute(2, 0, 1)
    
    return img_tensor

In [102]:
import cv2
import torch
import torchvision.transforms.functional as F
from time import time

# Open the video file for processing
cap = cv2.VideoCapture("video.mp4")

# Initialize a dictionary to store results
results_y = {
    'inference_time': [],  # To store the time taken for inference
    'conf_scores': [],     # To store confidence scores of detections
    'class_names': [],     # To store class names for each detection
    'bboxes': []           # To store bounding boxes of detections
}

# Counter for frames
i = 0

while True:
    # Read a frame from the video
    success, img = cap.read()

    # Break the loop if no frame is captured (end of video)
    if not success:
        break

    # Transform the frame (e.g., resize and normalize)
    transformed_img = transform(img)

    # Resize the image dimensions to be multiples of 32 for YOLO
    new_height = (transformed_img.shape[1] // 32) * 32
    new_width = (transformed_img.shape[2] // 32) * 32
    
    # Use bilinear interpolation to resize the image
    transformed_img_resized = F.interpolate(
        transformed_img.unsqueeze(dim=0), 
        size=(new_height, new_width), 
        mode='bilinear', 
        align_corners=False
    )

    # Start timing the inference process
    start_time = time()
    
    # Perform object detection on the resized image
    preds = yolo(transformed_img_resized)
    
    # End timing the inference process
    end_time = time()

    # Initialize lists to store results for the current frame
    classes = []
    boxes = []
    conf_scores = []

    # Process each prediction
    for pred in preds:
        # Extract bounding boxes from the prediction
        box = pred.boxes
        
        for b in box:
            # Extract confidence score and convert to percentage
            conf_score = round(b.conf[0].cpu().numpy() * 100)
            
            # Filter out predictions with low confidence
            if conf_score > 80:
                # Extract bounding box coordinates
                x1, y1, x2, y2 = b.xyxy[0].cpu().numpy().astype('int')
                boxes.append([x1, y1, x2, y2])
                
                # Draw the bounding box on the original image
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 1)
                
                # Append confidence score and class name to lists
                conf_scores.append(conf_score)
                class_name = class_names[int(b.cls[0].cpu().numpy())]
                classes.append(class_name)
                
                # Put text with the class name on the image
                cv2.putText(img, class_name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

    # Append results of the current frame to the results dictionary
    results_y['bboxes'].append(boxes)
    results_y['conf_scores'].append(conf_scores)
    results_y['class_names'].append(classes)
    results_y['inference_time'].append(end_time - start_time)

    # Display the frame with bounding boxes and labels
    cv2.imshow('Frame', img)

    # Break the loop if 'c' or 'C' is pressed
    key = cv2.waitKey(1) & 0xFF
    if key == ord('c') or key == ord('C'):
        break

# Release the video capture object and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()


0: 352x640 6 persons, 1 boat, 1 backpack, 154.4ms
Speed: 708.1ms preprocess, 154.4ms inference, 7.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 6 persons, 1 boat, 1 backpack, 1 handbag, 69.6ms
Speed: 2.0ms preprocess, 69.6ms inference, 6.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 7 persons, 1 boat, 1 handbag, 17.0ms
Speed: 1.0ms preprocess, 17.0ms inference, 5.9ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 6 persons, 1 boat, 1 handbag, 17.7ms
Speed: 0.7ms preprocess, 17.7ms inference, 6.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 7 persons, 1 boat, 19.2ms
Speed: 1.0ms preprocess, 19.2ms inference, 7.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 7 persons, 1 boat, 19.0ms
Speed: 1.0ms preprocess, 19.0ms inference, 7.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 7 persons, 1 boat, 18.7ms
Speed: 1.0ms preprocess, 18.7ms inference, 8.3ms postprocess per image at shape (1, 3, 352,

In [103]:
results_yolo_df = pd.DataFrame(results_y) # dataframe for yolo results
results_yolo_df.tail() # last 5 rows

Unnamed: 0,inference_time,conf_scores,class_names,bboxes
276,0.02667,"[90, 84]","[person, person]","[[598, 146, 639, 254], [262, 156, 288, 262]]"
277,0.028099,"[87, 84, 82]","[person, person, person]","[[598, 146, 638, 253], [255, 156, 287, 261], [..."
278,0.027832,"[86, 85, 83]","[person, person, person]","[[597, 145, 633, 254], [248, 156, 287, 261], [..."
279,0.031247,"[86, 84, 81]","[person, person, person]","[[242, 156, 287, 261], [536, 153, 575, 253], [..."
280,0.027755,"[87, 83, 81]","[person, person, person]","[[239, 156, 287, 261], [591, 145, 623, 255], [..."


In [104]:
print(f"Average inference time YOLO: {results_yolo_df['inference_time'].mean()}") # average inference time yolo

Average inference time YOLO: 0.033976564203717106


In [78]:
results_fr_df = pd.DataFrame(results_fr) # dataframe for faster rcnn
results_fr_df.tail() # last 5 rows

Unnamed: 0,inference_time,conf_scores,class_names,bboxes
276,3.998757,"[100, 100, 100, 100, 99, 99, 91, 87]","[person, person, person, person, person, perso...","[[598, 150, 635, 260], [261, 158, 288, 267], [..."
277,3.638347,"[100, 100, 100, 99, 99, 98, 86]","[person, person, person, person, person, perso...","[[259, 158, 288, 267], [598, 149, 633, 261], [..."
278,3.889045,"[100, 100, 100, 100, 100, 99, 89]","[person, person, person, person, person, perso...","[[597, 149, 631, 261], [201, 176, 235, 266], [..."
279,2.614484,"[100, 100, 100, 100, 99, 99]","[person, person, person, person, person, person]","[[595, 149, 627, 260], [200, 175, 233, 267], [..."
280,3.239111,"[100, 100, 100, 99, 99, 97]","[person, person, person, person, person, person]","[[594, 148, 624, 260], [536, 156, 571, 259], [..."


In [79]:
print(f"Average inference time Faster RCNN: {results_fr_df['inference_time'].mean()}") # average inference time faster rcnn

Average inference time Faster RCNN: 2.606488109059181


In [80]:
results_s_df = pd.DataFrame(results_s) # dataframe for ssd
results_s_df.tail() # last 5 rows

Unnamed: 0,inference_time,conf_scores,class_names,bboxes
276,0.407242,"[99, 89]","[person, person]","[[543, 160, 584, 261], [203, 174, 240, 268]]"
277,0.226633,"[98, 90]","[person, person]","[[540, 163, 581, 261], [202, 173, 239, 268]]"
278,0.437428,"[97, 87]","[person, person]","[[539, 162, 579, 262], [201, 173, 238, 268]]"
279,0.416263,"[95, 90]","[person, person]","[[537, 159, 576, 263], [589, 150, 627, 257]]"
280,0.315461,"[94, 93, 91]","[person, person, person]","[[587, 150, 624, 258], [536, 158, 572, 262], [..."


In [81]:
print(f"Average inference time SSD: {results_s_df['inference_time'].mean()}") # average inference time ssd

Average inference time SSD: 0.39548616052946584


___

# **Tracking with DeepSORT**

In [115]:
from deep_sort.utils.parser import get_config
from deep_sort.deep_sort import DeepSort

deep_sort_weights = 'deep_sort/deep/checkpoint/ckpt.t7' # Path to the DeepSort model
tracker = DeepSort(model_path=deep_sort_weights, max_age=70) # Initialize the DeepSort tracker

In [131]:
import cv2

# Open the video file for processing
cap = cv2.VideoCapture("video.mp4")

# Retrieve and store the width of the video frames
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

# Retrieve and store the height of the video frames
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Retrieve and store the frames per second (FPS) of the video
fps = cap.get(cv2.CAP_PROP_FPS)

In [132]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' # check if GPU is available
device

'cuda'

In [133]:
frames = []  # Initialize an empty list to store the frames
unique_track_ids = set()  # Initialize an empty set to store the unique track IDs

In [134]:
# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_path = 'output1.mp4'
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

In [None]:
# Initialize variables for tracking FPS and elapsed time
i = 0
counter, fps, elapsed = 0, 0, 0
start_time = time()  # Record the start time for FPS calculation

# Dictionary to store tracking results, including confidence scores, class names, bounding boxes, and track IDs
results_track = {
    'conf_scores': [],
    'class_names': [],
    'bboxes': [],
    'track_id': []
}

# Start reading the video frames
while cap.isOpened():
    ret, frame = cap.read()  # Read a frame from the video

    if ret:
        # Convert the frame from BGR to RGB format (YOLO typically expects RGB)
        og_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = og_frame.copy()

        # Load the YOLOv8 model
        model = YOLO("yolov8l.pt")

        # Run the model on the current frame with specified settings (device, classes, confidence threshold)
        results = model(frame, device=0, classes=0, conf=0.8)

        # Process the results from YOLO for each detection
        for result in results:
            boxes = result.boxes  # Extract bounding boxes
            cls = boxes.cls.tolist()  # Extract class indices
            xyxy = boxes.xyxy  # Extract bounding box coordinates (x1, y1, x2, y2)
            conf = boxes.conf  # Extract confidence scores
            xywh = boxes.xywh  # Extract bounding box coordinates (x, y, width, height)
            
            # Map class indices to class names
            for class_index in cls:
                class_name = class_names[int(class_index)]

        # Convert predictions to numpy arrays and detach from computation graph
        pred_cls = np.array(cls)
        conf = conf.detach().cpu().numpy()
        xyxy = xyxy.detach().cpu().numpy()
        bboxes_xywh = xywh.cpu().numpy()

        # Store the tracking results in the results_track dictionary
        results_track['conf_scores'].extend(conf.tolist())
        results_track['class_names'].extend([class_names[int(index)] for index in cls])
        results_track['bboxes'].extend(xyxy.tolist())
        
        # Update the tracker with new bounding boxes and confidence scores
        tracks = tracker.update(bboxes_xywh, conf, og_frame)

        # Iterate over the tracks to draw bounding boxes and track IDs on the frame
        for track in tracker.tracker.tracks:
            track_id = track.track_id  # Retrieve the track ID
            hits = track.hits  # Number of hits for the track (how often it was detected)
            x1, y1, x2, y2 = track.to_tlbr()  # Convert bounding box format to (top-left, bottom-right)
            w = x2 - x1  # Calculate the width of the bounding box
            h = y2 - y1  # Calculate the height of the bounding box

            # Define colors for bounding boxes based on track ID
            red_color = (0, 0, 255)  # Red color
            blue_color = (255, 0, 0)  # Blue color
            green_color = (0, 255, 0)  # Green color

            # Cycle through colors based on track ID
            color_id = track_id % 3
            if color_id == 0:
                color = red_color
            elif color_id == 1:
                color = blue_color
            else:
                color = green_color

            # Draw the bounding box with the appropriate color
            cv2.rectangle(og_frame, (int(x1), int(y1)), (int(x1 + w), int(y1 + h)), color, 2)

            # Draw the class name and track ID above the bounding box
            text_color = (0, 0, 0)  # Black color for text
            cv2.putText(og_frame, f"{class_name}-{track_id}", (int(x1) + 10, int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, text_color, 1, cv2.LINE_AA)
            
            # Add the track ID to the set of unique track IDs
            unique_track_ids.add(track_id)

        # Count the number of unique persons being tracked
        person_count = len(unique_track_ids)

        # Update the FPS calculation based on elapsed time
        current_time = time()
        elapsed = (current_time - start_time)
        counter += 1
        if elapsed > 1:
            fps = counter / elapsed
            counter = 0
            start_time = current_time

        # Display the person count on the frame
        cv2.putText(og_frame, f"Person Count: {person_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Store the processed frame for output
        frames.append(og_frame)

        # Write the frame to the output video file
        out.write(cv2.cvtColor(og_frame, cv2.COLOR_RGB2BGR))

        # Display the frame with bounding boxes and tracking info
        cv2.imshow("Video", og_frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to exit the loop
            break

# Release resources after processing is done
cap.release()
out.release()
cv2.destroyAllWindows()