<a href="https://colab.research.google.com/github/AndyCatruna/DSM/blob/main/Lab_05b_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Object Detection

Object detection refers to the task of predicting the bounding boxes and classes of the objects in the image.

In this lab we will train and evaluate a small object detection model for educational purposes.

However, if you want to utilize efficient pre-trained models for detection or fine-tune them, we recommend checking out [ultralytics](https://github.com/ultralytics/ultralytics) which makes inference and training very easy.

<img src="https://visionplatform.ai/wp-content/uploads/2024/01/object-detection.png" width=500>

In [None]:
# Install the extra packages for this notebook: torchmetrics provides the
# MeanAveragePrecision metric imported below; pycocotools is presumably the
# evaluation backend that metric relies on (verify against the torchmetrics docs).
!pip install -q torchmetrics pycocotools

In [None]:
import sys

import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import VOCDetection
from torchvision import transforms
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchvision.ops import nms

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Only parameters with ``requires_grad`` set are counted, so frozen layers
    do not contribute to the result.
    """
    trainable_elements = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_elements += parameter.numel()
    return trainable_elements

In [None]:
# Maps each Pascal VOC class name to its integer label. Labels start at 1,
# leaving index 0 unused by this mapping.
label_map = {
    class_name: class_index
    for class_index, class_name in enumerate(
        (
            "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow",
            "diningtable", "dog", "horse", "motorbike", "person",
            "pottedplant", "sheep", "sofa", "train", "tvmonitor",
        ),
        start=1,
    )
}

We will use the [Pascal VOC Dataset](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) for this part of the lab as well.

In [None]:
class VOCDetectionDataset(VOCDetection):
    """Wrapper over ``VOCDetection`` that rescales bounding boxes to the transformed image.

    The parent dataset is constructed without a transform, so ``__getitem__``
    receives the parent's image and annotation dictionary, applies
    ``transform`` itself, and then scales every box from the original image
    size (read from the annotation) to the size of the transformed image.

    NOTE: only a pure resize is accounted for. If ``transform`` crops or flips
    the image, the boxes will no longer line up with it.

    Args:
        root: Directory where the dataset is stored or downloaded to.
        image_set: Name of the split, passed through to ``VOCDetection``.
        transform: Optional callable applied to the image. When ``None`` the
            image is returned as produced by the parent and boxes keep their
            original coordinates.
        download: Passed through to ``VOCDetection``.
    """

    def __init__(self, root, image_set, transform=None, download=False):
        super().__init__(root=root, image_set=image_set, download=download)

        # Stored after the parent constructor so that the parent itself is
        # built without any transform; it is applied in __getitem__ instead.
        self.transform = transform

        # label_map assigns labels 1..20; the extra class is presumably the
        # background at index 0 (matches num_classes=21 used for the model).
        self.num_classes = 21

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``target`` is a dict with ``"boxes"`` (float32, shape ``(N, 4)`` in
        ``[xmin, ymin, xmax, ymax]`` order), ``"labels"`` (int64, shape
        ``(N,)``) and ``"image_id"`` (tensor holding ``idx``).
        """
        image, target = super().__getitem__(idx)
        annotation = target["annotation"]

        # Original image dimensions as recorded in the annotation
        orig_width = int(annotation["size"]["width"])
        orig_height = int(annotation["size"]["height"])

        if self.transform is not None:
            image = self.transform(image)

        # Read the output size from the image itself instead of hard-coding it,
        # so the boxes stay correct for any resize target.
        if isinstance(image, torch.Tensor):
            new_height, new_width = image.shape[-2:]
        else:
            # Assumes a PIL-style image whose ``size`` is (width, height)
            new_width, new_height = image.size

        boxes = []
        labels = []
        for obj in annotation["object"]:
            bndbox = obj["bndbox"]
            boxes.append([
                int(bndbox["xmin"]) * new_width / orig_width,
                int(bndbox["ymin"]) * new_height / orig_height,
                int(bndbox["xmax"]) * new_width / orig_width,
                int(bndbox["ymax"]) * new_height / orig_height,
            ])
            labels.append(label_map[obj["name"]])

        # reshape keeps the (N, 4) layout even when a sample has no objects
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
        }

        return image, target

We will work with only a subset of the dataset, so the training does not take too long.

In [None]:
# Code for obtaining the dataset

# Each image has a variable number of objects, so the targets cannot be
# stacked into a single tensor. collate_fn therefore regroups the batch into
# one tuple of images and one tuple of targets.

def collate_fn(batch):
    """Turn a list of ``(image, target)`` samples into ``(images, targets)`` tuples."""
    transposed = tuple(zip(*batch))
    images, targets = transposed
    return images, targets

# Training and evaluation use the same preprocessing: resize to 128x128,
# convert to a tensor, then normalize each channel with the mean/std below
# (these look like the usual ImageNet statistics - confirm if it matters).
train_transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

test_transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

train_dataset = VOCDetectionDataset(
    root='./data',
    image_set='train',
    download=True,
    transform=train_transform
)

# Obtain training subset.
# replace=False draws distinct indices; the default samples with replacement,
# which silently puts duplicate images into the subset.
train_indices = np.random.choice(
    len(train_dataset), min(2000, len(train_dataset)), replace=False
)
train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())

# shuffle=True so every epoch sees the training samples in a new order
trainloader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn
)

test_dataset = VOCDetectionDataset(
    root='./data',
    image_set='val',
    download=True,
    transform=test_transform,
)

# Obtain testing subset (distinct indices, as for the training subset)
test_indices = np.random.choice(
    len(test_dataset), min(200, len(test_dataset)), replace=False
)
test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())

# Evaluation order does not matter, so the test loader is not shuffled
testloader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn
)

In [None]:
# Code for visualizing the ground truth bounding boxes and predicted bounding boxes

def _draw_boxes(ax, boxes, labels, index_to_name, edgecolor, textcolor, scores=None):
    """Draw one rectangle and one caption per box on ``ax``.

    The caption is the class name, followed by the score when ``scores`` is given.
    """
    for position, (box, label) in enumerate(zip(boxes, labels)):
        x_min, y_min, x_max, y_max = box
        rect = plt.Rectangle(
            (x_min, y_min), x_max - x_min, y_max - y_min,
            linewidth=2, edgecolor=edgecolor, facecolor='none',
        )
        ax.add_patch(rect)

        # Fall back to the raw index for labels that are not in label_map
        caption = index_to_name.get(int(label), str(int(label)))
        if scores is not None:
            caption = f"{caption} ({scores[position]:.2f})"
        ax.text(x_min, y_min - 5, caption, color=textcolor, fontsize=12, weight='bold', backgroundcolor='white')


def visualize_images_and_boxes(images, targets, predictions=None, num_samples=5, confidence_threshold=0.3, iou_threshold=0.5):
    """Plot images with ground-truth boxes and, optionally, predicted boxes.

    Args:
        images: Sequence of normalized image tensors in (3, H, W) layout.
        targets: Sequence of dicts with ``"boxes"`` and ``"labels"`` tensors.
        predictions: Optional sequence of dicts with ``"boxes"``, ``"labels"``
            and ``"scores"`` tensors, aligned with ``images``. When given, a
            second column shows the predictions. The dicts are not modified.
        num_samples: Maximum number of images to show.
        confidence_threshold: Predictions scoring at or below this are dropped.
        iou_threshold: IoU threshold passed to NMS on the remaining predictions.
    """
    num_samples = min(num_samples, len(images))
    if num_samples <= 0:
        # Nothing to draw; a figure with zero rows cannot be created
        return

    num_cols = 2 if predictions is not None else 1
    # squeeze=False keeps axs two-dimensional, so axs[row, col] works even
    # for a single row or a single column
    fig, axs = plt.subplots(num_samples, num_cols, figsize=(8, 4 * num_samples), squeeze=False)

    # Built once instead of scanning label_map for every box
    index_to_name = {index: name for name, index in label_map.items()}

    # Statistics used to undo the normalization applied by the transforms
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    for i, (image, target) in enumerate(zip(images[:num_samples], targets[:num_samples])):
        # De-normalize on the CPU so images still on the GPU can be plotted too
        image = torch.clamp(image.detach().cpu() * std + mean, 0, 1)
        image_np = image.permute(1, 2, 0).numpy()

        ax_gt = axs[i, 0]
        ax_gt.imshow(image_np)
        ax_gt.set_title("Ground Truth")
        _draw_boxes(
            ax_gt,
            target["boxes"].cpu().numpy(),
            target["labels"].cpu().numpy(),
            index_to_name,
            edgecolor='r',
            textcolor='red',
        )
        ax_gt.axis('off')

        if predictions is None:
            continue

        # Work on local references so the caller's prediction dicts stay intact
        pred_boxes = predictions[i]["boxes"]
        pred_labels = predictions[i]["labels"]
        pred_scores = predictions[i]["scores"]

        # Filter predictions with low scores
        keep = pred_scores > confidence_threshold
        pred_boxes, pred_labels, pred_scores = pred_boxes[keep], pred_labels[keep], pred_scores[keep]

        # Apply non-maximum suppression
        keep = nms(pred_boxes, pred_scores, iou_threshold=iou_threshold)
        pred_boxes, pred_labels, pred_scores = pred_boxes[keep], pred_labels[keep], pred_scores[keep]

        ax_pred = axs[i, 1]
        ax_pred.imshow(image_np)
        ax_pred.set_title("Predictions")
        _draw_boxes(
            ax_pred,
            pred_boxes.detach().cpu().numpy(),
            pred_labels.detach().cpu().numpy(),
            index_to_name,
            edgecolor='g',
            textcolor='green',
            scores=pred_scores.detach().cpu().numpy(),
        )
        ax_pred.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Take the first batch from trainloader and show up to 5 of its images
# together with their ground-truth bounding boxes
train_batches = iter(trainloader)
images, targets = next(train_batches)

visualize_images_and_boxes(images, targets, num_samples=5)

We will train a Single-Shot-Detector (SSD) architecture that uses a MobileNet_v3 backbone.

Details about the backbone can be found [here](https://arxiv.org/abs/1905.02244)

The backbone is tasked with extracting relevant features.

Details about SSD can be found [here](https://arxiv.org/abs/1512.02325)

**High-Level Overview of Object Detection**

The detection takes as input the feature maps extracted by the backbone and outputs the bounding box predictions along with class scores.

It works by dividing the image into a grid of cells. Each grid cell is tasked with predicting the object whose center falls inside it.

For each grid cell the model predicts a number of bounding boxes, along with the confidence and class scores.

A lot of predicted bounding boxes will have no object inside them - the model will predict the background class for those predictions.

A lot of predicted bounding boxes will overlap on the same object - we will use Non-Maximum Suppression (NMS) to eliminate overlapping predictions and keep the one with the highest confidence. You can read more about NMS [here](https://medium.com/analytics-vidhya/non-max-suppression-nms-6623e6572536)


<img src="https://iq.opengenus.org/content/images/2021/12/1_St98vVQEqLndeV_-SeUc9Q.png">

In [None]:
# SSDLite detector with 21 output classes; the backbone starts from
# pretrained weights and 2 of its layers are left trainable.
ssd_lite = ssdlite320_mobilenet_v3_large(
    num_classes=21,
    pretrained_backbone=True,
    trainable_backbone_layers=2,
)

# Move the model to the selected device (as the last expression of the cell,
# this also displays the model)
ssd_lite.to(device)

In [None]:
# Number of trainable parameter elements in the detector (only parameters
# with requires_grad set are counted)
count_parameters(ssd_lite)

The model computes its losses internally based on the number of objects in the image. It trains for both localization and classification. We obtain a dictionary of losses, which we sum to train the model.

For validation we filter predictions below a confidence score and apply NMS to remove overlapping boxes.

We utilize Mean Average Precision to evaluate the model. You can read more about it [here](https://towardsdatascience.com/what-is-average-precision-in-object-detection-localization-algorithms-and-how-to-calculate-it-3f330efe697b)

In [None]:
def train_epoch(model, dataloader, device, optimizer, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)`` in training mode.
        dataloader: Yields ``(images, targets)`` batches, where ``images`` is a
            sequence of equally sized image tensors and ``targets`` a sequence
            of dicts of tensors.
        device: Device the batch is moved to before the forward pass.
        optimizer: Optimizer updating the model parameters.
        epoch: Epoch index, only shown in the progress bar.

    Returns:
        The per-image average of the summed loss over the epoch, rounded to
        two decimals, or ``0.0`` when the dataloader yields no batches.
    """
    model.train()

    total_train_loss = 0.0
    dataset_size = 0
    # Defined up front so an empty dataloader returns 0.0 instead of failing
    # on an unbound variable at the return statement
    epoch_loss = 0.0

    bar = tqdm(dataloader, total=len(dataloader), colour='cyan', file=sys.stdout)
    for images, targets in bar:
        images = torch.stack([img.to(device) for img in images])
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        batch_size = len(images)

        # In training mode the model returns its individual losses; their sum
        # is the quantity that gets optimized
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the running value is a
        # per-image average even when the last batch is smaller
        total_train_loss += (losses.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = np.round(total_train_loss / dataset_size, 2)
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss)

    return epoch_loss


# Validation Function
def valid_epoch(model, dataloader, device, iou_threshold=0.5, score_threshold=0.1):
    """Evaluate ``model`` on ``dataloader`` with Mean Average Precision.

    Predictions scoring at or below ``score_threshold`` are discarded and the
    remainder is passed through NMS with ``iou_threshold`` before being fed to
    the metric.

    Args:
        model: Detection model that returns a list of prediction dicts
            (``"boxes"``, ``"scores"``, ``"labels"``) in evaluation mode.
        dataloader: Yields ``(images, targets)`` batches.
        device: Device the batch is moved to before the forward pass.
        iou_threshold: IoU threshold used by NMS.
        score_threshold: Minimum score a prediction must exceed to be kept.

    Returns:
        The dictionary computed by ``MeanAveragePrecision``; its ``"map"``,
        ``"map_50"`` and ``"map_75"`` entries are also printed.
    """
    model.eval()
    metric = MeanAveragePrecision()

    bar = tqdm(dataloader, total=len(dataloader), colour='cyan', file=sys.stdout)
    # Disable gradient tracking here so the function is safe to call on its
    # own; nesting inside a caller's no_grad block is harmless
    with torch.no_grad():
        for images, targets in bar:
            images = torch.stack([img.to(device) for img in images])
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Inference needs only the images; targets are used for the metric
            predictions = model(images)

            filtered_predictions = []
            for pred in predictions:
                boxes = pred['boxes']
                scores = pred['scores']
                labels = pred['labels']

                # Filter boxes with low scores
                keep = scores > score_threshold
                boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

                # Apply NMS
                keep = nms(boxes, scores, iou_threshold)
                filtered_predictions.append({
                    'boxes': boxes[keep],
                    'scores': scores[keep],
                    'labels': labels[keep]
                })

            # Update metric
            metric.update(filtered_predictions, targets)

    metrics = metric.compute()
    print(f"Mean Average Precision: {metrics['map'].item()}")
    print(f"Mean Average Precision (50% IOU): {metrics['map_50'].item()}")
    print(f"Mean Average Precision (75% IOU): {metrics['map_75'].item()}")

    return metrics

def run_training(model, num_epochs, learning_rate):
    """Train ``model`` with Adam and validate it after every epoch.

    Reads the module-level ``trainloader``, ``testloader`` and ``device``.
    After each epoch the validation metrics are printed by ``valid_epoch``,
    and a message is printed whenever the mAP at 50% IoU beats the best value
    seen so far.

    Args:
        model: Detection model to train.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate for the Adam optimizer.
    """
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    # Report the GPU in use, if there is one
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    # Best validation mAP-50 seen so far
    best_map_50 = 0.0

    for epoch in range(num_epochs):
        train_epoch(model, trainloader, device, adam, epoch)

        # Validation does not need gradients
        with torch.no_grad():
            epoch_metrics = valid_epoch(model, testloader, device)
        current_map_50 = epoch_metrics['map_50'].item()

        if current_map_50 > best_map_50:
            print(f"mAP-50 Improved ({best_map_50} ---> {current_map_50})")
            best_map_50 = current_map_50
        print()

In [None]:
# Train the detector for 15 epochs with a learning rate of 0.001.
# You may have to play with the hyperparameters to obtain better results
run_training(ssd_lite, 15, 0.001)

In [None]:
# Compare predictions with ground truth on the first batch from trainloader
train_batches = iter(trainloader)
images, targets = next(train_batches)

# Get predictions: evaluation mode, no gradient tracking
ssd_lite.eval()
with torch.no_grad():
    images = [image.to(device) for image in images]
    predictions = ssd_lite(images)

# Bring the images back to the CPU for plotting
images = [image.cpu() for image in images]

visualize_images_and_boxes(images, targets, predictions, num_samples=5)

In [None]:
# Compare predictions with ground truth on the first batch from testloader
# (up to 10 images are shown)
test_batches = iter(testloader)
images, targets = next(test_batches)

# Get predictions: evaluation mode, no gradient tracking
ssd_lite.eval()
with torch.no_grad():
    images = [image.to(device) for image in images]
    predictions = ssd_lite(images)

# Bring the images back to the CPU for plotting
images = [image.cpu() for image in images]

visualize_images_and_boxes(
    images,
    targets,
    predictions,
    num_samples=10,
    confidence_threshold=0.5,
    iou_threshold=0.5,
)

Play with the ```confidence_threshold``` and ```iou_threshold``` values in the call to the ```visualize_images_and_boxes()``` function. What do you observe?


