In [None]:
# Reload edited modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Optional: auto-format notebook cells with black when jupyter_black is available
try:
    import jupyter_black

    jupyter_black.load()
except ImportError:
    print("black not installed")
except Exception as exc:
    # Formatting is a convenience only: stay best-effort, but report the real reason
    print(f"black could not be loaded: {exc}")

# Object-Detection

## Goals

- Object classification and localization: Understand how objects can be localized using an example with restricted complexity.
- Object detection: Use and understand a pre-trained model.

## Setup

Let's define paths, install & load the necessary Python packages.

**Optional: Save the notebook to your personal google drive to persist changes.**

**Optional: Change runtime to a GPU instance (if using Google Colab)** 

Mount your google drive to store data and results (if running the code in Google Colab).

In [None]:
# Detect Google Colab by probing for its runtime-only package
try:
    import google.colab  # noqa: F401
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

print(f"In colab: {IN_COLAB}")

In [None]:
# Mount Google Drive so data and results persist across Colab sessions
if IN_COLAB:
    # google.colab can only be imported inside a Colab runtime
    from google.colab import drive

    drive.mount("/content/drive")

**Modify the following paths if necessary.**

That is where your data will be stored.

In [None]:
from pathlib import Path

# Colab keeps the data on the mounted Google Drive; otherwise use the local workspace
DATA_PATH = Path("/content/drive/MyDrive/bveri") if IN_COLAB else Path("/workspace/code/data")

Install `dl_cv_lectures`

In [None]:
# Install the course package on demand when it cannot be imported
try:
    import dl_cv_lectures

    print("dl_cv_lectures installed, all good")
except ImportError:
    import subprocess
    import sys

    # Run pip through the kernel's own interpreter so the package is installed
    # into the environment this notebook runs in (a bare `pip` on PATH may
    # belong to a different Python installation).
    pip_install = [sys.executable, "-m", "pip", "install"]

    if Path("/workspace/code/src").exists():
        print("Installing from local repo")
        # check=False: like os.system before, a failed install does not raise here
        subprocess.run([*pip_install, "-e", "/workspace/code"], check=False)
    else:
        print("Installing from git repo")
        subprocess.run(
            [*pip_install, "git+https://github.com/i4Ds/bveri-exercises-hs2024"],
            check=False,
        )

Load all packages

In [None]:
import cv2
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torchshow as ts
import torchvision
import torchvision.transforms.v2.functional as TF
from IPython.display import Image
from matplotlib import pyplot as plt
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm.notebook import tqdm

from dl_cv_lectures import visualize

Define a default device for your computations.

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using: {device}")

## Object Classification and Localization

We simplify the task: We classify and localize just one object per image. This demonstrates how object detection works and illustrates the fundamental challenges.


### Dataset

We create an artificial dataset. The following class creates images and labels on-the-fly.

In [None]:
from dl_cv_lectures.data import shapes

Let's take a look at the dataset.

In [None]:
from torchvision.utils import draw_bounding_boxes

# Demo dataset: at most one shape per image, white background, img_size=64
ds = shapes.ShapeDataset(img_size=64, max_number_of_shapes_per_image=1, background="white")


def get_images_and_labels_from_ds(
    ds: torch.utils.data.Dataset, num_images_to_fetch: int = 16
) -> tuple[list[torch.Tensor], list]:
    """Fetch the first n samples from a torch.utils.data.Dataset with (image, label) signature.

    Args:
        ds: Dataset whose items are indexable as (image, label).
        num_images_to_fetch: Number of leading samples to read.

    Returns:
        A tuple ``(images, labels)``. ``images`` holds one float32 tensor per
        sample in (N x C x H x W) format with N == 1, scaled to 0-1.
        ``labels`` holds the matching label objects in the same order.
    """
    images = []
    labels = []
    for i in range(num_images_to_fetch):
        # Read every sample exactly once: a dataset that generates its items
        # on-the-fly may return a different sample on each access, so indexing
        # twice could pair an image with the label of another sample.
        sample = ds[i]
        # convert to (N x C x H x W) format and scale to 0-1
        images.append(TF.to_image(sample[0]).to(torch.float32).unsqueeze(0) / 255.0)
        labels.append(sample[1])

    return images, labels


images, labels = get_images_and_labels_from_ds(ds, num_images_to_fetch=16)

# Overlay the annotated box on each fetched image
images_to_plot = []
for image, label in zip(images, labels):
    # drop the batch dimension and go back to uint8 pixel values (C x H x W)
    image = image.squeeze(0).mul(255.0).to(torch.uint8)
    boxes = torch.tensor(label["box"])
    img_with_box = draw_bounding_boxes(image=image, boxes=boxes)
    images_to_plot.append(img_with_box)

# Arrange all annotated images in one square grid
fig, ax = visualize.plot_square_collage(images_to_plot)

In [None]:
# Display the image (element 0) of the first sample
ts.show(ds[0][0])

**Question:** What do you see in the outputs above? What is in the dataset?

### Create a Training-Dataset

First, we look at the output of the `DataLoader` object. That's what the model needs to process.

Check the outputs for their data type and shape.

`collate_fn` defines how samples are batched. This is a particular headache in object detection since each image has a varying number of objects: [Link](https://pytorch.org/docs/stable/data.html#dataloader-collate-fn)

In [None]:
from torch.utils.data import DataLoader

# ToTensor converts to a float tensor, Normalize(0.5, 1.0) then subtracts 0.5 (std 1.0 leaves the scale)
# NOTE(review): assumes ShapeDataset yields PIL/uint8 images so that ToTensor scales to [0, 1] — confirm
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0.5, 1.0)])


def collate_fn(batch):
    """Turn a list of (image, annotation) samples into one batch.

    The images are stacked into a single tensor along a new leading dimension.
    The annotations are returned as a tuple, one entry per sample, without
    being merged.
    """
    image_column, annotation_column = zip(*batch)
    stacked_images = torch.stack(image_column)
    return stacked_images, annotation_column


# Training data: 1000 samples with one shape each, passed through `transform`
dataset = shapes.ShapeDataset(
    transforms=transform,
    max_number_of_shapes_per_image=1,
    background="white",
    num_samples=1000,
    img_size=64,
)

dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

# Pull one batch to inspect what the model will have to process
images, annotations = next(iter(dataloader))

images.shape
annotations[:3]

**Question:** What does the output of `ShapeDataset` with a `DataLoader` look like? What are the difficulties?

### Defining the Architecture

We will now define an architecture consisting of three components:

- **Backbone**: A CNN for feature extraction  
- **Classification Head**: Models the classification task  
- **Bounding Box Regression Head**: Models the bounding box regression  

Create the backbone with 3 convolutional layers:  

- **Conv Layer 1**: 16 filters, stride 1  
- **Conv Layer 2**: 32 filters, stride 2  
- **Conv Layer 3**: 64 filters, stride 2  

Add layers for the heads accordingly.


In [None]:
class CNNBackbone(nn.Module):
    """
    CNN Backbone for feature extraction.

    Three 3x3 convolutions (16, 32 and 64 filters with strides 1, 2 and 2) are
    followed by a fully connected layer that maps the flattened activation map
    to ``output_features`` values.

    Args:
        input_shape (tuple[int, int]): The height and width of the input images.
        output_features (int): The number of features to output after the fully connected layer.
    """

    def __init__(self, input_shape: tuple[int, int] = (64, 64), output_features: int = 128):
        super().__init__()
        # Define convolutional layers
        ### BEGIN SOLUTION
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        ### END SOLUTION

        # Calculate the flattened feature size after convolutions.
        # Each strided convolution rounds its output size up, so a plain
        # `size // 4` is only right for sizes divisible by 4. Deriving the size
        # from the layers themselves keeps fc1 consistent for any input shape.
        height, width = input_shape
        for conv in (self.conv1, self.conv2, self.conv3):
            height = self._conv_output_size(height, conv, dim=0)
            width = self._conv_output_size(width, conv, dim=1)
        cnn_features = self.conv3.out_channels * height * width

        # Fully connected layer
        self.fc1 = nn.Linear(cnn_features, output_features)

    @staticmethod
    def _conv_output_size(size: int, conv: nn.Conv2d, dim: int) -> int:
        """Return the output length of ``conv`` along spatial dimension ``dim``."""
        effective_kernel = conv.dilation[dim] * (conv.kernel_size[dim] - 1) + 1
        return (size + 2 * conv.padding[dim] - effective_kernel) // conv.stride[dim] + 1

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the CNN backbone.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, height, width).

        Returns:
            torch.Tensor: Extracted feature tensor of shape (batch_size, output_features).
        """
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        # keep the batch dimension, flatten channels and spatial dimensions
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        return x


class ClassificationHead(nn.Module):
    """
    Linear head that turns backbone features into one logit per class.

    Args:
        num_classes (int): Number of classes for classification.
        num_input_features (int): Number of input features from the backbone.
    """

    def __init__(self, num_classes: int, num_input_features: int):
        super().__init__()
        ### BEGIN SOLUTION
        self.fc2_class = nn.Linear(in_features=num_input_features, out_features=num_classes)
        ### END SOLUTION

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the class logits for a batch of feature vectors.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_input_features).

        Returns:
            torch.Tensor: Class logits of shape (batch_size, num_classes).
        """
        return self.fc2_class(x)


class DetectionHead(nn.Module):
    """
    Linear head that regresses four bounding box values from backbone features.

    Args:
        num_input_features (int): Number of input features from the backbone.
    """

    def __init__(self, num_input_features: int):
        super().__init__()
        ### BEGIN SOLUTION
        self.fc2_bb = nn.Linear(in_features=num_input_features, out_features=4)
        ### END SOLUTION

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the bounding box outputs for a batch of feature vectors.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_input_features).

        Returns:
            torch.Tensor: Bounding box coordinates of shape (batch_size, 4).
        """
        return self.fc2_bb(x)


class ObjectClassificationAndLocalization(nn.Module):
    """
    Shared backbone with two heads: one for the class, one for the bounding box.

    Args:
        input_shape (tuple[int, int]): The height and width of the input images.
        num_features (int): Number of features output by the backbone.
        num_classes (int): Number of classes for classification.
    """

    def __init__(self, input_shape: tuple[int, int], num_features: int, num_classes: int):
        super().__init__()
        self.backbone = CNNBackbone(input_shape=input_shape, output_features=num_features)
        self.classification_head = ClassificationHead(
            num_classes=num_classes, num_input_features=num_features
        )
        self.detection_head = DetectionHead(num_input_features=num_features)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Extract features once and feed them to both heads.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, height, width).

        Returns:
            tuple[torch.Tensor, torch.Tensor]:
                - Class scores of shape (batch_size, num_classes).
                - Bounding box coordinates of shape (batch_size, 4).
        """
        features = self.backbone(x)
        return self.classification_head(features), self.detection_head(features)

### Initialize Model, Loss Function, and Optimizer

Initialize your model. Define loss functions (separate ones for classification and regression).

In [None]:
# Model for 64x64 inputs with 128 backbone features and 3 output classes
net = ObjectClassificationAndLocalization(input_shape=(64, 64), num_features=128, num_classes=3)

# define loss functions for the different heads
# loss_fn_class = nn.
# loss_fn_bbx = nn.
### BEGIN SOLUTION
# cross-entropy on the class logits, mean squared error on the box coordinates
loss_fn_class = nn.CrossEntropyLoss()
loss_fn_bbx = nn.MSELoss()
### END SOLUTION

# a single optimizer updates the backbone and both heads together
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

### Model Training

Let's train the model for 30 epochs.

In [None]:
num_epochs = 30

# The evaluation cells switch the model to eval mode; switch back so that
# re-running this cell always trains in training mode.
net = net.train()

for epoch in range(num_epochs):
    # per-epoch accumulators, averaged over the number of steps when printing
    running_loss = 0.0
    running_cls_loss = 0.0
    running_bbx_loss = 0.0
    for step, (images, annotations) in enumerate(dataloader):
        # every image holds exactly one object, so take entry 0 of each annotation
        labels_class = torch.tensor([sample["class"][0] for sample in annotations])
        labels_bb = torch.tensor([sample["box"][0] for sample in annotations])

        # scale bb labels by the image size (images.shape[2]) so targets are relative
        labels_bb_scaled = labels_bb / images.shape[2]

        # Forward pass
        class_scores, bb_coords = net(images)

        # compute classification and regression losses
        # loss_class = ...
        # loss_bb = ...
        ### BEGIN SOLUTION
        # Compute classification loss
        loss_class = loss_fn_class(class_scores, labels_class)

        # Compute bounding box regression loss
        loss_bb = loss_fn_bbx(bb_coords, labels_bb_scaled.float())
        ### END SOLUTION

        # Total loss (you can adjust weights for classification and regression losses)
        # total_loss = ...
        ### BEGIN SOLUTION
        total_loss = loss_class + 10 * loss_bb
        ### END SOLUTION

        # Backpropagation and optimization
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # print statistics
        running_cls_loss += loss_class.item()
        running_bbx_loss += loss_bb.item()
        # report the quantity that is actually optimised (including the box weight)
        running_loss += total_loss.item()

    print(
        f"epoch: {epoch + 1:02d} - step: {step + 1:5d} - total loss: {running_loss / (step+1):.3f} \
        Class Loss: {running_cls_loss  / (step+1):.4f} BB Loss: {running_bbx_loss / (step+1) :.4f}"
    )

**Question:** What do you notice when looking at the loss-values? What can you do?

### Model Evaluation

Now we evaluate the model (using simple means).

In [None]:
# Separate evaluation set: 256 generated samples, same pre-processing as for training
test_dataset = shapes.ShapeDataset(
    background="white",
    max_number_of_shapes_per_image=1,
    img_size=64,
    num_samples=256,
    transforms=transform,
)

# Wrap the *test* dataset (not the training `dataset`); one batch holds all 256 samples
test_dataloader = DataLoader(test_dataset, batch_size=256, collate_fn=collate_fn)

Let's take a look at the classification performance.

In [None]:
# Run the model once on the first batch of the test loader
test_images, test_labels = next(iter(test_dataloader))
net = net.eval()
with torch.no_grad():
    class_scores, bb_coords = net(test_images)

# Predicted class = index of the highest score; true class = first entry per annotation
y_class_pred = class_scores.argmax(dim=1).numpy()
y_class_true = np.array([sample["class"][0] for sample in test_labels])

# Print the first eleven prediction / ground-truth pairs
for i, (y_pred, y_true) in enumerate(zip(y_class_pred, y_class_true)):
    if i >= 11:
        break
    print(f"Predicted: {test_dataset.classes[y_pred]} True: {test_dataset.classes[y_true]}")


print(f"Accuracy: {sum(y_class_pred == y_class_true) / len(y_class_true)} ")

Now we visualize prediction and ground truth.

In [None]:
test_dataset = shapes.ShapeDataset(
    img_size=64, max_number_of_shapes_per_image=1, background="white", transforms=transform
)

test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

net = net.eval()

for i, (img, annotations) in enumerate(test_dataloader):

    # Predict
    with torch.no_grad():
        class_scores, bb_coords_scaled = net(img)
        # the model predicts relative coordinates; scale back to pixels
        bb_coords = bb_coords_scaled * img.shape[2]

    # Plot Ground Truth
    img = img.squeeze(0)

    # Undo Normalize(0.5, 1.0): shift back to [0, 1] before converting to uint8.
    # Multiplying the centred values directly would produce negative numbers,
    # which do not convert to valid uint8 pixel values.
    img = ((img + 0.5) * 255.0).clamp(0, 255).to(torch.uint8)
    boxes = torch.tensor(annotations[0]["box"])
    img_with_box = draw_bounding_boxes(image=img, boxes=boxes, colors="red")

    # Plot Prediction
    img_with_box = draw_bounding_boxes(image=img_with_box, boxes=bb_coords, colors="green")

    fig, ax = plt.subplots(figsize=(2, 2))
    # TF is the functional transforms module imported at the top of the notebook
    pil_image = TF.to_pil_image(img_with_box)
    _ = ax.imshow(pil_image)
    # Remove x and y axis ticks
    _ = ax.set_xticks([])
    _ = ax.set_yticks([])
    plt.show()
    if i > 5:
        break

**Question:** Qualitatively: How do you like your model with respect to classification and localization performance? Where could you improve?

The following code can be used to evaluate the detection performance by calculating IoU.

Complete the code. Use [torchvision.ops.box_iou](https://pytorch.org/vision/main/generated/torchvision.ops.box_iou.html).

In [None]:
test_dataset = shapes.ShapeDataset(
    img_size=64, max_number_of_shapes_per_image=1, background="white", transforms=transform
)

test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

net = net.eval()

for i, (img, annotations) in enumerate(test_dataloader):

    # Predict
    with torch.no_grad():
        class_scores, bb_coords_scaled = net(img)
        # the model predicts relative coordinates; scale back to pixels
        predicted_box = bb_coords_scaled * img.shape[2]

    # box_iou expects boxes of shape (N, 4)
    gt_box = torch.tensor(annotations[0]["box"]).reshape(1, -1)

    # iou = ...
    ### BEGIN SOLUTION
    iou = torchvision.ops.box_iou(gt_box, predicted_box).numpy().ravel()[0]
    ### END SOLUTION
    print(f"IOU: {iou:.2f}")

    # Plot Ground Truth
    img = img.squeeze(0)

    # Undo Normalize(0.5, 1.0): shift back to [0, 1] before converting to uint8.
    # Multiplying the centred values directly would produce negative numbers,
    # which do not convert to valid uint8 pixel values.
    img = ((img + 0.5) * 255.0).clamp(0, 255).to(torch.uint8)
    boxes = torch.tensor(annotations[0]["box"])
    img_with_box = draw_bounding_boxes(image=img, boxes=boxes, colors="red")

    # Plot Prediction
    img_with_box = draw_bounding_boxes(image=img_with_box, boxes=predicted_box, colors="green")

    fig, ax = plt.subplots(figsize=(2, 2))
    # TF is the functional transforms module imported at the top of the notebook
    pil_image = TF.to_pil_image(img_with_box)
    _ = ax.imshow(pil_image)
    # Remove x and y axis ticks
    _ = ax.set_xticks([])
    _ = ax.set_yticks([])
    plt.show()
    if i > 5:
        break

**Question:** Which IoUs do you find acceptable?

**Question:** What would you need to adjust to perform object detection (with multiple objects per image)?

## Pre-Trained _Faster R-CNN_

In this exercise we will use a pre-trained object detection model from the family of _Faster R-CNNs_.


### Data

Load the following images with `PIL.Image`. Inspect the images and estimate how well object detection algorithms might perform.

```
DATA_PATH.joinpath("dogs.jpg")
DATA_PATH.joinpath("ducks.jpg")
```

In [None]:
import gdown

# Google Drive file ids of the example images and the names they are stored under
files = [
    {"id": "18zuHwfojUUpmkrQttEtuaNW-MQ0QOoAH", "name": "ducks.jpg"},
    {"id": "1-UWVWqTpE80Qxh36hPuKkuQZj5BT3hXr", "name": "dogs.jpg"},
]

# Make sure the target directory exists before anything is downloaded into it
DATA_PATH.mkdir(parents=True, exist_ok=True)

for file in files:
    url = f"https://drive.google.com/uc?id={file['id']}"
    download_path = DATA_PATH / file["name"]
    # skip files that are already present from an earlier run
    if not download_path.exists():
        gdown.download(url, str(download_path), quiet=False)

In [None]:
# Open both example images from the data directory
img_dogs = Image.open(DATA_PATH / "dogs.jpg")
img_ducks = Image.open(DATA_PATH / "ducks.jpg")

# Render them in the notebook output, ducks first
display(img_ducks)
display(img_dogs)

### Load Model

Load a pre-trained model of the _Faster R-CNN_ family from [torchvision](https://pytorch.org/vision/stable/models.html#object-detection-instance-segmentation-and-person-keypoint-detection). For example, you could load `fasterrcnn_mobilenet_v3_large_320_fpn`, which is resource-efficient. If you want better performance you might choose a different one.

Initialize the model and put it into `eval` mode. Set `box_score_thresh` to a value between 0.5 and 0.9.

In [None]:
from torchvision.models.detection import (
    FasterRCNN_MobileNet_V3_Large_320_FPN_Weights,
    fasterrcnn_mobilenet_v3_large_320_fpn,
)

# Pre-trained weights and the matching model constructor
weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT
Model = fasterrcnn_mobilenet_v3_large_320_fpn

# Model = fasterrcnn_resnet50_fpn
# weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT

# Build the detector with box_score_thresh=0.8 and switch it to eval mode
model = Model(weights=weights, box_score_thresh=0.8).eval()

### Use Model

Use the function  `inference_single()` to create predictions for an image. Take a look at the following example: https://pytorch.org/vision/stable/models.html#object-detection-instance-segmentation-and-person-keypoint-detection

Create predictions for "dogs.jpg" and inspect the result.

In [None]:
def inference_single(img, model, preprocess):
    """Inference on a single image
    Args:
        img: (C, H, W) torch.tensor
        model: torchvision.models.detection.faster_rcnn.FasterRCNN
        preprocess: function to pre-process image batch for the model

    Returns:
        predictions: Dict with lists of object detections (the model output
            for the single image in the batch)
    """

    # the model works on batches: add a leading batch dimension of size 1
    image_batch = img.unsqueeze(0)
    image_processed = preprocess(image_batch)
    # pure inference: no autograd graph is needed for the predictions
    with torch.no_grad():
        return model(image_processed)[0]


# Convert the PIL image to a tensor before passing it to inference_single
img = TF.pil_to_tensor(img_dogs)
### BEGIN SOLUTION
# weights.transforms() supplies the pre-processing that belongs to the chosen weights
predictions = inference_single(img, model, weights.transforms())
predictions
### END SOLUTION

Visualize the predictions using [torchvision.utils.draw_bounding_boxes](https://pytorch.org/vision/stable/generated/torchvision.utils.draw_bounding_boxes.html).

Visualize the labels and the confidence scores of the predictions together with the bounding boxes.

The labels are in `weights.meta["categories"]`

In [None]:
from torchvision.utils import draw_bounding_boxes


def draw_boxes(image, predictions, categories):
    """Draw Boxes from Predictions
    Args:
        image: The input image torch.tensor
        predictions: Output of inference_single(); the keys "boxes", "labels"
            and "scores" are read
        categories: List of category labels, indexed by the predicted label id
    Returns:
        PIL.Image
    """
    # one caption per box: category name plus confidence in percent
    labels = [
        f"{categories[i]} ({s * 100:.2f} %)"
        for i, s in zip(predictions["labels"].tolist(), predictions["scores"].tolist())
    ]

    box = draw_bounding_boxes(
        image, boxes=predictions["boxes"], labels=labels, width=5, colors="red"
    )
    img = box.detach()
    # TF is the functional transforms module imported at the top of the notebook
    return TF.to_pil_image(img)


# Draw the detections using the category names stored with the weights
img_with_box = draw_boxes(img, predictions, weights.meta["categories"])
img_with_box
# im.save(DATA_PATH.joinpath("ducks_with_box.png"))

Initialize the model again but with a lower value for [box_score_thresh](https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py). 

Create predictions for "ducks.jpg".

Visualize the boxes.


In [None]:
from torchvision.ops import box_iou

# Re-create the detector with a lower score threshold so that more boxes are kept
model = Model(weights=weights, box_score_thresh=0.3)
model = model.eval()

# TF is the functional transforms module imported at the top of the notebook
img = TF.pil_to_tensor(img_ducks)
predictions = inference_single(img, model, weights.transforms())

img_with_boxes = draw_boxes(img, predictions, weights.meta["categories"])
display(img_with_boxes)

**Question**: What happens with different values of `box_score_thresh`?

Calculate IoU for the detected boxes. Use the following function: [torchvision.ops.box_iou](https://pytorch.org/vision/stable/generated/torchvision.ops.box_iou.html#torchvision.ops.box_iou)

In [None]:
# IoU of every detected box with every detected box (including itself)
ious = box_iou(predictions["boxes"], predictions["boxes"])

fig, ax = plt.subplots(figsize=(6, 6))
iou_matrix = ious.detach().numpy()
_ = sns.heatmap(
    iou_matrix,
    cbar=False,
    annot=True,
    annot_kws={"size": 8},
    fmt=".2f",
).set(title="IoUs")

Now let’s take a look at the activation maps from the backbone CNN that are passed into the RPN. Use the following function to inspect the shape of the activation map.

Examine the activation maps of both example images. What do you observe?

In [None]:
def backbone(img, model, preprocess):
    """Get Features from the Backbone Network
    Args:
        img: (C, H, W) torch.tensor
        model: torchvision.models.detection.faster_rcnn.FasterRCNN
        preprocess: function to pre-process image batch for the model

    Returns:
        features: the backbone output stored under key "0" for the
            single-image batch
    """
    image_batch = img.unsqueeze(0)
    image_processed = preprocess(image_batch)
    with torch.no_grad():
        # Follow the detector's own forward pass: the backbone is fed the output
        # of model.transform, so these are the activations the RPN receives.
        images, _ = model.transform(image_processed, targets=None)
        features = model.backbone(images.tensors)
    return features["0"]


model = Model(weights=weights, box_score_thresh=0.5)
model = model.eval()

# TF is the functional transforms module imported at the top of the notebook
img = TF.pil_to_tensor(img_dogs)
backbone_features = backbone(img, model, weights.transforms())

# shape of the activation map
backbone_features.shape

Now let’s take a look at the output of the RPN. Compare the two images again.

Set the model parameters: `rpn_score_thresh` and `rpn_post_nms_top_n_test`, and experiment with different values.

In [None]:
def rpn(img, model, preprocess):
    """Get Region Proposals
    Args:
        img: (C, H, W) torch.tensor
        model: torchvision.models.detection.faster_rcnn.FasterRCNN
        preprocess: function to pre-process image batch for the model

    Returns:
        proposals: output of model.transform.postprocess for the single
            image, i.e. a list with one dict whose "boxes" entry holds the
            region proposals mapped back to the original image size
    """
    image_batch = img.unsqueeze(0)
    image_processed = preprocess(image_batch)

    # (height, width) of every input image; needed to map boxes back at the end
    original_image_sizes: list[tuple[int, int]] = [
        (image.shape[-2], image.shape[-1]) for image in image_batch
    ]

    with torch.no_grad():
        images, _ = model.transform(image_processed, targets=None)
        # The backbone must see the transformed batch: `images` (used by the RPN
        # below) describes the transformed images, and the features have to
        # belong to exactly those images.
        features = model.backbone(images.tensors)
        proposals, _ = model.rpn(images, features, targets=None)

        proposals = model.transform.postprocess(
            [{"boxes": proposals[0]}], images.image_sizes, original_image_sizes
        )

    return proposals


def draw_proposals(image, proposals):
    """Draw region proposals onto an image
    Args:
        image: The input image torch.tensor
        proposals: Boxes to draw, passed on to draw_bounding_boxes
    Returns:
        PIL.Image
    """
    annotated = draw_bounding_boxes(image, boxes=proposals, width=5, colors="red").detach()
    # move the channel axis to the end before handing the array to PIL
    channels_last = annotated.permute(1, 2, 0)
    return Image.fromarray(channels_last.numpy())


# Detector with explicit RPN settings, switched to eval mode
model = Model(weights=weights, rpn_post_nms_top_n_test=200, rpn_score_thresh=0.5).eval()

# PIL image -> array -> tensor with the channel axis moved to the front
image_torch = torch.tensor(np.array(img_ducks)).permute(2, 0, 1)
region_proposals = rpn(image_torch, model, weights.transforms())
im = draw_proposals(image_torch, region_proposals[0]["boxes"])
im