# TorchVision Object Detection Finetuning Tutorial

## 1) Initial Installs

In [4]:
import os

os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")

0

## 2) Imports

In [5]:
import torch
import utils

import zipfile
import torchvision
import matplotlib.pyplot as plt

from engine import train_one_epoch, evaluate

from torchvision import tv_tensors
from torchvision.io import read_image
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
from torchvision.transforms import v2 as T
from torchvision.ops.boxes import masks_to_boxes
from torchvision.transforms.v2 import functional as F
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

## 3) Defining the Dataset

We will be importing the Penn-Fudan Database for Pedestrian Detection and Segmentation.

It contains 170 images with 345 instances of pedestrians. The file structure of the dataset is as follows:

```
- PennFudanPed/
  - PedMasks/
    - FudanPed00001_mask.png
    - ...
  - PNGImages/
    - FudanPed00001.png
    - ...

```

We will create a new torchvision dataset to compile the masks and images of pedestrians together into feature vectors with an associated label.

The `torch.utils.data.Dataset` class should implement the `__len__` and `__getitem__` methods. The only requirement is that `__getitem__` returns a tuple of (image, target)

- image: `torchvision.tv_tensors.Image` of shape [3, H, W], a pure tensor, or a PIL Image of size (H, W)
- target: a dict containing
  - **boxes**, `torchvision.tv_tensors.BoundingBoxes` of shape **[N, 4]**, the coordinates of the **N** bounding boxes in **[x0, y0, x1, y1]** format, randing from **0** to **W**, and **0** to **H**
  - **labels**, integer `torch.Tensor` of shape **[N]**: the label for each bounding box. **0** represents always the background class.
  - **image_id**, int: an image identifier
  - **area**, float `torch.Tensor` of shape **[N]**: the area of the bounding box. This is used during evaluation with the COCO metric, to separate the metric scores between small, medium and large boxes.
  - **iscrowd**, uint8 `torch.Tensor` of shape **[N]**: instances with `iscrowd=True` will be ignored suring evaluation
  - (optionally) **masks**, `torchvision.tv_tensors.Mask of shape **[N, H, W]**: the segmentation masks for each of the objects

One note on the **labels**, the model considers **0** as background. If your dataset does not contain the background class, you should not have **0** in your **labels**. For example, if you have two classes, they should be represented as **1** or **2**.

In [6]:
!wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip

--2023-12-03 00:58:39--  https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip
Resolving www.cis.upenn.edu (www.cis.upenn.edu)... 158.130.69.163, 2607:f470:8:64:5ea5::d
Connecting to www.cis.upenn.edu (www.cis.upenn.edu)|158.130.69.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53723336 (51M) [application/zip]
Saving to: ‘PennFudanPed.zip.1’


2023-12-03 00:58:47 (6.90 MB/s) - ‘PennFudanPed.zip.1’ saved [53723336/53723336]



In [7]:
zip_ref = zipfile.ZipFile("PennFudanPed.zip", "r")
zip_ref.extractall()

zip_ref.close()

In [8]:
class PennFudanDataset(torch.utils.data.Dataset):
  def __init__(self, root, transforms):
    self.root = root
    self.transforms = transforms

    # load all image files, sorting them to ensure that they are aligned
    # so image1.png is aligned with mask1.png
    self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
    self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

  def __getitem__(self, idx):
    # load images and masks
    img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
    mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
    img = read_image(img_path)
    mask = read_image(mask_path)

    # instances are encoded as different colours
    obj_ids = torch.unique(mask)

    # first id is the background, so remove it
    obj_ids = obj_ids[1:]
    num_objs = len(obj_ids)

    # split the colour-encoded mask into a set of binary masks
    masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)

    # get bounding box coordinates for each mask
    boxes = masks_to_boxes(masks)

    # there is only one class
    labels = torch.ones((num_objs,), dtype=torch.int64)

    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

    # suppose all instances are not crowd
    iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

    # wrap sample and targets into torchvision tv_tensors:
    img = tv_tensors.Image(img)

    target = {}
    target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img))
    target["masks"] = tv_tensors.Mask(masks)
    target["labels"] = labels
    target["image_id"] = idx
    target["area"] = area
    target["iscrowd"] = iscrowd

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target

  def __len__(self):
    return len(self.imgs)


## 4) Defining your Model

We will be using Mask R-CNN, which is based on top of Faster R-CNN. Faster R-CNN is a model that predicts both bounding boxes and class scores for potential objects in the image.

Mask R-CNN adds an extra branch into Faster R-CNN, which also predicts segmentation masks for each instance.

### 4 a) Object Detection and Instance Segmentation model for PennFudan Dataset

There are two common situation one ight want to modify one of the available models in TorchVision Model Zoo. The first is when we want to start from a pre-trained model, and just finetune the last layer. The other is when we want to replace the backbone of the model with a different one (i.e. for faster predictions).

In our case, we want to finetune from a pre-trained model, given that our dataset is very small. Here we want to also compute the instance segmentation masks, so we will be using Mask R-CNN:



In [9]:
def get_model_instance_segmentation(num_classes):
  # load an instance segmentation model pre-trained on COCO
  model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

  # get number of input features for the classifier
  in_features = model.roi_heads.box_predictor.cls_score.in_features

  # replace the pre-trained head with a new one
  model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

  # now get the number of input features for the mask classifier
  in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
  hidden_layer = 256

  # and replace the mask predictor with a new one
  model.roi_heads.mask_predictor = MaskRCNNPredictor(
      in_features_mask,
      hidden_layer,
      num_classes,
  )

  return model

### 4 b) Helper Functions for Data Augmentation / Transformation

In [10]:
def get_transform(train):
  transformations = []
  if train:
    transformations.append(T.RandomHorizontalFlip(0.5))

  transformations.append(T.ToDtype(torch.float, scale=True))
  transformations.append(T.ToPureTensor())

  return T.Compose(transformations)

### 4 c) Testing `forward()` method

In [11]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn
)

# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
print(output)

# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  # Returns predictions
print(predictions[0])

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:02<00:00, 73.3MB/s]


{'loss_classifier': tensor(0.2052, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.0504, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0137, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0037, grad_fn=<DivBackward0>)}
{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>)}


### 4 d) Main Code for Training and Validation

In [16]:
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has two classes only - background and person
num_classes = 2

# use our dataset and defined transformations
dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

# split the dataset in train and test set
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=utils.collate_fn
)

# get the model using our helper function
model = get_model_instance_segmentation(num_classes)

# move model to the right device
model.to(device)

# # Freeze all layers of the pre-trained model
# for param in model.parameters():
#     param.requires_grad = False

# # Unfreeze the new predictors for training
# for param in model.roi_heads.box_predictor.parameters():
#     param.requires_grad = True
# for param in model.roi_heads.mask_predictor.parameters():
#     param.requires_grad = True

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it for 5 epochs
num_epochs = 5

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

print("That's it!")

Epoch: [0]  [ 0/60]  eta: 0:01:06  lr: 0.000090  loss: 5.1475 (5.1475)  loss_classifier: 0.8084 (0.8084)  loss_box_reg: 0.5890 (0.5890)  loss_mask: 3.7111 (3.7111)  loss_objectness: 0.0354 (0.0354)  loss_rpn_box_reg: 0.0036 (0.0036)  time: 1.1109  data: 0.3266  max mem: 2342
Epoch: [0]  [10/60]  eta: 0:00:30  lr: 0.000936  loss: 1.6747 (2.6721)  loss_classifier: 0.4984 (0.4828)  loss_box_reg: 0.3621 (0.3583)  loss_mask: 0.9283 (1.8068)  loss_objectness: 0.0184 (0.0190)  loss_rpn_box_reg: 0.0036 (0.0051)  time: 0.6120  data: 0.0391  max mem: 3031
Epoch: [0]  [20/60]  eta: 0:00:23  lr: 0.001783  loss: 0.9949 (1.7501)  loss_classifier: 0.2499 (0.3298)  loss_box_reg: 0.2410 (0.3021)  loss_mask: 0.3865 (1.0906)  loss_objectness: 0.0169 (0.0200)  loss_rpn_box_reg: 0.0053 (0.0076)  time: 0.5638  data: 0.0093  max mem: 3031
Epoch: [0]  [30/60]  eta: 0:00:17  lr: 0.002629  loss: 0.5846 (1.3581)  loss_classifier: 0.1010 (0.2579)  loss_box_reg: 0.1573 (0.2717)  loss_mask: 0.2435 (0.8054)  loss_ob

In [None]:
image = read_image("tv_image05.png")
eval_transform = get_transform(train=False)

model.eval()
with torch.no_grad():
    x = eval_transform(image)
    # convert RGBA -> RGB and move to device
    x = x[:3, ...].to(device)
    predictions = model([x, ])
    pred = predictions[0]

image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8)
image = image[:3, ...]
pred_labels = [f"pedestrian: {score:.3f}" for label, score in zip(pred["labels"], pred["scores"])]
pred_boxes = pred["boxes"].long()
output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")

masks = (pred["masks"] > 0.7).squeeze(1)
output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue")

plt.figure(figsize=(12, 12))
plt.imshow(output_image.permute(1, 2, 0))