## TORCHVISION OBJECT DETECTION FINETUNING TUTORIAL

### Finetuning a pre-trained Mask R-CNN model on the Penn-Fudan Database for Pedestrian Detection and Segmentation

#### Defining the Dataset

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from PIL import Image

In [2]:
class PennFudanDataset(Dataset):
    """Penn-Fudan pedestrian images paired with their instance masks.

    File names are listed from ``<root>/PNGImages`` and ``<root>/PedMasks``.
    Both listings are sorted by name, and the i-th image is paired with the
    i-th mask purely by position.
    """

    def __init__(self, root, transforms):
        """Index the image and mask files found under ``root``.

        Args:
            root: Directory that contains the ``PNGImages`` and ``PedMasks``
                sub-folders.
            transforms: Callable invoked as ``transforms(img, target)`` on
                every sample, or ``None`` to return samples untouched.
        """
        self.root = root
        self.transforms = transforms
        image_dir = os.path.join(root, "PNGImages")
        mask_dir = os.path.join(root, "PedMasks")
        # Sorting both listings keeps images and masks aligned by index.
        self.imgs = sorted(os.listdir(image_dir))
        self.masks = sorted(os.listdir(mask_dir))

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its detection/segmentation target.

        Args:
            idx: Position of the sample in the sorted file listings.

        Returns:
            A tuple ``(img, target)``. ``img`` is the tensor decoded by
            ``read_image``; ``target`` is a dict with the keys ``boxes``,
            ``labels``, ``masks``, ``image_id``, ``area`` and ``iscrowd``.
            When ``self.transforms`` is set, both values are first passed
            through it.
        """
        image_file = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_file = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = read_image(image_file)
        label_map = read_image(mask_file)

        # Every distinct pixel value is one id; the first id returned by
        # torch.unique is skipped (it is taken to be the background).
        instance_ids = torch.unique(label_map)[1:]
        n_instances = instance_ids.shape[0]

        # Comparing the mask with ids shaped (N, 1, 1) broadcasts to one
        # binary plane per instance.
        instance_masks = (label_map == instance_ids[:, None, None]).to(torch.uint8)
        boxes = masks_to_boxes(instance_masks)

        # Columns 0..3 of each box, split out to compute height * width.
        col0, col1, col2, col3 = boxes.unbind(dim=1)

        target = {
            "boxes": boxes,
            # Every instance gets label 1.
            "labels": torch.ones((n_instances,), dtype=torch.int64),
            "masks": instance_masks,
            "image_id": torch.as_tensor(idx),
            "area": (col3 - col1) * (col2 - col0),
            # No instance is flagged as a crowd.
            "iscrowd": torch.zeros((n_instances,), dtype=torch.int64),
        }

        if self.transforms is None:
            return img, target

        img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        """Return the number of image files indexed at construction time."""
        return len(self.imgs)

In [3]:
# Paths to one sample image and its mask file, relative to the notebook's
# working directory; used below to explore what the dataset class does.
image_path = './PennFudanPed/PennFudanPed/PNGImages/FudanPed00001.png'
mask_path = './PennFudanPed/PennFudanPed/PedMasks/FudanPed00001_mask.png'

In [4]:
# Decode both PNG files into tensors with torchvision.io.read_image.
image = read_image(image_path)
mask = read_image(mask_path)

In [5]:
# Bare expression so the notebook displays the decoded image tensor.
image

tensor([[[211, 210, 212,  ..., 143, 146, 148],
         [179, 173, 170,  ..., 126, 128, 128],
         [204, 194, 188,  ..., 130, 131, 130],
         ...,
         [226, 217, 211,  ..., 183, 184, 185],
         [231, 219, 210,  ..., 187, 187, 186],
         [225, 227, 215,  ..., 190, 190, 187]],

        [[200, 199, 201,  ...,  96,  99, 101],
         [168, 162, 159,  ...,  79,  81,  81],
         [193, 183, 177,  ...,  83,  84,  83],
         ...,
         [220, 211, 205,  ..., 183, 184, 185],
         [225, 213, 204,  ..., 187, 187, 186],
         [219, 221, 209,  ..., 190, 190, 187]],

        [[182, 181, 183,  ...,  78,  81,  83],
         [150, 144, 141,  ...,  61,  63,  63],
         [175, 165, 159,  ...,  65,  66,  65],
         ...,
         [220, 211, 205,  ..., 183, 184, 185],
         [225, 213, 204,  ..., 187, 187, 186],
         [219, 221, 209,  ..., 190, 190, 187]]], dtype=torch.uint8)

In [20]:
# Bare expression so the notebook displays the decoded mask tensor.
mask

tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8)

In [28]:
# Shape check: the mask is decoded with a leading dimension of size 1.
mask.size()

torch.Size([1, 536, 559])

In [23]:
# Distinct pixel values present in the mask.
# NOTE(review): unlike PennFudanDataset.__getitem__, the first id is NOT
# dropped here, so the value 0 (presumably the background) stays in obj_ids
# and is carried into the masks and boxes computed in the cells below.
obj_ids = torch.unique(mask)

In [26]:
# Add two trailing singleton dimensions so each id can be broadcast against
# the mask in the comparison that follows.
obj_ids[:, None, None]

tensor([[[0]],

        [[1]],

        [[2]]], dtype=torch.uint8)

In [27]:
# Broadcast comparison: yields one boolean plane per entry of obj_ids, True
# where the mask pixel equals that id.
mask == obj_ids[:, None, None]

tensor([[[ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [

In [31]:
# Same comparison as above, converted from bool to uint8 (0/1) planes.
masks = (mask == obj_ids[:, None, None]).to(dtype = torch.uint8)

In [35]:
# Bare expression so the notebook displays the stacked per-id masks.
masks

tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8)

In [36]:
# One bounding box per mask plane, computed by torchvision's masks_to_boxes.
boxes = masks_to_boxes(masks)

In [37]:
# Bare expression so the notebook displays the boxes. The first row comes from
# the id-0 plane, because obj_ids was not trimmed in this exploration.
boxes

tensor([[  0.,   0., 558., 535.],
        [159., 181., 301., 430.],
        [419., 170., 534., 485.]])

In [41]:
# Dataset root: the directory expected to contain PNGImages/ and PedMasks/.
root = './PennFudanPed/PennFudanPed/'

In [43]:
# Build the dataset without transforms.
# NOTE(review): this rebinds the name `PennFudanDataset` from the class to an
# instance. After this cell runs, re-running it or calling
# `PennFudanDataset(...)` anywhere later fails because the instance is not
# callable, until the class-definition cell is executed again. Consider
# `dataset = PennFudanDataset(root, None)` and renaming the uses that follow.
PennFudanDataset = PennFudanDataset(root,None)

In [45]:
# Sorted mask file names held by the dataset instance (at this point the name
# `PennFudanDataset` refers to the instance created above, not the class).
PennFudanDataset.masks

['FudanPed00001_mask.png',
 'FudanPed00002_mask.png',
 'FudanPed00003_mask.png',
 'FudanPed00004_mask.png',
 'FudanPed00005_mask.png',
 'FudanPed00006_mask.png',
 'FudanPed00007_mask.png',
 'FudanPed00008_mask.png',
 'FudanPed00009_mask.png',
 'FudanPed00010_mask.png',
 'FudanPed00011_mask.png',
 'FudanPed00012_mask.png',
 'FudanPed00013_mask.png',
 'FudanPed00014_mask.png',
 'FudanPed00015_mask.png',
 'FudanPed00016_mask.png',
 'FudanPed00017_mask.png',
 'FudanPed00018_mask.png',
 'FudanPed00019_mask.png',
 'FudanPed00020_mask.png',
 'FudanPed00021_mask.png',
 'FudanPed00022_mask.png',
 'FudanPed00023_mask.png',
 'FudanPed00024_mask.png',
 'FudanPed00025_mask.png',
 'FudanPed00026_mask.png',
 'FudanPed00027_mask.png',
 'FudanPed00028_mask.png',
 'FudanPed00029_mask.png',
 'FudanPed00030_mask.png',
 'FudanPed00031_mask.png',
 'FudanPed00032_mask.png',
 'FudanPed00033_mask.png',
 'FudanPed00034_mask.png',
 'FudanPed00035_mask.png',
 'FudanPed00036_mask.png',
 'FudanPed00037_mask.png',
 

In [46]:
# Number of samples, i.e. the number of indexed image files.
len(PennFudanDataset)

170

In [48]:
# Fetch the first (image, target) pair through normal indexing; here the name
# `PennFudanDataset` is bound to the dataset instance created earlier.
PennFudanDataset[0]

(tensor([[[211, 210, 212,  ..., 143, 146, 148],
          [179, 173, 170,  ..., 126, 128, 128],
          [204, 194, 188,  ..., 130, 131, 130],
          ...,
          [226, 217, 211,  ..., 183, 184, 185],
          [231, 219, 210,  ..., 187, 187, 186],
          [225, 227, 215,  ..., 190, 190, 187]],
 
         [[200, 199, 201,  ...,  96,  99, 101],
          [168, 162, 159,  ...,  79,  81,  81],
          [193, 183, 177,  ...,  83,  84,  83],
          ...,
          [220, 211, 205,  ..., 183, 184, 185],
          [225, 213, 204,  ..., 187, 187, 186],
          [219, 221, 209,  ..., 190, 190, 187]],
 
         [[182, 181, 183,  ...,  78,  81,  83],
          [150, 144, 141,  ...,  61,  63,  63],
          [175, 165, 159,  ...,  65,  66,  65],
          ...,
          [220, 211, 205,  ..., 183, 184, 185],
          [225, 213, 204,  ..., 187, 187, 186],
          [219, 221, 209,  ..., 190, 190, 187]]], dtype=torch.uint8),
 {'boxes': tensor([[159., 181., 301., 430.],
          [419., 1

#### Defining model

##### There are two common situations where one might want to modify one of the available models in TorchVision Model Zoo. The first is when we want to start from a pre-trained model, and just finetune the last layer. The other is when we want to replace the backbone of the model with a different one (for faster predictions, for example).
##### 也就是有两种情况，一个是我们使用预训练的模型，只是微调最后一层分类的类别；另外一个是我们需要更换骨干网络（比如，为了更快的预测）

##### 1. Finetuning from a pretrained model

In [7]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [8]:
# Faster R-CNN with a ResNet-50 FPN backbone, created with weights="DEFAULT"
# (the checkpoint is downloaded on first use, as the log below shows).
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to C:\Users\59585/.cache\torch\hub\checkpoints\fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100.0%


In [9]:
# Number of output classes for the replacement head.
# NOTE(review): presumably background + pedestrian — confirm.
num_classes = 2
# Input width of the current classification layer, reused so the new
# predictor plugs into the existing RoI head.
in_features = model.roi_heads.box_predictor.cls_score.in_features

In [11]:
# Replace the loaded box predictor with a fresh FastRCNNPredictor sized for
# num_classes outputs.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features,num_classes)

##### 2. Modifying the model to add a different backbone

In [12]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

In [13]:
# Keep only the `.features` sub-module of a MobileNetV2 created with
# weights="DEFAULT"; it serves as the detection backbone below.
backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features

Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to C:\Users\59585/.cache\torch\hub\checkpoints\mobilenet_v2-7ebf99e0.pth
100.0%


In [14]:
# Record the backbone's output channel count as an attribute on the module.
# NOTE(review): presumably FasterRCNN reads `out_channels`, and 1280 is the
# width of MobileNetV2's final feature map — confirm against torchvision docs.
backbone.out_channels = 1280

In [16]:
# Anchor configuration for the RPN: a single group (one inner tuple) with
# 5 sizes and 3 aspect ratios.
# NOTE(review): presumably one inner tuple per backbone feature map, giving
# 5 x 3 anchors per spatial location — confirm.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

In [17]:
# RoI pooling over the feature map named '0', producing output_size=7 crops
# with sampling_ratio=2.
# NOTE(review): '0' is presumably the name torchvision gives the single
# feature map when the backbone returns a plain Tensor — confirm.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=7,
    sampling_ratio=2,
)

In [18]:
# Assemble a Faster R-CNN from the MobileNetV2 backbone, the custom anchor
# generator and the RoI pooler defined above.
# NOTE: this rebinds `model`, so the fine-tuned ResNet-50 FPN model built in
# section 1 is no longer reachable under that name.
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

#### Object detection and instance segmentation model for PennFudan Dataset