<a href="https://colab.research.google.com/github/ioannis-toumpoglou/pytorch-repo/blob/main/pytorch_custom_model_coco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# source: https://medium.com/fullstackai/how-to-train-an-object-detector-with-your-own-coco-dataset-in-pytorch-319e7090da5

import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO


class CocoDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation)
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, index):
        # Own coco file
        coco = self.coco
        # Image ID
        img_id = self.ids[index]
        # List: get annotation id from coco
        ann_ids = coco.getAnnIds(imgIds=img_id)
        # Dictionary: target coco_annotation file for an image
        coco_annotation = coco.loadAnns(ann_ids)
        # path for input image
        path = coco.loadImgs(img_id)[0]['file_name']
        # open the input image
        img = Image.open(os.path.join(self.root, path)).convert('RGB')

        # number of objects in the image
        num_objs = len(coco_annotation)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = []
        for i in range(num_objs):
            xmin = coco_annotation[i]['bbox'][0]
            ymin = coco_annotation[i]['bbox'][1]
            xmax = xmin + coco_annotation[i]['bbox'][2]
            ymax = ymin + coco_annotation[i]['bbox'][3]
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # Labels (In my case, I only one class: target class or background)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        # Tensorise img_id
        img_id = torch.tensor([img_id])
        # Size of bbox (Rectangular)
        areas = []
        for i in range(num_objs):
            areas.append(coco_annotation[i]['area'])
        areas = torch.as_tensor(areas, dtype=torch.float32)
        # Iscrowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        # Annotation is in dictionary format
        my_annotation = {}
        my_annotation["boxes"] = boxes
        my_annotation["labels"] = labels
        my_annotation["image_id"] = img_id
        my_annotation["area"] = areas
        my_annotation["iscrowd"] = iscrowd

        if self.transforms is not None:
            img = self.transforms(img)

        return img, my_annotation

    def __len__(self):
        return len(self.ids)

In [21]:
# The inputs for a PyTorch model must be in tensor format
def get_transform():
    custom_transforms = []
    custom_transforms.append(torchvision.transforms.ToTensor())
    return torchvision.transforms.Compose(custom_transforms)

In [23]:
# Create data directory
from pathlib import Path


data_path = Path('data/')
image_path = data_path / 'images'

if image_path.is_dir():
  print(f'[INFO] {image_path} already exists, skipping download...')
else:
  print(f'[INFO] Unable to find {image_path}, creating one...')
  image_path.mkdir(parents=True, exist_ok=True)

[INFO] data/images already exists, skipping download...


In [14]:
# path to data and coco file
train_data_dir = 'data/'
train_coco = 'data/train_coco.json'

# create own Dataset
my_dataset = CocoDataset(root=train_data_dir,
                         annotation=train_coco,
                         transforms=get_transform()
                         )

# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))

# Batch size
train_batch_size = 8
num_workers = os.cpu_count()

# own DataLoader
data_loader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=train_batch_size,
                                          shuffle=True,
                                          num_workers=num_workers,
                                          collate_fn=collate_fn)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [15]:
# select device (whether GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# DataLoader is iterable over Dataset
for imgs, annotations in data_loader:
    imgs = list(img.to(device) for img in imgs)
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    print(annotations)

[{'boxes': tensor([[ 24.1973,  30.6499, 187.1257, 101.6286],
        [ 24.1973, 132.2785, 187.1257, 203.2572],
        [ 25.0039, 230.6808, 187.1257, 302.4662],
        [ 24.1973, 329.8897, 187.1257, 401.6750],
        [225.8414, 330.6963, 387.9633, 401.6750],
        [224.2282, 230.6808, 387.9633, 302.4662],
        [225.0348, 129.8588, 387.9633, 204.0638],
        [225.0348,  31.4565, 387.9633, 102.4352]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'image_id': tensor([15]), 'area': tensor([11564.4521, 11564.4521, 11637.9658, 11695.8662, 11507.2021, 11753.7666,
        12090.1094, 11564.4521]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0])}, {'boxes': tensor([[   7.4275,   44.5648, 1275.0483, 1242.8627]]), 'labels': tensor([1]), 'image_id': tensor([8]), 'area': tensor([1518987.3750]), 'iscrowd': tensor([0])}, {'boxes': tensor([[  14.8549,   24.7582, 1267.6208, 1242.8627]]), 'labels': tensor([1]), 'image_id': tensor([10]), 'area': tensor([1525999.7500]), 'iscrowd': tensor([0])}, {'boxe

In [16]:
# # Run the model
# from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2

# def get_model_instance_segmentation(num_classes):
#     # load an instance segmentation model pre-trained on COCO
#     model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(pretrained=True)
#     # get number of input features for the classifier
#     in_features = model.roi_heads.box_predictor.cls_score.in_features
#     # replace the pre-trained head with a new one
#     model.roi_heads.box_predictor = fasterrcnn_resnet50_fpn_v2(in_features, num_classes)

#     return model


# # 2 classes; Only target class or background
# num_classes = 1
# num_epochs = 10
# model = get_model_instance_segmentation(num_classes)

# # move model to the right device
# model.to(device)

# # parameters
# params = [p for p in model.parameters() if p.requires_grad]
# optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, weight_decay=0.0005)

# len_dataloader = len(data_loader)

# for epoch in range(num_epochs):
#     model.train()
#     i = 0
#     for imgs, annotations in data_loader:
#         i += 1
#         imgs = list(img.to(device) for img in imgs)
#         annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
#         loss_dict = model(imgs, annotations)
#         losses = sum(loss for loss in loss_dict.values())

#         optimizer.zero_grad()
#         losses.backward()
#         optimizer.step()

#         print(f'Iteration: {i}/{len_dataloader}, Loss: {losses}')

In [19]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_V2_Weights

# Download the pretrained weights for FasterRCNN_ResNet50_FPN_V2
weights = data_loader
print(weights)

# Setup the model with the pretrained weights and send it to the target device
object_detection_model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
object_detection_model

<torch.utils.data.dataloader.DataLoader object at 0x7f5e9c705fc0>


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       

In [20]:
object_detection_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       