In [1]:
import torch
import torchvision
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.datasets import CocoDetection

import utils
from coco_utils import get_city
import transforms

# Instantiate Mask R-CNN (ResNet-50 + FPN backbone) with weights pre-trained on
# COCO and switch it to inference (eval) mode.

print('Loading pretrained model...')
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True).eval()

# Build the train and val datasets through get_city from ./coco_utils.py rather
# than torchvision's own CocoDetection class. The transforms likewise come from
# the local ./transforms module, not torchvision.transforms, because they need
# to transform the bboxes and masks along with the image.

# coco_path = "./COCO"

preprocess = transforms.Compose([transforms.ToTensor()])
print('Loading COCO train, val datasets...')
coco_train_dataset, coco_val_dataset = (
    get_city(split, preprocess) for split in ('train', 'val')
)

def collate_fn(batch):
    """Transpose a batch of samples into one tuple per field.

    A batch such as [(img0, tgt0), (img1, tgt1)] is returned as
    ((img0, img1), (tgt0, tgt1)); the items themselves are left untouched
    (nothing is stacked), and an empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Both loaders yield batches of two samples assembled by collate_fn; only the
# training loader shuffles its samples.
train_dataloader = torch.utils.data.DataLoader(
    coco_train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)
val_dataloader = torch.utils.data.DataLoader(
    coco_val_dataset,
    batch_size=2,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)

Loading pretrained model...
Loading COCO train, val datasets...
loading annotations into memory...
Done (t=3.64s)
creating index...
index created!
loading annotations into memory...
Done (t=0.63s)
creating index...
index created!


In [2]:
# Prefer the second GPU (index 1) as before. When CUDA is available but only a
# single GPU is visible, "cuda:1" is an invalid device ordinal, so fall back to
# index 0 in that case. Without CUDA, run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda", 1 if torch.cuda.device_count() > 1 else 0)
else:
    device = torch.device("cpu")
print(device)
model.to(device)

cuda:1


MaskRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
    

In [11]:
# images, targets = next(iter(train_dataloader))

# # print(type(images), len(images))
# # print(images[0].shape)
# print(type(targets), len(targets))
# print(targets[0].keys())
# # # print()
# # # images = images.to(device)

# # import numpy as np
# images = [img.to(device) for img in images]
# print(len(images))
# # images.to(device)
# predictions = model(images)

# print('Prediction keys:', list(dict(predictions[0])))
# print('Boxes shape:', predictions[0]['boxes'].shape)
# print('Labels shape:', predictions[0]['labels'].shape)
# print('Scores shape:', predictions[0]['scores'].shape)
# print('Masks shape:', predictions[0]['masks'].shape)

<class 'tuple'> 2
dict_keys(['boxes', 'labels', 'masks', 'image_id', 'area', 'iscrowd'])
2


RuntimeError: CUDA out of memory. Tried to allocate 112.00 MiB (GPU 1; 10.92 GiB total capacity; 10.08 GiB already allocated; 51.50 MiB free; 184.64 MiB cached)

In [3]:
import numpy as np
import cv2
import random

# Label names for the COCO dataset, indexed by category id (91 entries, ten per
# row below). 'N/A' fills the ids that have no category name.

coco_names = [
    # 0-9
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # 10-19
    'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    # 20-29
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    # 30-39
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    # 40-49
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife',
    # 50-59
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    # 60-69
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    # 70-79
    'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 80-89
    'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    # 90
    'toothbrush',
]

# Table of random uint8 colour triples (one row per entry of coco_names) used
# when drawing objects.

COLORS = np.random.uniform(
    low=0, high=255, size=(len(coco_names), 3)
).astype(np.uint8)

# Overlay masks, bounding boxes, and labels on input numpy image

def draw_segmentation_map(image, masks, boxes, labels):
    """Overlay instance masks, bounding boxes and label text on an RGB image.

    Args:
        image: numpy image in RGB channel order. It is blended with a uint8
            colour map, so it is expected to be a uint8 H x W x 3 array.
        masks: array indexed as masks[i, :, :]; pixels whose value exceeds 0.5
            are treated as belonging to object i.
        boxes: per-object [x1, y1, x2, y2] corner coordinates.
        labels: per-object label strings, in the same order as masks/boxes.

    Returns:
        A new RGB image with every object's mask blended in and its box and
        label drawn. With no masks, the image comes back unchanged in content.
    """
    alpha = 1     # weight of the original image in the blend
    beta = 0.5    # transparency for the segmentation map
    gamma = 0     # scalar added to each sum
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 2
    # convert from RGB to OpenCV BGR format
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    for i in range(len(masks)):
        mask = masks[i, :, :]
        # apply a random color mask to each object
        color = COLORS[random.randrange(0, len(COLORS))]
        # paint the object's pixels with its colour on an otherwise black map;
        # the threshold is evaluated once instead of once per channel
        segmentation_map = np.zeros(mask.shape + (3,), dtype=np.uint8)
        segmentation_map[mask > 0.5] = color
        # apply colored mask to the image
        image = cv2.addWeighted(image, alpha, segmentation_map, beta, gamma)
        # draw the bounding box around each object
        p1 = (int(boxes[i][0]), int(boxes[i][1]))
        p2 = (int(boxes[i][2]), int(boxes[i][3]))
        color = (int(color[0]), int(color[1]), int(color[2]))
        cv2.rectangle(image, p1, p2, color, 2)
        # put the label text above the object; the y coordinate is the text
        # baseline, so clamp it to the text height to keep labels of boxes
        # that touch the top edge inside the image
        (_, text_height), _ = cv2.getTextSize(labels[i], font, font_scale, thickness)
        label_y = max(int(boxes[i][1] - 10), text_height)
        p = (int(boxes[i][0]), label_y)
        cv2.putText(image, labels[i], p, font, font_scale, color, thickness, cv2.LINE_AA)

    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Overlay masks, bounding boxes, and labels of objects with scores greater than
# threshold on one of the images in the input tensor using the predictions output by Mask R-CNN.

def prediction_to_mask_image(images, predictions, img_index, threshold):
    """Render the detections of one image that score at least `threshold`.

    Args:
        images: indexable collection of image tensors; the selected one is
            permuted from (C, H, W) order to (H, W, C) and scaled by 255.
        predictions: per-image dicts with 'scores', 'masks', 'boxes' and
            'labels' tensors.
        img_index: position of the image (and its prediction) to draw.
        threshold: minimum score a detection needs in order to be drawn.

    Returns:
        The image produced by draw_segmentation_map for the kept detections.
    """
    prediction = predictions[img_index]
    keep = prediction['scores'] >= threshold

    # Image tensor -> uint8 numpy array in height x width x channel order.
    hwc = images[img_index].cpu().permute(1, 2, 0).numpy()
    img = (hwc * 255).astype(np.uint8)

    # Keep only the selected detections and move them to numpy; dimension 1 of
    # the masks is squeezed away (presumably a singleton channel axis).
    kept_masks = prediction['masks'][keep, :, :]
    masks = kept_masks.cpu().detach().squeeze(1).numpy()
    kept_boxes = prediction['boxes'][keep, :]
    boxes = kept_boxes.cpu().detach().numpy()
    label_ids = prediction['labels'][keep].cpu().numpy()

    # Translate numeric class ids into their names.
    labels = [coco_names[label_id] for label_id in label_ids]

    return draw_segmentation_map(img, masks, boxes, labels)

In [4]:
from engine import train_one_epoch, evaluate
import utils
# Training setup: SGD over the parameters that require gradients, plus a step
# scheduler that multiplies the learning rate by 0.1 every 3 epochs.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)
num_epochs = 10
for epoch in range(num_epochs):
    # One pass over the training loader, printing every 10 iterations.
    train_one_epoch(model, optimizer, train_dataloader, device, epoch, print_freq=10)
    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()
    # Evaluate on the validation loader after every epoch.
    evaluate(model, val_dataloader, device=device)

# for images, targets in train_dataloader:
#     images = list(image.to(device) for image in images)
#     targets = [{k: v for k, v in t.items()} for t in targets]
#     output = model(images,targets)   # Returns losses and detections
#     print(output)
#     break

[{'boxes': tensor([[1.9202e+02, 4.2239e+02, 4.7960e+02, 5.8903e+02],
        [7.5876e-01, 4.2959e+02, 1.8935e+02, 6.4906e+02],
        [7.9003e+02, 4.0948e+02, 8.6525e+02, 5.4466e+02],
        [7.1217e+02, 3.9863e+02, 7.8898e+02, 5.4439e+02],
        [1.1082e+03, 4.3297e+02, 1.3899e+03, 5.2017e+02],
        [1.4604e+03, 3.9186e+02, 1.6282e+03, 5.2930e+02],
        [1.5724e+03, 3.6639e+02, 1.7616e+03, 5.5993e+02],
        [1.7022e+03, 1.3174e+02, 2.0480e+03, 6.3802e+02],
        [1.0260e+03, 4.4795e+02, 1.1493e+03, 4.9627e+02],
        [9.9588e+02, 4.5418e+02, 1.0356e+03, 4.9032e+02],
        [1.0460e+03, 4.4301e+02, 1.1516e+03, 4.7486e+02],
        [1.4548e+03, 4.3347e+02, 1.5003e+03, 5.0707e+02],
        [8.3428e+02, 4.5002e+02, 9.8836e+02, 4.9937e+02],
        [8.8196e+02, 4.3244e+02, 9.4600e+02, 4.6126e+02],
        [1.0094e+03, 4.5448e+02, 1.0364e+03, 4.8179e+02],
        [8.3644e+02, 4.5237e+02, 9.0502e+02, 4.8166e+02],
        [9.7020e+02, 4.5569e+02, 9.9589e+02, 4.8786e+02],
   

In [None]:
from matplotlib import pyplot as plt

# `images` and `predictions` used to come only from a commented-out scratch
# cell, so this cell raised NameError after Restart & Run All. Build them here
# from the first validation batch instead.
model.eval()  # inference mode: the model returns detections rather than losses
images, _ = next(iter(val_dataloader))
images = [img.to(device) for img in images]
with torch.no_grad():  # no autograd graph is kept, which reduces GPU memory use
    predictions = model(images)

# Draw every detection of the first image whose score is at least 0.5.
masked_img = prediction_to_mask_image(images, predictions, 0, 0.5)
plt.figure(1, figsize=(12, 9), dpi=100)
plt.imshow(masked_img)
plt.title('Validation image result')
plt.show()