In [2]:
# import the necessary packages
from torchvision.models import detection
import numpy as np
import torch
import cv2

# run on CUDA when torch reports a usable GPU, otherwise stay on the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# read the category names: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) removed
with open('coco-classes.txt', 'r') as fr:
    CLASSES = list(map(str.strip, fr))

# one random 3-component color (values drawn from [0, 255)) per category,
# intended for drawing bounding boxes
COLORS = np.random.uniform(low=0, high=255, size=(len(CLASSES), 3))

# lookup table: short model name -> torchvision detection constructor
MODELS = {
    'frcnn-resnet': detection.fasterrcnn_resnet50_fpn,
    'frcnn-mobilenet': detection.fasterrcnn_mobilenet_v3_large_320_fpn,
    'retinanet': detection.retinanet_resnet50_fpn,
}

# key into MODELS choosing which detector this notebook uses
MODEL = 'frcnn-mobilenet'

# build the chosen detector with pretrained weights; `progress` and
# `trainable_backbone_layers` (0-6) are further constructor options that are
# left at their defaults here
# NOTE(review): num_classes is derived from the length of coco-classes.txt;
# it presumably has to match the head size of the pretrained checkpoint --
# confirm against the class file before editing that file
model = MODELS[MODEL](
    pretrained=True,
    num_classes=len(CLASSES) - 1,
    pretrained_backbone=True,
)

# move the weights to the selected device, then switch to evaluation mode;
# eval() is the last expression so the notebook displays the model structure
model = model.to(DEVICE)
model.eval()



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(320,), max_size=640, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
          )
        )
      )
      (2): InvertedResidual(
        (block): 

In [6]:
from time import time

# image to run the detector on (path is relative to the notebook's directory)
image_path = '../i2l-dataset/ball/ball_01.jpg'

# the timer covers image loading, preprocessing, inference and printing
t0 = time()

# cv2.imread reports a missing/unreadable file by returning None rather than
# raising, so fail here with a clear message instead of on image.copy()
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')

# keep an untouched copy of the loaded (BGR) image
orig = image.copy()

# convert the image from BGR to RGB channel ordering and change the
# image from channels last to channels first ordering
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.transpose((2, 0, 1))

# add the batch dimension, scale the raw pixel intensities to the
# range [0, 1], and convert the image to a floating point tensor
image = np.expand_dims(image, axis=0)
image = image / 255.0
image = torch.FloatTensor(image)

# send the input to the device and pass it through the network to
# get the detections and predictions
image = image.to(DEVICE)

# inference only: disable autograd so no graph is recorded and no
# intermediate activations are kept alive for a backward pass
with torch.no_grad():
    # the batch holds a single image, so take the first (only) result
    results = model(image)[0]

# report every detection: class name, confidence and box coordinates
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {CLASSES[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

print(f'Time: {time()-t0}')

Detected tvmonitor with confidence 0.692 at location [83.86, 45.44, 143.68, 105.13]
Detected laptop with confidence 0.274 at location [63.3, 35.78, 208.98, 174.23]
Detected tvmonitor with confidence 0.238 at location [59.18, 19.45, 207.13, 167.77]
Detected tvmonitor with confidence 0.082 at location [158.25, 101.71, 200.14, 151.08]
Detected chair with confidence 0.08 at location [155.37, 101.47, 200.37, 151.83]
Detected laptop with confidence 0.066 at location [72.3, 133.0, 126.31, 169.18]
Detected keyboard with confidence 0.064 at location [75.41, 131.91, 124.68, 170.32]
Time: 0.1725320816040039
