In [1]:
import cv2
import numpy as np

In [2]:
# Load the pre-trained YOLOv3 network from its weights and config files.
# Weights: https://pjreddie.com/media/files/yolov3.weights
# Config:  https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg
net = cv2.dnn.readNet(model="yolov3.weights", config="yolov3.cfg")

In [3]:
# Class labels for COCO (Common Objects in Context), one name per line.
# Source: https://github.com/pjreddie/darknet/blob/master/data/coco.names
with open("coco.names", "r") as names_file:
    # Strip surrounding whitespace/newlines from every line of the file.
    classes = list(map(str.strip, names_file))

In [4]:
# Names of every layer in the network, in layer-id order.
layer_names = net.getLayerNames()

# getUnconnectedOutLayers() returns 1-based layer ids. Depending on the OpenCV
# version the result is either an (N, 1) array or a flat (N,) array, so flatten
# it first; indexing each element with [0] fails on the flat form.
out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()

# output layer names (ids are 1-based, hence the "- 1")
output_layers = [layer_names[layer_id - 1] for layer_id in out_layer_ids]

In [5]:
# Loading image
image_path = "yolov3.jpeg"
img = cv2.imread(image_path)

# cv2.imread signals failure by returning None instead of raising, which would
# otherwise surface as an opaque error inside cv2.resize below.
if img is None:
    raise FileNotFoundError("Could not read image file: {!r}".format(image_path))

# Shrink to 40% of the original size in both directions.
img = cv2.resize(img, None, fx=0.4, fy=0.4)

# Dimensions of the resized image; used later to scale the box coordinates.
height, width, channels = img.shape

In [6]:
# Pre-process the image into the blob the network takes as input:
# pixel values are multiplied by 0.00392 (roughly 1/255), the image is resized
# to 416x416 without cropping, no mean is subtracted, and the R and B channels
# are swapped.
blob = cv2.dnn.blobFromImage(
    img,
    0.00392,
    (416, 416),
    (0, 0, 0),
    swapRB=True,
    crop=False,
)

# Commonly used YOLO input sizes:
#     320x320 - small, so less accurate but faster
#     416x416 - in the middle, a bit of both
#     608x608 - bigger, so more accurate but slower

**cv2.dnn.blobFromImage Params**

1. image
  
 - This is the input image we want to preprocess before passing it through our deep neural network for classification.

1. scalefactor

 - After we perform mean subtraction we can optionally scale our images by some factor. This value defaults to `1.0` (i.e., no scaling) but we can supply another value as well. It’s also important to note that `scalefactor` should be $1 / \sigma$, as we’re actually multiplying the input channels (after mean subtraction) by `scalefactor`. In this notebook we pass `0.00392` (≈ 1/255), which brings 8-bit pixel values into the range [0, 1].

1. size
     - Here we supply the spatial size that the Convolutional Neural Network expects. For most current state-of-the-art classification networks this is either 224×224, 227×227, or 299×299; for YOLOv3 in this notebook we use 416×416.

1. mean
     - These are our mean subtraction values. They can be a 3-tuple of the RGB means, or they can be a single value, in which case the supplied value is subtracted from every channel of the image. If you’re performing mean subtraction, ensure you supply the 3-tuple in `(R, G, B)` order, especially when using `swapRB=True`.
1. swapRB
     - OpenCV assumes images are in BGR channel order; however, the `mean` value assumes we are using RGB order. To resolve this discrepancy we can swap the R and B channels in `image` by setting this value to `True`. The default value of `swapRB` has differed between OpenCV releases (early 3.x releases defaulted to `True`, current releases default to `False`), so it is safest to pass it explicitly.

Further reading: [how OpenCV’s blobFromImage works](https://www.pyimagesearch.com/2017/11/06/deep-learning-opencvs-blobfromimage-works/)

In [7]:
# Feed the pre-processed blob to the network; this must happen before forward().
net.setInput(blob)
# Run a forward pass and collect the result of each layer named in output_layers.
# 3 outputs - small, medium and large object detection using 52, 26, 13 blocks respectively
# (presumably the grid sizes for a 416x416 input - TODO confirm the order of the outputs)
outputs = net.forward(output_layers)

In [8]:
# Gather the confident detections from every output layer.
# The three lists stay index-aligned: entry k of each describes the same detection.
class_ids, confidences, boxes = [], [], []

# Walk over every detection row of every output (small, medium and big objects).
# Row layout: [center_x, center_y, w, h, objectness, <one score per class>],
# with the box geometry expressed as a fraction of the image size.
for detection in (row for layer_output in outputs for row in layer_output):
    # Scores of the classes only (skips x, y, w, h and objectness).
    scores = detection[5:]

    # Keep a single class per detection: the one with the highest score.
    best_class = np.argmax(scores)
    best_score = scores[best_class]

    # Log anything that is at least weakly confident ...
    if not best_score > 0.1:
        continue
    print(classes[best_class], best_score)

    # ... but only keep detections above the stricter threshold.
    if not best_score > 0.5:
        continue

    # Scale the relative box size and centre up to pixel units.
    box_w = int(detection[2] * width)
    box_h = int(detection[3] * height)
    mid_x = int(detection[0] * width)
    mid_y = int(detection[1] * height)

    # Convert the centre point into the top-left corner of the rectangle.
    left = int(mid_x - box_w / 2)
    top = int(mid_y - box_h / 2)

    boxes.append([left, top, box_w, box_h])
    confidences.append(float(best_score))
    class_ids.append(best_class)


laptop 0.99970794
person 0.99936557
keyboard 0.8689499
chair 0.48856437
chair 0.6281692
bottle 0.9786397
laptop 0.5549331
cell phone 0.24328135
keyboard 0.91006833
book 0.5635614
cell phone 0.30302283
mouse 0.31213394
cell phone 0.3997322


In [9]:
# Non-maximum suppression: of the boxes that overlap heavily (IoU above
# nms_threshold) only the highest-scoring one is kept. `indexes` holds the
# positions in `boxes` that survive.
indexes = cv2.dnn.NMSBoxes(bboxes=boxes, scores=confidences, score_threshold=0.1, nms_threshold=0.4)
font = cv2.FONT_HERSHEY_PLAIN

# One colour per class. A fixed seed keeps the palette identical between runs.
colors = np.random.default_rng(42).uniform(0, 255, size=(len(classes), 3))

# NMSBoxes may return an (N, 1) array, a flat (N,) array or an empty tuple
# depending on the OpenCV version and the result; flattening handles all three.
# Sorting keeps the drawing order equal to the order of `boxes`.
kept_indexes = sorted(int(i) for i in np.asarray(indexes).flatten())

# Draw every box that survived NMS together with its class label.
for i in kept_indexes:
    x, y, w, h = boxes[i]
    class_id = class_ids[i]
    label = str(classes[class_id])
    # Look the colour up by class id (not by box index) so that every class has
    # its own colour and the lookup cannot run past the end of `colors`.
    # .tolist() hands OpenCV plain Python floats.
    color = colors[class_id].tolist()
    cv2.rectangle(img=img, pt1=(x, y), pt2=(x + w, y + h), color=color, thickness=2)
    cv2.putText(img=img, text=label, org=(x, y + 30), fontFace=font, fontScale=2, color=color, thickness=2)


In [10]:
# Show the annotated image in a native window and block until a key is pressed.
cv2.imshow("Image", img)
try:
    cv2.waitKey(0)
finally:
    # Always close the window, even if the wait is interrupted
    # (e.g. by interrupting the kernel), so no orphaned window is left behind.
    cv2.destroyAllWindows()

In [11]:
# Save the annotated image; the call's return value is displayed as the cell output.
cv2.imwrite(filename='yolov3_coco_inference.jpg', img=img)

True