In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

### 1. load the YoloV3 weights and configuration file with the help of `dnn` module of OpenCV.

In [2]:
# Build the network from the pre-trained YOLOv3 weights and its configuration file.
net = cv2.dnn.readNet("data/yolov3.weights", "data/yolov3.cfg")

In [3]:
type(net)

cv2.dnn_Net

### 2. `coco.names` file contains the names of the different objects that our model has been trained to identify. We store them in a list called classes.

In [3]:
# Read one class label per line; splitlines() drops the line breaks.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding, and the `with` block closes the file afterwards.
with open("data/coco.names", "r", encoding="utf-8") as f:
    classes = f.read().splitlines()

type(classes)

list

In [6]:
len(classes)

80

In [13]:
print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [14]:
type(classes[0])

str

In [4]:
# Use a context manager so the file handle is closed once the labels are printed.
with open(file="data/coco.names", mode="r") as f:
    print(f.read().splitlines())

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


##### `read()` function

In [29]:
# Read inside a context manager so the handle is closed afterwards; `f` stays
# bound (as a closed file object) for the inspection cells that follow.
with open(file="example_data/random_text.txt", mode="r") as f:
    # default size = -1, read the whole text
    content = f.read()

content

'Hello! Welcome to random_text.txt'

In [19]:
f

<_io.TextIOWrapper name='example_data/random_text.txt' mode='r' encoding='cp1252'>

In [28]:
type(f)

_io.TextIOWrapper

In [30]:
# Count the characters in the file; the context manager closes the handle.
with open(file="example_data/random_text.txt", mode="r") as f:
    n_chars = len(f.read())

n_chars

33

In [31]:
# read(3) returns at most the first three characters; the context manager
# closes the handle afterwards.
with open(file="example_data/random_text.txt", mode="r") as f:
    first_chars = f.read(3)

first_chars

'Hel'

##### `splitlines()` function

The splitlines() method splits a string into a list. The splitting is done at line breaks.

In [33]:
# Sample string containing a single embedded line break.
text = "Hello YOLO!\nYou're awesome."
# splitlines() cuts the string at the "\n" and leaves the break out of the pieces.
text.splitlines()

['Hello YOLO!', "You're awesome."]

### 3. Now to run a forward pass using the `cv2.dnn` module, we need to pass in the names of the layers for which the output is to be computed. `net.getUnconnectedOutLayersNames()` returns the names of the network's output layers (`net.getUnconnectedOutLayers()` returns their indices instead).

In [4]:
# Names of the network's output layers; these are later handed to net.forward()
# to select which layers' outputs are computed.
output_layers_names = net.getUnconnectedOutLayersNames()
type(output_layers_names), len(output_layers_names)

(list, 3)

In [36]:
output_layers_names

['yolo_82', 'yolo_94', 'yolo_106']

#### Note:

YOLOv3 has a total of 106 layers. The three layers listed above are the ones responsible for object detection.

In [5]:
# Every layer name known to the loaded network, printed together with the count.
all_layers = net.getLayerNames()
print(len(all_layers), all_layers)

254 ['conv_0', 'bn_0', 'relu_1', 'conv_1', 'bn_1', 'relu_2', 'conv_2', 'bn_2', 'relu_3', 'conv_3', 'bn_3', 'relu_4', 'shortcut_4', 'conv_5', 'bn_5', 'relu_6', 'conv_6', 'bn_6', 'relu_7', 'conv_7', 'bn_7', 'relu_8', 'shortcut_8', 'conv_9', 'bn_9', 'relu_10', 'conv_10', 'bn_10', 'relu_11', 'shortcut_11', 'conv_12', 'bn_12', 'relu_13', 'conv_13', 'bn_13', 'relu_14', 'conv_14', 'bn_14', 'relu_15', 'shortcut_15', 'conv_16', 'bn_16', 'relu_17', 'conv_17', 'bn_17', 'relu_18', 'shortcut_18', 'conv_19', 'bn_19', 'relu_20', 'conv_20', 'bn_20', 'relu_21', 'shortcut_21', 'conv_22', 'bn_22', 'relu_23', 'conv_23', 'bn_23', 'relu_24', 'shortcut_24', 'conv_25', 'bn_25', 'relu_26', 'conv_26', 'bn_26', 'relu_27', 'shortcut_27', 'conv_28', 'bn_28', 'relu_29', 'conv_29', 'bn_29', 'relu_30', 'shortcut_30', 'conv_31', 'bn_31', 'relu_32', 'conv_32', 'bn_32', 'relu_33', 'shortcut_33', 'conv_34', 'bn_34', 'relu_35', 'conv_35', 'bn_35', 'relu_36', 'shortcut_36', 'conv_37', 'bn_37', 'relu_38', 'conv_38', 'bn_38'

##### Yolo has: 75 cnn-layers (convolutional layers) + 31 other layers (shortcut, route, upsample, yolo) = 106 layers in total.

In [9]:
# Keep only the layers whose name contains "conv", i.e. the convolutional ones.
cnn_layers = list(filter(lambda layer_name: "conv" in layer_name, all_layers))
print(len(cnn_layers), cnn_layers)

75 ['conv_0', 'conv_1', 'conv_2', 'conv_3', 'conv_5', 'conv_6', 'conv_7', 'conv_9', 'conv_10', 'conv_12', 'conv_13', 'conv_14', 'conv_16', 'conv_17', 'conv_19', 'conv_20', 'conv_22', 'conv_23', 'conv_25', 'conv_26', 'conv_28', 'conv_29', 'conv_31', 'conv_32', 'conv_34', 'conv_35', 'conv_37', 'conv_38', 'conv_39', 'conv_41', 'conv_42', 'conv_44', 'conv_45', 'conv_47', 'conv_48', 'conv_50', 'conv_51', 'conv_53', 'conv_54', 'conv_56', 'conv_57', 'conv_59', 'conv_60', 'conv_62', 'conv_63', 'conv_64', 'conv_66', 'conv_67', 'conv_69', 'conv_70', 'conv_72', 'conv_73', 'conv_75', 'conv_76', 'conv_77', 'conv_78', 'conv_79', 'conv_80', 'conv_81', 'conv_84', 'conv_87', 'conv_88', 'conv_89', 'conv_90', 'conv_91', 'conv_92', 'conv_93', 'conv_96', 'conv_99', 'conv_100', 'conv_101', 'conv_102', 'conv_103', 'conv_104', 'conv_105']


### 4. read the image, resize it

In [7]:
def load_image(img_path, scale=0.4):
    """Read an image from disk and shrink it by a uniform factor.

    Parameters
    ----------
    img_path : str
        Path of the image file to read with ``cv2.imread``.
    scale : float, optional
        Factor applied to both axes by ``cv2.resize`` (default 0.4, the
        value this notebook has always used).

    Returns
    -------
    tuple
        ``(img, height, width, channels)`` where ``img`` is the resized image
        array and the remaining items are the entries of ``img.shape``.

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` returns ``None``, which it does when the file is
        missing or cannot be decoded.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with a clear message instead of a cryptic error inside cv2.resize.
    if img is None:
        raise FileNotFoundError(
            f"Image not found or unreadable: {img_path!r}"
        )

    img = cv2.resize(img, None, fx=scale, fy=scale)
    height, width, channels = img.shape

    return img, height, width, channels

In [8]:
# Load the sample image; height and width are reused later to scale the
# predicted boxes back to pixel coordinates.
img, height, width, channels = load_image("files/pedestrians.jpg")

In [12]:
type(img), type(height), type(width), type(channels)

(numpy.ndarray, int, int, int)

In [7]:
height, width, channels

(267, 400, 3)

In [14]:
img.shape

(267, 400, 3)

### 5. To correctly predict the objects with deep neural networks, we need to preprocess our data and `cv2.dnn` module provides us with two functions for this purpose: 
    1. blobFromImage and 
    2. blobFromImages. 
    
    These functions perform scaling, mean subtraction and channel swap which is optional. We will use `blobFromImage` in a function called detect_objects() that accepts image/frame from video or webcam stream, model and output layers as parameters.
    
The input to the network is a so-called blob object. 

The function `cv2.dnn.blobFromImage(img, scalefactor, size, mean)` transforms the image into a blob:

It has the following parameters:

- the image to transform
- the scale factor (1/255 to scale the pixel values to [0..1])
- the size, here a 416x416 square image
- the mean value (default=0)
- the option `swapRB=True` (OpenCV loads images in BGR order, so the red and blue channels are swapped to obtain RGB)

In [9]:
# Turn the image into the network's input blob:
#   scalefactor=1/255 rescales pixel values to [0..1],
#   size=(416, 416) is the square input size used here,
#   mean=(0, 0, 0) subtracts nothing,
#   swapRB=True swaps the red and blue channels (OpenCV images are BGR).
blob = cv2.dnn.blobFromImage(img,
                             scalefactor=1/255, 
                             size=(416, 416), 
                             mean=(0, 0, 0), 
                             swapRB=True, 
                             crop=False)

A blob is a 4D NumPy array with shape (images, channels, height, width) — here `(1, 3, 416, 416)`.

In [11]:
%%time

# Feed the blob to the network, then run a single forward pass that computes
# only the output layers named in output_layers_names.
net.setInput(blob)
layerOutputs = net.forward(output_layers_names)

Wall time: 2.45 s


The forward propagation takes about 2 seconds

The `forward()` method of the network returns a list of NumPy arrays, one per requested output layer. Each row of an array describes one candidate detection and contains:
- the x and y coordinates of the centre of the object detected, 
- height and width of the bounding box, 
- confidence and scores for all the classes of objects listed in `coco.names`. 

The class with the highest score is considered to be the predicted class.

In [16]:
type(layerOutputs)

list

In [17]:
len(layerOutputs)

3

**3 layerOutputs for 3 layers**

In [19]:
type(layerOutputs[0])

numpy.ndarray

In [27]:
layerOutputs[0].shape, layerOutputs[1].shape, layerOutputs[2].shape

((507, 85), (2028, 85), (8112, 85))

Each row of an output array is a vector of length 85:

- 4x the bounding box (centerx, centery, width, height)
- 1x box confidence
- 80x class confidence

### 5. layerOutputs

In [11]:
# Accumulators filled by the detection loop further down:
boxes = []        # [x, y, w, h] of each kept bounding box, in pixels
confidences = []  # score of the winning class for each kept box
class_ids = []    # index of the winning class for each kept box

In [29]:
# Show the (rows, columns) shape of the array returned for each output layer.
for output in layerOutputs:
    print(output.shape)

(507, 85)
(2028, 85)
(8112, 85)


In [39]:
# One row per candidate box, so the row count of each array is the number of
# boxes proposed by that output layer (layers are numbered from 1 for display).
for layer_no, layer_output in enumerate(layerOutputs, start=1):
    print(f"Layer - {layer_no}: total bounding box found: {layer_output.shape[0]}")

Layer - 1: total bounding box found: 507
Layer - 2: total bounding box found: 2028
Layer - 3: total bounding box found: 8112


In [36]:
# Exploratory pass: nothing is stored, so after the loop `scores`, `class_id`
# and `confidence` simply hold the values of the last row visited. They are
# inspected in the next cell.
for output in layerOutputs:
    for detection in output:
        # Columns 5 onward are the per-class scores.
        scores = detection[5:]
        # Index of the highest-scoring class, and that score.
        class_id = np.argmax(scores)
        confidence = scores[class_id]

In [38]:
type(scores), type(class_id), len(scores)

(numpy.ndarray, numpy.int64, 80)

In [12]:
# Re-create the accumulators in this cell so that running it again does not
# append the same detections a second time.
boxes = []
confidences = []
class_ids = []

for output in layerOutputs:
    for detection in output:
        # Columns 5 onward are the per-class scores; take the best class.
        scores = detection[5:]
        class_id = int(np.argmax(scores))
        confidence = float(scores[class_id])
        if confidence > 0.5:
            # The first four values are scaled by the image size to obtain
            # the box centre and extent in pixels.
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)

            # Convert from centre-based to top-left-based coordinates.
            x = int(center_x - w/2)
            y = int(center_y - h/2)

            boxes.append([x, y, w, h])
            confidences.append(confidence)
            class_ids.append(class_id)

I have selected all the predicted bounding boxes with the confidence of more than `50%`. 

### 6. Now that we have the vertices of the predicted bounding box and class_id (index of predicted object class), we need to draw the bounding box and add object label to it.

In [13]:
# Non-maximum suppression: among overlapping boxes for the same object keep a
# single one. The result holds positions into `boxes` of the boxes that survive.
indexes = cv2.dnn.NMSBoxes(bboxes=boxes, scores=confidences, score_threshold=0.5, nms_threshold=0.4)

##### What is `cv2.dnn.NMSBoxes()` for?

Although we removed the low confidence bounding boxes, there is a possibility that we will still have duplicate detections around an object.

You may observe that some objects have been detected multiple times and we have more than one bounding box for it. To fix this situation we’ll need to apply `Non-Maximum Suppression (NMS)`, also called Non-Maxima Suppression. We pass in `confidence threshold value` and `NMS threshold value` as parameters to select one bounding box. From the range of 0 to 1, we should select an intermediate value like 0.4 or 0.5 to make sure that we detect the overlapping objects but do not end up getting multiple bounding boxes for the same object.

In [14]:
type(indexes)

numpy.ndarray

In [15]:
indexes.shape

(16, 1)

In [17]:
print(indexes)

[[12]
 [ 8]
 [ 5]
 [ 6]
 [ 7]
 [10]
 [ 4]
 [ 9]
 [11]
 [ 2]
 [ 1]
 [13]
 [ 0]
 [14]
 [15]
 [ 3]]


In [19]:
(indexes.flatten()).shape

(16,)

In [20]:
indexes.flatten()

array([12,  8,  5,  6,  7, 10,  4,  9, 11,  2,  1, 13,  0, 14, 15,  3],
      dtype=int32)

In [21]:
# Depending on the OpenCV version, NMSBoxes returns an (N, 1) array, a flat
# (N,) array, or an empty tuple when no box survives. Normalise all of these
# to a flat sequence so the loop also works when nothing was detected.
for i in np.asarray(indexes).reshape(-1):
    print(classes[class_ids[i]])

person
person
person
person
person
person
car
person
person
car
traffic light
person
bus
person
person
handbag


In [22]:
class_ids

[5, 9, 2, 26, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]