In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
# a is an array of pixels in a image


In [2]:
# Classes list with trained object name
classes = None
with open('coco.names', 'r') as f:
    classes = [line.strip() for line in f.readlines()]

In [3]:
classes

['person',
 'bicycle',
 'car',
 'motorbike',
 'aeroplane',
 'bus',
 'train',
 'truck',
 'boat',
 'traffic light',
 'fire hydrant',
 'stop sign',
 'parking meter',
 'bench',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe',
 'backpack',
 'umbrella',
 'handbag',
 'tie',
 'suitcase',
 'frisbee',
 'skis',
 'snowboard',
 'sports ball',
 'kite',
 'baseball bat',
 'baseball glove',
 'skateboard',
 'surfboard',
 'tennis racket',
 'bottle',
 'wine glass',
 'cup',
 'fork',
 'knife',
 'spoon',
 'bowl',
 'banana',
 'apple',
 'sandwich',
 'orange',
 'broccoli',
 'carrot',
 'hot dog',
 'pizza',
 'donut',
 'cake',
 'chair',
 'sofa',
 'pottedplant',
 'bed',
 'diningtable',
 'toilet',
 'tvmonitor',
 'laptop',
 'mouse',
 'remote',
 'keyboard',
 'cell phone',
 'microwave',
 'oven',
 'toaster',
 'sink',
 'refrigerator',
 'book',
 'clock',
 'vase',
 'scissors',
 'teddy bear',
 'hair drier',
 'toothbrush']

# NOTE: net variable holds neural network model

In [4]:
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')
# read pretrained yolov3 model from weights and config file
# The weights file (yolov3.weights) contains the learned parameters of the YOLOv3 model,
# including the weights of the neural network's layers.

In [16]:
image = cv2.imread(r"C:\Users\HARISH A K\Desktop\python\Deep_Learning\Deep_learning_projects\human_detection_in_footage\sample2.jfif")
height, width = image.shape[:2] # yolov cantains 1st 2 as height and weight
blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), swapRB=True, crop=False)
net.setInput(blob)

In [17]:
outs = net.forward(net.getUnconnectedOutLayersNames())
# net.getUnconnectedOutLayersNames(): This method retrieves the names of the unconnected output layers of the neural network
# net.forward(...): This function is used to perform a forward pass of the neural network. 
# A forward pass involves passing the input data through the network's layers to obtain the output

In [18]:
outs # with this output bounding boxes can be classified

(array([[0.05365018, 0.05884587, 0.3532788 , ..., 0.        , 0.        ,
         0.        ],
        [0.04892453, 0.03397522, 0.30836058, ..., 0.        , 0.        ,
         0.        ],
        [0.04801371, 0.04328671, 0.86561227, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.954852  , 0.9450227 , 0.3873407 , ..., 0.        , 0.        ,
         0.        ],
        [0.9575348 , 0.96462333, 0.3224339 , ..., 0.        , 0.        ,
         0.        ],
        [0.96665597, 0.9607834 , 0.83714205, ..., 0.        , 0.        ,
         0.        ]], dtype=float32),
 array([[0.02696408, 0.02945836, 0.05510319, ..., 0.        , 0.        ,
         0.        ],
        [0.02137303, 0.02427736, 0.26840758, ..., 0.        , 0.        ,
         0.        ],
        [0.0251774 , 0.01975079, 0.08691705, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.9809921 , 0.9703064 , 0.03630849, ..., 0.        , 0.        ,
         0.        

In [26]:
conf_threshold = 0.5
nms_threshold = 0.4

class_ids = []
confidences = []
boxes = []

for out in outs:
    for detection in out:
        scores = detection[5:]
        # The first 5 values in each detection represent the bounding box coordinates (x, y, w, h) and the confidence score
        class_id = np.argmax(scores) # index of maximum value in score 
        confidence = scores[class_id]
        if confidence > conf_threshold and class_id == 0:  # 0 corresponds to the 'person' class
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)

            x = int(center_x - w / 2)
            y = int(center_y - h / 2)

# The first 4 values in the detection variable represent the bounding box coordinates in the normalized coordinate system. 
# The normalized coordinate system is a coordinate system where the width and height of the image are both 1.0. 
# To convert the normalized coordinates to the actual coordinates,
# we need to multiply them by the width and height of the image
            
            class_ids.append(class_id)
            confidences.append(float(confidence))
            boxes.append([x, y, w, h])

indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

# cv2.dnn.NMSBoxes() used to remove overlapping bounding boxes, leaving only the most confident bounding boxes.
# boxes - list of bounding boxes
# confidence - list of confidence score for each box
# conf_threshold - minimum confidence required for the bounding box.
# nms_threshold -  when 2 bounding boxes overlapping means higher nms_threshold value bounding box considered
# list of indices of the bounding boxes that remain after non-maximum suppression.
# The indices are sorted in descending order of confidence scores.

print("center_x ",center_x)
print("center_y ",center_y)
print("w ",w)
print("h ",h)
print("x ",x)
print("y ",y)
print("confidence ",confidence)
print("indices ",indices)

center_x  315
center_y  225
w  70
h  37
x  280
y  206
confidence  0.0
indices  [136 116 134 119 147  97 140  93 114 118 100  86 113 139  85  88  74  75
  48 153  58  52  50  77  64  21 142  71  42  41  30  67  32 111  40 120
  24  20  38  28  51  45   1  19  23  90  37  17  22  33  16  39  25  69
  18  26]


In [20]:
count=0
for i in indices:
    box = boxes[i]
    x, y, w, h = box
    label = f"Person: {confidences[i]:.2f}"
    color = (0, 255, 0)  # Green color for bounding box
    cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
    cv2.putText(image, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    count+=1


In [23]:
# Resize the image to a smaller size for displaying
resize_factor = 1  # You can adjust this factor as needed
small_image = cv2.resize(image, None, fx=resize_factor, fy=resize_factor)

# Display the resized image with detected people
cv2.imshow('Detected People', small_image)
print("Total cout of people is ",count)
cv2.waitKey(0)
cv2.destroyAllWindows()

Total cout of people is  56
