# Task 1: Object Detection / Optical Character Recognition

## Problem Statement: Implement an object detector which identifies the classes of the objects in an image/video.

- Maintained by - **Ajinkya Jadhav**

### Reference :
- Big thanks to **Adrian Rosebrock** for explaining how to implement computer vision models in practice.🙏🙏
- I learned how to implement this CV model from his tutorial.

### Import required libraries

In [10]:
# import the necessary packages
import numpy as np
import time
import cv2
import os
import matplotlib.pyplot as plt
import imutils

In [11]:
# The project root is the directory the notebook is run from; the YOLO
# configuration files are looked up in its 'model_config' sub-folder.
base_path = os.getcwd()
config_path = base_path + os.path.sep + 'model_config'

### Project Outline:
- We use `YOLOv3` pretrained on the `COCO` dataset, which has 80 class labels.
- We first predict whether objects belonging to any of these class labels are present in the image or video.
- Then we draw a bounding box around each detected object in the color assigned to its class and show the model's confidence for that object.

#### Load the pretrained model

In [3]:
# Load the COCO class labels our YOLO model was trained on.
lables_path = os.path.sep.join([config_path, 'coco.names'])

# Read the label file inside a context manager so the file handle is closed
# as soon as the contents are in memory. The text is split on newlines, so
# LABELS holds one class name per line of the file.
with open(lables_path) as labels_file:
    LABELS = labels_file.read().strip().split("\n")

# Build one colour (three uint8 channel values) per class label. The seed is
# fixed so every run assigns the same colour to the same class.
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3),
                           dtype='uint8')

In [4]:
# derive the paths to the YOLO weights and model configuration
weightsPath = os.path.sep.join([config_path, 'yolov3.weights'])
configPath = os.path.sep.join([config_path, 'yolov3.cfg'])

# load our YOLO object detector trained on COCO dataset(80 classes)
print("Loading YOLO...")
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)

Loading YOLO...


### Evaluating the model on an image

In [5]:
# Folder (under the project root) that holds the example input images.
img_path = base_path + os.path.sep + 'examples'

In [6]:
# Build the image path from the project root rather than from img_path
# itself, so re-running this cell does not keep appending the file name.
img_path = os.path.sep.join([base_path, 'examples', 'plaughing.jpg'])

# load our input image and grab its spatial dimensions
image = cv2.imread(img_path)
# cv2.imread signals failure by returning None rather than raising, which
# would otherwise surface as an opaque AttributeError on the next line.
if image is None:
    raise FileNotFoundError("Could not read image: {}".format(img_path))
(H, W) = image.shape[:2]

# determine only the *output* layer names that we need from YOLO.
# getUnconnectedOutLayers() gives 1-based layer indices; some OpenCV
# versions return them as an Nx1 array and others as a flat array, so the
# result is flattened before indexing to work with both.
ln = net.getLayerNames()
ln = [ln[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

# construct a blob from the input image (pixel values scaled by 1/255,
# resized to 416x416, R and B channels swapped) and then perform a forward
# pass of the YOLO object detector, giving us our bounding boxes and
# associated probabilities
blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416),
                             swapRB=True, crop=False)
net.setInput(blob)
start = time.time()
layerOutputs = net.forward(ln)
end = time.time()

# show timing information on YOLO
print("YOLO took {:.6f} seconds".format(end - start))

YOLO took 6.152124 seconds


In [7]:
# Containers for the detections that pass the confidence filter: one entry
# per kept detection in each list, index-aligned with each other.
boxes, confidences, classIDs = [], [], []

# Walk through every detection produced by every output layer.
for output in layerOutputs:
    for detection in output:
        # Entries from index 5 onwards are the per-class scores; the
        # predicted class is the one with the highest score.
        scores = detection[5:]
        classID = np.argmax(scores)
        confidence = scores[classID]

        # Skip weak predictions early instead of nesting the main logic.
        if not confidence > 0.5:
            continue

        # The first four entries are the box centre (x, y) followed by its
        # width and height, relative to the image size; scale them back to
        # pixel units.
        box = detection[0:4] * np.array([W, H, W, H])
        (centerX, centerY, width, height) = box.astype("int")

        # Convert the centre coordinates to the top-left corner.
        x = int(centerX - (width / 2))
        y = int(centerY - (height / 2))

        # Record the detection.
        boxes.append([x, y, int(width), int(height)])
        confidences.append(float(confidence))
        classIDs.append(classID)

# Non-maxima suppression removes weak, overlapping boxes
# (score threshold 0.5, NMS threshold 0.3).
idxs = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.3)

In [8]:
# Draw every detection kept by non-maxima suppression onto the image.
# ensure at least one detection exists
if len(idxs) > 0:
    # loop over the indexes we are keeping
    for i in idxs.flatten():
        # extract the bounding box coordinates (top-left corner and size)
        (x, y) = (boxes[i][0], boxes[i][1])
        (w, h) = (boxes[i][2], boxes[i][3])
        # draw a bounding box rectangle and label on the image, using the
        # colour assigned to the detected class
        color = [int(c) for c in COLORS[classIDs[i]]]
        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
        text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
        cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, color, 2)

# show the output image and wait for a key press
cv2.imshow("Image", image)
cv2.waitKey(0)
# Close the window once a key has been pressed; without this the window is
# left open and unresponsive when the cell is run from a notebook.
cv2.destroyAllWindows()

-1

### Evaluating the model on a video

In [9]:
# Input videos live in 'Videos' under the project root; the annotated result
# is written to 'Output/output2.avi'. os.path.sep is the same separator as
# os.sep and matches how the other paths in this notebook are built.
video_path = os.path.sep.join([base_path, 'Videos'])
output_path = os.path.sep.join([base_path, 'Output', 'output2.avi'])

In [12]:
# Run the YOLO detector over every frame of the input video and write the
# annotated frames to output_path.

# Load the COCO class labels that the YOLO model was trained on. The file is
# read inside a context manager so the handle is closed immediately.
labelsPath = os.path.sep.join([config_path, "coco.names"])
with open(labelsPath) as labels_file:
    LABELS = labels_file.read().strip().split("\n")

# Initialize the color list to represent each possible class label
# (fixed seed so colours are the same on every run)
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")

# Derive the paths to the YOLO weights and model configuration
weightsPath = os.path.sep.join([config_path, "yolov3.weights"])
configPath = os.path.sep.join([config_path, "yolov3.cfg"])

# Load the YOLO object detector trained on COCO dataset (80 classes)
# and determine only the output layer names that we need from YOLO.
# getUnconnectedOutLayers() gives 1-based indices; some OpenCV versions
# return an Nx1 array and others a flat array, so flatten before indexing.
print("Loading YOLO ...")
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
ln = net.getLayerNames()
ln = [ln[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

# Initialize the video stream, pointer to output video file and frame dimensions
vs = cv2.VideoCapture(os.path.sep.join([video_path, 'airport.mp4']))
writer = None
(W, H) = (None, None)

# Try to determine the total number of frames in the video file
try:
    prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
        else cv2.CAP_PROP_FRAME_COUNT
    total = int(vs.get(prop))
    print("{} total frames in video".format(total))
# an error occurred while trying to determine the total number of frames in
# the video file; catch Exception (not a bare except) so that
# KeyboardInterrupt / SystemExit still propagate
except Exception:
    print("Could not determine # of frames in video")
    print("No approx. completion time can be provided")
    total = -1

# Make sure the destination folder exists before the writer is created, so
# frames are not lost because the target directory is missing.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

try:
    # Loop over frames from the video file stream
    while True:
        # read the next frame from the file
        (grabbed, frame) = vs.read()
        # if the frame was not grabbed then we have reached the end of the stream
        if not grabbed:
            break
        # if the frame dimensions are empty grab them, and build the factor
        # that scales relative box coordinates to pixels once, not per detection
        if W is None or H is None:
            (H, W) = frame.shape[:2]
            frame_scale = np.array([W, H, W, H])
        # construct a blob from the input frame and then perform a forward
        # pass of the YOLO object detector, giving us our bounding boxes and
        # associated probabilities
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416),
                                     swapRB=True, crop=False)
        net.setInput(blob)
        start = time.time()
        layerOutputs = net.forward(ln)
        end = time.time()
        # initialize our list of detected bounding boxes, confidences and
        # class IDs, respectively
        boxes = []
        confidences = []
        classIDs = []
        # loop over each one of the layer outputs
        for output in layerOutputs:
            # loop over each of the detections
            for detection in output:
                # extract the class ID and confidence (i.e, probability) of
                # the current object detection
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                # filter out weak predictions
                if confidence > 0.5:
                    # scale the bounding box coordinates back relative to the
                    # size of the frame; the first four values are the centre
                    # (x, y) followed by the box width and height
                    box = detection[0:4] * frame_scale
                    (centerX, centerY, width, height) = box.astype("int")
                    # use the center (x, y)-coordinates to derive the top
                    # left corner of the bounding box
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    # update our list of bounding box coordinates,
                    # confidences, and class IDs
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(classID)
        # apply non-maxima suppression to suppress weak, overlapping bounding boxes
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.3)
        # ensure at least one detection exists
        if len(idxs) > 0:
            # loop over the indexes we are keeping
            for i in idxs.flatten():
                # extract the bounding box coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])
                # draw a bounding box rectangle and label on the frame
                color = [int(c) for c in COLORS[classIDs[i]]]
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
                cv2.putText(frame, text, (x, y - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        # the writer is created lazily on the first frame, because its frame
        # size is taken from the frame itself
        if writer is None:
            # initialize our video writer
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            writer = cv2.VideoWriter(output_path, fourcc, 30,
                                     (frame.shape[1], frame.shape[0]), True)
            # some information on processing single frame
            if total > 0:
                elap = (end - start)
                print("Single frame took {:.4f} seconds".format(elap))
                print("Estimated total time to finish: {:.4f}".format(elap * total))
        # write the output frame to disk
        writer.write(frame)
finally:
    # release the file pointers even if processing was interrupted
    print("Cleaning up...")
    # writer is still None when no frame was ever read (e.g. the video
    # could not be opened); calling release() on it would raise
    if writer is not None:
        writer.release()
    else:
        print("No frames were processed; no output video was written")
    vs.release()

Loading YOLO ...
749 total frames in video
Single frame took 4.9967 seconds
Estimated total time to finish: 3742.5380
Cleaning up...
