# OpenCV Detection and Tracking
Based on: https://github.com/emasterclassacademy/Single-Multiple-Custom-Object-Detection-and-Tracking

Using OpenCV and EuclideanDistTracker. Machine Learning approach. 

Object Detection - YOLO
- Will be applied to each and every frame

Object Tracking - DeepSort


# Function

In [1]:
def check_tf_cuda():
    """Report whether TensorFlow lists at least one GPU device.

    TensorFlow is imported inside the function, so defining this helper does
    not by itself require TensorFlow to be importable.

    Returns:
        bool: True when ``tf.config.list_physical_devices('GPU')`` returns a
        non-empty collection, False otherwise.
    """
    import tensorflow as tf

    gpu_devices = tf.config.list_physical_devices('GPU')
    return len(gpu_devices) != 0

def check_cv2_cuda():
    """Report whether OpenCV's build information mentions NVIDIA/CUDA/cuDNN.

    Every line of ``cv2.getBuildInformation()`` is lower-cased and searched
    for the NVIDIA/CUDA/cuDNN label pattern; one matching line is enough.

    Returns:
        bool: True when at least one build-information line matches.
    """
    import cv2
    import re

    # Raw string: the earlier non-raw '\s+' literal was an invalid escape
    # sequence, and the whitespace clean-up it fed was thrown away anyway
    # because only the number of matching lines mattered.
    gpu_label = re.compile(r'(nvidia*:?)|(cuda*:)|(cudnn*:)')

    # NOTE(review): this only checks that a label is *mentioned*; a line that
    # reports the feature as disabled would match too. Confirm whether a
    # stricter check is wanted.
    build_lines = cv2.getBuildInformation().strip().split('\n')
    return any(gpu_label.search(line.lower()) is not None for line in build_lines)

def download_weights():
    """Download the YOLOv3 and YOLOv3-tiny weight files if they are missing.

    The URLs are read from ``settings.weight_urls`` and
    ``settings.tiny_weight_urls`` (``settings`` is expected to be available at
    module level when this function is called). Files are stored as
    ``weights/yolov3.weights`` and ``weights/yolov3-tiny.weights``.

    Each file is first downloaded to a ``.part`` file and only renamed to its
    final name once the transfer has finished, so an interrupted download is
    not mistaken for a complete file on the next run.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when a download fails.
    """
    import os
    import urllib.request

    def _fetch(url, destination):
        """Download *url* to *destination* via a temporary ``.part`` file."""
        partial = destination + ".part"
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, destination)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

    # urlretrieve cannot create the target directory on its own.
    os.makedirs("weights", exist_ok=True)

    # download weights if not present
    if not os.path.exists("weights/yolov3.weights"):
        print("Downloading weights...")
        _fetch(settings.weight_urls, "weights/yolov3.weights")
    else:
        print("Weights already downloaded")

    if not os.path.exists("weights/yolov3-tiny.weights"):
        print("Downloading tiny weights...")
        _fetch(settings.tiny_weight_urls, "weights/yolov3-tiny.weights")
    else:
        print("Tiny Weights already downloaded")

# Main

## Convert to Tensorflow Model

In [3]:
import sys
import settings

# Run the settings module's initialisation before the checks below.
settings.init()

# Stop right away unless TensorFlow reports a GPU.
if not check_tf_cuda():
    print("TensorFlow GPU is NOT available")
    sys.exit(1)
print("TensorFlow GPU is available")

# Stop right away unless the OpenCV build mentions GPU support.
if not check_cv2_cuda():
    print("CV2 GPU is NOT available")
    sys.exit(1)
print("CV2 GPU is available")

# Fetch whichever weight files are not on disk yet.
download_weights()

# Convert the weights to TensorFlow: first with the converter's defaults,
# then once more for the tiny variant with explicit input/output paths.
from convert import convert
convert()
convert(tiny=True, weights="weights/yolov3-tiny.weights", output="weights/yolov3-tiny.tf")


TensorFlow GPU is available
CV2 GPU is available
Downloading weights...
Downloading tiny weights...
Model: "yolov3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 yolo_darknet (Functional)      ((None, None, None,  40620640    ['input[0][0]']                  
                                 256),                                                            
                                 (None, None, None,                                               
                                 512),                                                      

## Model

In [4]:
from absl import flags
import sys

import time
import numpy as np
import cv2
import matplotlib.pyplot as plt
from _collections import deque
import tensorflow as tf

# Load the model
from yolov3_tf2.models import YoloV3
from yolov3_tf2.dataset import transform_images
from yolov3_tf2.utils import convert_boxes

from deep_sort import preprocessing
from deep_sort import nn_matching
from deep_sort.detection import Detection
from deep_sort.tracker import Tracker

from tools import generate_detections as gdet

# checks if video is available
def check_video_present(video):
    """Read one frame from *video* and exit the program if that fails.

    Args:
        video: Object whose ``read()`` returns an ``(ok, frame)`` pair; the
            script passes a ``cv2.VideoCapture``.

    Returns:
        The frame returned by ``video.read()``. The read is not undone, so the
        caller's next ``read()`` continues after this frame.

    Raises:
        SystemExit: With status 1 when ``read()`` reports failure.
    """
    ok, frame = video.read()
    if not ok:
        print('Cannot read video file')
        # Non-zero status, matching the other failure exits in this file, so
        # the failure is distinguishable from a normal termination.
        sys.exit(1)
    return frame

# create file for video output
def get_output_video(vid, output_path='./data/video/output.avi'):
    """Create a video writer that matches the frame rate and size of *vid*.

    Args:
        vid: Opened ``cv2.VideoCapture`` whose FPS, frame width and frame
            height properties are queried.
        output_path: Destination file; defaults to the path that was
            previously hard-coded.

    Returns:
        cv2.VideoWriter: Writer using the XVID codec.
    """
    codec = cv2.VideoWriter_fourcc(*'XVID') # avi format

    # Keep the frame rate as reported: truncating it to an int would turn
    # e.g. 29.97 into 29 and make the written video play at the wrong speed.
    vid_fps = vid.get(cv2.CAP_PROP_FPS)

    # Width and height are cast to int before being passed as the frame size.
    vid_size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    return cv2.VideoWriter(output_path, codec, vid_fps, vid_size)

# align video to the model dimensions
def align_video_to_model(img):
    """Prepare one BGR frame as input for the detector.

    The frame goes through three steps in order: BGR -> RGB colour
    conversion, insertion of a leading batch axis, and
    ``transform_images(..., 416)``.

    Args:
        img: Frame in BGR channel order, as read from the video.

    Returns:
        The result of ``transform_images`` for the batched RGB frame.
    """
    # YOLOv3 was trained on RGB images, OpenCV delivers BGR.
    rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # The model works on batches, so add a batch dimension in front.
    batched_frame = tf.expand_dims(rgb_frame, 0)

    # 416 is the input size used for YOLOv3 here.
    return transform_images(batched_frame, 416)

# read coco names based on IDs
def get_class_coco_names(classes):
    """Translate class ids into their names from the module-level ``class_names``.

    Args:
        classes: Sequence of class ids; each entry is passed through ``int()``
            before it is used as an index.

    Returns:
        numpy.ndarray: The looked-up names, in the same order as *classes*.
    """
    return np.array([class_names[int(class_id)] for class_id in classes])
  
# get region of interest
def get_region_of_interest_selection(frame):
    """Let the user pick a region of interest on *frame*.

    Calls ``cv2.selectROI`` on the frame, closes all OpenCV windows
    afterwards and prints the selection.

    Args:
        frame: Image shown to the user for the selection.

    Returns:
        The value returned by ``cv2.selectROI``.
    """
    selection = cv2.selectROI(frame, False)

    # Close the selection window before anything else is displayed.
    cv2.destroyAllWindows()

    print("Region of interest: ", selection)
    return selection

# apply non-max suppression to the bounding boxes
def run_non_maxima_suppression(detections):
    """Filter *detections* with ``preprocessing.non_max_suppression``.

    The ``tlwh`` boxes, confidences and class names of the detections are
    collected into arrays and handed to the suppression routine together with
    the module-level ``nms_max_overlap``.

    Args:
        detections: List of detection objects exposing ``tlwh``,
            ``confidence`` and ``class_name``.

    Returns:
        list: The detections whose indices the suppression routine returned,
        in that order.
    """
    tlwh_boxes = np.array([det.tlwh for det in detections])
    confidences = np.array([det.confidence for det in detections])
    class_labels = np.array([det.class_name for det in detections])

    # Indices of the boxes that are kept.
    kept_indices = preprocessing.non_max_suppression(
        tlwh_boxes, class_labels, nms_max_overlap, confidences
    )

    survivors = []
    for index in kept_indices:
        survivors.append(detections[index])
    return survivors

# Set the flags for the model
FLAGS = flags.FLAGS
# Only the first argv entry is handed over, so no further command-line
# arguments are parsed as flags.
FLAGS(sys.argv[:1])

# Enforce tensorflow v1
tf.compat.v1.disable_v2_behavior()

# CONFIG
output_video = False            # write the annotated frames to the output video as well?
max_cosine_distance = 0.5       # is it the same object?
nn_budget = None                # number of features to be stored in the memory
nms_max_overlap = 0.8           # non-maxima suppression, i.e. removes all boxes with a lower score than the max box
model_filename = 'model_data/mars-small128.pb'          # pre-trained model for pedestrian tracking

# Variable Section
# Read the label file inside a context manager so the handle is closed as
# soon as the names have been loaded.
with open('./data/labels/coco.names') as names_file:
    class_names = [line.strip() for line in names_file]
cmap = plt.get_cmap('tab20b')
colors = [cmap(i)[:3] for i in np.linspace(0,1,20)]

# Trajectory buffers, one per track id, each holding the last 30 points.
# They are created on first access, so indexing with any track id works; a
# fixed list of 1000 buffers raised IndexError once an id reached 1000.
from collections import defaultdict
pts = defaultdict(lambda: deque(maxlen=30))
counter = []                    # track ids of vehicles seen inside the counting band

# load video
# check_video_present() reads one frame from the capture; that frame is shown
# for the region-of-interest selection.
# NOTE(review): `roi` is not referenced again in the visible code, and the
# writer is created even though `output_video` is False - confirm both are
# intended.
vid = cv2.VideoCapture('./data/video/los_angeles.mp4')
frame = check_video_present(vid)
roi = get_region_of_interest_selection(frame)
out = get_output_video(vid)

# initialize yolo with one output class per entry of class_names and load the
# converted TensorFlow weights
yolo = YoloV3(classes=len(class_names))
yolo.load_weights('./weights/yolov3.tf')
# Alternative: the tiny model (YoloV3Tiny is not imported in the visible code)
#yolo = YoloV3Tiny(classes=len(class_names))
#yolo.load_weights('./weights/yolov3-tiny.tf')

# initialize the feature encoder, the cosine distance metric and the tracker
# that uses this metric
encoder = gdet.create_box_encoder(model_filename, batch_size=1)
metric = nn_matching.NearestNeighborDistanceMetric('cosine', max_cosine_distance, nn_budget)
tracker = Tracker(metric)

# Track ids that have already been counted. Keeping them in a set means each
# id is appended to `counter` only once, instead of once per frame in which
# the vehicle is inside the counting band.
counted_ids = set(counter)

try:
    while True:
        _, img = vid.read()
        if img is None:
            print('Completed')
            break

        # All timings printed below are cumulative, measured from this point.
        t1 = time.time()

        img_in = align_video_to_model(img)
        print("Time required to align video from: " + str(time.time()-t1))

        # object detection using YOLO (shapes as noted by the original author)
        # boxes 3D shape: (1, 100, 4)
        # scores 2D shape: (1, 100)
        # classes 2D shape: (1, 100)
        # nums 1D shape: (1,)
        boxes, scores, classes, nums = yolo.predict(img_in, steps=1)
        print("Time required to predict: " + str(time.time()-t1))
        classes = classes[0]

        # get the bounding boxes of detected objects
        converted_boxes = convert_boxes(img, boxes[0])

        # get the feature vectors of the detected objects
        features = encoder(img, converted_boxes)
        print("Time required to encode: " + str(time.time()-t1))

        # initialize detections
        detections = [
            Detection(bbox, score, class_name, feature)
            for bbox, score, class_name, feature
            in zip(converted_boxes, scores[0], classes, features)
        ]
        detections = run_non_maxima_suppression(detections)
        print("Time required to run non maxima suppression: " + str(time.time()-t1))

        # execute kalman filter
        tracker.predict()
        tracker.update(detections)
        print("Time required for tracker to update: " + str(time.time()-t1))

        # Horizontal counting band around the middle of the frame. It depends
        # only on the frame size, so it is computed once per frame rather than
        # once per track.
        height, width, _ = img.shape
        band_top = int(3*height/6-height/20)
        band_bottom = int(3*height/6+height/20)

        current_count = 0
        for track in tracker.tracks:
            # if kalman has no update, skip
            if not track.is_confirmed() or track.time_since_update > 1:
                continue

            bbox = track.to_tlbr()
            class_name = class_names[int(track.get_class())]
            color = colors[int(track.get_class()) % len(colors)]   # color of the bounding box
            color = [i * 255 for i in color]                       # scale 0..1 components to 0..255

            # draw bounding box
            cv2.rectangle(img, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 2)

            # draw label with class name and track id
            cv2.rectangle(img, (int(bbox[0]), int(bbox[1]-30)), (int(bbox[0])+(len(class_name) + len(str(track.track_id))) * 17, int(bbox[1])), color, -1)
            cv2.putText(img, class_name + "-" + str(track.track_id), (int(bbox[0]), int(bbox[1] - 10)), 0, 0.75, (255, 255, 255), 2)

            # draw trajectory
            center = (int(((bbox[0]) + (bbox[2])) / 2), int(((bbox[1]) + (bbox[3])) / 2))
            trail = pts[track.track_id]
            trail.append(center)

            for j in range(1, len(trail)):
                # if we do not have enough points to draw a line, skip
                if trail[j] is None or trail[j-1] is None:
                    continue

                # the line gets thinner as the index into the trail grows
                thickness = int(np.sqrt(64/float(j+1))*2)
                cv2.line(img, trail[j-1], trail[j], color, thickness)

            # count cars and trucks whose box centre lies inside the band
            center_y = center[1]
            if band_top <= center_y <= band_bottom and class_name in ('car', 'truck'):
                track_id = int(track.track_id)
                if track_id not in counted_ids:
                    counted_ids.add(track_id)
                    counter.append(track_id)
                current_count += 1

        # Draw the band once per frame, after the per-track drawing, so it is
        # visible even when no track is confirmed.
        cv2.line(img, (0, band_bottom), (width, band_bottom), (0, 255, 0), thickness=2)
        cv2.line(img, (0, band_top), (width, band_top), (0, 255, 0), thickness=2)

        print("Time required to draw results for each track: " + str(time.time()-t1))

        total_count = len(counted_ids)
        cv2.putText(img, "Current Vehicle Count: " + str(current_count), (0, 80), 0, 1, (0, 0, 255), 2)
        cv2.putText(img, "Total Vehicle Count: " + str(total_count), (0,130), 0, 1, (0,0,255), 2)

        # draw FPS (guard against a zero elapsed time on coarse clocks)
        elapsed = time.time()-t1
        fps = 1./elapsed if elapsed > 0 else 0.0
        cv2.putText(img, "FPS: {:.2f}".format(fps), (0,30), 0, 1, (0,0,255), 2)

        cv2.imshow('output', img)
        cv2.resizeWindow('output', 1024, 768)

        if output_video:
            out.write(img)

        # ESC (key code 27) stops the processing
        key = cv2.waitKey(1)
        if key == 27:
            break
finally:
    # Release the capture, the writer and the windows even if a frame fails.
    vid.release()
    out.release()
    cv2.destroyAllWindows()

Region of interest:  (464, 399, 618, 240)
Time required to align video from: 0.017964839935302734


  updates=self.state_updates,


Time required to predict: 1.5052995681762695
Time required to encode: 2.576633930206299
Time required to run non maxima suppression: 2.5776350498199463
Time required for tracker to update: 2.5776350498199463
Time required to draw results for each track: 2.5776350498199463
Time required to align video from: 0.02399730682373047
Time required to predict: 1.120332956314087
Time required to encode: 1.5877008438110352
Time required to run non maxima suppression: 1.5877008438110352
Time required for tracker to update: 1.588670015335083
Time required to draw results for each track: 1.588670015335083
Time required to align video from: 0.016026020050048828
Time required to predict: 1.0610721111297607
Time required to encode: 1.5211262702941895
Time required to run non maxima suppression: 1.5220956802368164
Time required for tracker to update: 1.5220956802368164
Time required to draw results for each track: 1.5231027603149414
Time required to align video from: 0.01699995994567871
Time required to