In [49]:
import numpy as np
import cv2
import time

In [50]:
def ssd_detection(video_path, class_names, net, output_path, confidence,
                  blob_scale=1 / 255.0, blob_size=(416, 416),
                  blob_mean=(0, 0, 0), swap_rb=True):
    """Run a detection network over a video and save an annotated copy.

    Every frame is converted to a blob, pushed through ``net`` and the
    resulting detections are drawn (box + "label: score%") on the frame,
    which is then appended to an MJPG-encoded video at ``output_path``.

    The network output is indexed as ``detections[0, 0, i, :]`` where
    column 1 is used as the class index, column 2 as the score and
    columns 3:7 as box corners that are multiplied by the frame
    width/height to obtain pixel coordinates.

    Parameters
    ----------
    video_path : str
        Path of the video to read.
    class_names : sequence of str
        Labels, indexed by the class index reported by the network.
    net : object
        Network exposing ``setInput(blob)`` and ``forward()``.
    output_path : str
        Path of the annotated video to write.
    confidence : float
        Only detections with a score strictly above this value are drawn.
    blob_scale, blob_size, blob_mean, swap_rb : optional
        Preprocessing forwarded to ``cv2.dnn.blobFromImage`` as
        ``scalefactor``, ``size``, ``mean`` and ``swapRB``. The defaults
        reproduce the values that used to be hard-coded here.
        NOTE(review): MobileNet-SSD Caffe models are commonly fed 300x300
        blobs with scale 0.007843 and mean 127.5 -- confirm which
        preprocessing the loaded model actually expects.

    Raises
    ------
    IOError
        If the input video or the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    writer = None

    # try/finally guarantees both handles are released even when the
    # network or a drawing call raises part-way through the video.
    try:
        if not cap.isOpened():
            raise IOError("could not open video: {}".format(video_path))

        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        except (TypeError, ValueError, OverflowError):
            # e.g. a backend reporting NaN/inf for the frame count
            total = -1

        if total > 0:
            print("[INFO] {} total frames in video".format(total))
        else:
            print("[INFO] could not determine # of frames in video")
            total = -1

        # Keep the source playback speed; fall back to 30 fps when the
        # container does not report a usable rate (0, negative or NaN).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps > 0):
            fps = 30.0

        # Fixed seed so every class keeps the same colour between runs.
        np.random.seed(123)
        colors = np.random.randint(0, 255, size=(len(class_names), 3),
                                   dtype="uint8").astype("float")

        while True:
            grabbed, frame = cap.read()
            if not grabbed:
                break

            h, w = frame.shape[:2]
            tensor = cv2.dnn.blobFromImage(frame, blob_scale, blob_size,
                                           mean=blob_mean, swapRB=swap_rb,
                                           crop=False)

            net.setInput(tensor)
            start = time.time()
            detections = net.forward()
            end = time.time()

            for i in range(detections.shape[2]):
                score = float(detections[0, 0, i, 2])
                if score <= confidence:
                    continue

                # class index and box corners scaled to pixel coordinates
                idx = int(detections[0, 0, i, 1])
                box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
                # plain Python ints: the drawing calls take integer points
                startX, startY, endX, endY = (int(v) for v in box.astype("int"))

                label = "{}: {:.2f}%".format(class_names[idx], score * 100)
                cv2.rectangle(frame, (startX, startY), (endX, endY),
                              colors[idx], 2)
                # put the label above the box unless it would leave the frame
                y = startY - 15 if startY - 15 > 15 else startY + 15
                cv2.putText(frame, label, (startX, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2)

            if writer is None:
                # Created lazily because the frame size is only known once
                # the first frame has been read.
                fourcc = cv2.VideoWriter_fourcc(*"MJPG")
                writer = cv2.VideoWriter(output_path, fourcc, fps,
                                         (frame.shape[1], frame.shape[0]), True)
                if not writer.isOpened():
                    raise IOError(
                        "could not open video writer: {}".format(output_path))
                if total > 0:
                    elap = (end - start)
                    print("[INFO] single frame took {:.4f} seconds".format(elap))
                    print("[INFO] estimated total time to finish: {:.4f} seconds".format(
                        elap * total))

            writer.write(frame)
    finally:
        # writer stays None when no frame could be read at all
        if writer is not None:
            writer.release()
        cap.release()

In [51]:
# Caffe weights and network definition; both are handed to
# cv2.dnn.readNetFromCaffe further down.
model_path = "./model/MobileNetSSD_deploy.caffemodel"
proto_path = "./model/deploy.prototxt"

In [52]:
# Score threshold passed to ssd_detection: only detections scoring strictly
# above this value are drawn.
CONFIDENCE = 0.5

In [53]:
# Labels in the order used to index them: ssd_detection looks a label up
# with the class index reported for each detection.
class_names = [
    "background", "aeroplane", "bicycle",
    "bird", "boat", "bottle",
    "bus", "car", "cat",
    "chair", "cow", "diningtable",
    "dog", "horse", "motorbike",
    "person", "pottedplant", "sheep",
    "sofa", "train", "tvmonitor",
]

In [54]:
# Load the detector from the network definition (proto_path) and its
# weights (model_path).
ssd = cv2.dnn.readNetFromCaffe(proto_path, model_path)

In [55]:
# Input clip and destination of the annotated copy.
video_path = "../../../datasets/road_scene.mp4"
output_path = "./output/ssd_detection_road_scene.avi"

ssd_detection(
    video_path=video_path,
    class_names=class_names,
    net=ssd,
    output_path=output_path,
    confidence=CONFIDENCE,
)

[INFO] 1817 total frames in video
[INFO] single frame took 0.6198 seconds
[INFO] estimated total time to finish: 1126.1889 seconds
