In [143]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from yolov4.tf import YOLOv4
import matplotlib.pyplot as plt
from collections import defaultdict

In [169]:
def buildActionDict(path="ava_videos/action_list.pbtxt"):
    """Parse an action-label ``.pbtxt`` file into an ``{id: name}`` dictionary.

    The file is expected to be a sequence of blocks shaped like::

        item {
          name: "some action"
          id: 1
        }

    Parameters
    ----------
    path : str, optional
        Location of the label file. Defaults to the path that used to be
        hard-coded, so zero-argument calls behave as before.

    Returns
    -------
    dict
        Maps each integer ``id`` to its ``name`` with the surrounding double
        quotes removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a block lacks a ``name`` or ``id`` field, or ``id`` is not an
        integer literal.
    """
    with open(path, 'r') as file:
        content = file.read()

    actions_dict = {}
    # Anything before the first "item {" is preamble, hence the [1:].
    for block in content.split('item {')[1:]:
        fields = {}
        for line in block.splitlines():
            # Split on the FIRST colon only, so a name that itself contains
            # ": " stays intact instead of being cut at the inner colon.
            key, sep, value = line.partition(':')
            if sep:
                # setdefault keeps the first occurrence of each key in a block.
                fields.setdefault(key.strip(), value.strip())
        if 'name' not in fields or 'id' not in fields:
            raise ValueError(
                "malformed item block in {}: {!r}".format(path, block.strip())
            )
        name = fields['name']
        # Drop the enclosing double quotes of the pbtxt string literal.
        if len(name) >= 2 and name[0] == '"' and name[-1] == '"':
            name = name[1:-1]
        actions_dict[int(fields['id'])] = name
    return actions_dict

In [188]:
def getGroundTruthBbox(df):
    """Extract ground-truth boxes from an annotation frame as a float array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x1``, ``y1``, ``x2``, ``y2`` and
        ``action_id``; any other columns are ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 5)`` whose columns are, in order,
        ``x1, y1, x2, y2, action_id``. An empty ``df`` yields shape
        ``(0, 5)`` rather than ``(0,)``, so two-dimensional indexing such as
        ``bboxes[:, [0, 2]]`` keeps working when there are no annotations.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    box_columns = ['x1', 'y1', 'x2', 'y2', 'action_id']
    # Vectorized selection replaces the per-row iterrows() loop.
    return df.loc[:, box_columns].to_numpy(dtype=float)

In [199]:
def draw_groundTruth_bboxes(image, bboxes, actions=None):
    """Draw ground-truth boxes and their action labels on a copy of ``image``.

    Parameters
    ----------
    image : numpy.ndarray
        Image of shape ``(height, width, channels)``. It is not modified.
    bboxes : array-like
        Rows of ``[x1, y1, x2, y2, action_id]``. The x values are multiplied
        by the image width and the y values by the image height before
        drawing, i.e. they are treated as fractions of the image size.
        The caller's array is left untouched.
    actions : dict, optional
        Mapping from integer action id to label text. When omitted it is
        loaded with ``buildActionDict()``; pass it explicitly to avoid
        re-reading the label file on every call.

    Returns
    -------
    numpy.ndarray
        Annotated copy of ``image``. With no boxes this is a plain copy.
    """
    image = np.copy(image)
    height, width, _ = image.shape

    # Private float copy: the scaling below used to write into the caller's
    # array, so drawing the same boxes twice scaled them twice.
    boxes = np.array(bboxes, dtype=float)
    if boxes.size == 0:
        # Nothing to draw; also avoids 2-D indexing on a (0,) array.
        return image
    boxes[:, [0, 2]] = boxes[:, [0, 2]] * width
    boxes[:, [1, 3]] = boxes[:, [1, 3]] * height

    if actions is None:
        actions = buildActionDict()

    # Drawing constants are loop-invariant.
    bbox_color = (255, 0, 255)
    text_color = (255 - bbox_color[0], 255 - bbox_color[1], 255 - bbox_color[2])
    font_size = 0.4
    font_thickness = 1

    for bbox in boxes:
        top_left = (int(bbox[0]), int(bbox[1]))
        bottom_right = (int(bbox[2]), int(bbox[3]))
        action_id = int(bbox[4])
        cv2.rectangle(image, top_left, bottom_right, bbox_color, 2)

        # Fall back to the numeric id instead of raising on an unknown action.
        bbox_text = actions.get(action_id, str(action_id))
        t_size = cv2.getTextSize(bbox_text, 0, font_size, font_thickness)[0]
        # Filled rectangle above the box corner serves as the label background.
        cv2.rectangle(
            image,
            top_left,
            (top_left[0] + t_size[0], top_left[1] - t_size[1] - 3),
            bbox_color,
            -1,
        )
        cv2.putText(
            image,
            bbox_text,
            (top_left[0], top_left[1] - 2),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_size,
            text_color,
            font_thickness,
            lineType=cv2.LINE_AA,
        )
    return image

In [179]:
def draw_objects(image, bboxes, classes):
    """Show each detected class-0 object of ``image`` in its own OpenCV window.

    Parameters
    ----------
    image : numpy.ndarray
        Image of shape ``(height, width, channels)``. It is not modified.
    bboxes : array-like
        Two-dimensional detections. Columns 0 and 1 are used as the box
        centre, columns 2 and 3 as the full box width and height (x values
        are scaled by the image width, y values by the image height), and
        column 4 as the class id. The caller's array is left untouched.
    classes : mapping or sequence
        Indexed with the class id to obtain the name used in window titles.

    Returns
    -------
    None
        The function only opens windows named ``"<class name>_<n>"``.
    """
    image = np.copy(image)
    height, width, _ = image.shape

    # Private float copy: the original sorted and scaled the caller's array
    # in place through an int64 structured view, which also required a
    # contiguous array with exactly six 8-byte columns.
    boxes = np.array(bboxes, dtype=float)
    if boxes.size == 0:
        return

    # Order rows by column 0, ties broken by column 1 (lexsort takes the
    # primary key last).
    boxes = boxes[np.lexsort((boxes[:, 1], boxes[:, 0]))]
    boxes[:, [0, 2]] = boxes[:, [0, 2]] * width
    boxes[:, [1, 3]] = boxes[:, [1, 3]] * height

    person_count = 0
    for bbox in boxes:
        class_id = int(bbox[4])
        # Only class 0 is displayed (counted as a person here).
        if class_id != 0:
            continue

        c_x = int(bbox[0])
        c_y = int(bbox[1])
        half_w = int(bbox[2] / 2)
        half_h = int(bbox[3] / 2)

        # Clamp the crop rectangle to the image bounds.
        left = max(c_x - half_w, 0)
        top = max(c_y - half_h, 0)
        right = min(c_x + half_w, width)
        bottom = min(c_y + half_h, height)
        if right <= left or bottom <= top:
            # Degenerate crop: cv2.imshow rejects zero-sized images.
            continue

        person_count += 1
        windowName = "{}_{}".format(classes[class_id], person_count)
        cv2.namedWindow(windowName, cv2.WINDOW_AUTOSIZE)
        obj = image[top:bottom, left:right, :]
        cv2.imshow(windowName, obj)

In [180]:
def buildYoloModel(classes_path="coco.names", weights_path="yolov4.weights", input_size=(608, 608)):
    """Create a ``YOLOv4`` object, build its model and load its weights.

    Parameters
    ----------
    classes_path : str, optional
        Value assigned to ``yolo.classes``. Defaults to ``"coco.names"``.
    weights_path : str, optional
        File handed to ``yolo.load_weights``. Defaults to
        ``"yolov4.weights"``.
    input_size : tuple, optional
        Value assigned to ``yolo.input_size``. Defaults to ``(608, 608)``.

    Returns
    -------
    YOLOv4
        The configured object. With the default arguments the result is the
        same as the previous hard-coded setup.
    """
    yolo = YOLOv4()
    yolo.classes = classes_path
    yolo.input_size = input_size
    # The model is built after classes and input size have been assigned.
    yolo.make_model()
    # NOTE(review): weights_type='yolo' presumably selects the Darknet weight
    # format - confirm against the yolov4 package documentation.
    yolo.load_weights(weights_path, weights_type='yolo')
    return yolo

In [181]:
def run(media_path, yolo, groundTruth_df, iou_threshold = 0.3, score_threshold = 0.4, start_time = 902, end_time = 1798):
    """Play a video segment showing detections next to the ground truth.

    Three OpenCV windows are shown: ``"result"`` (output of
    ``yolo.draw_bboxes``), ``"origin"`` (the raw frame) and
    ``"ground_truth"`` (annotations drawn by ``draw_groundTruth_bboxes``).
    Press ``q`` in a window to stop early.

    Parameters
    ----------
    media_path : str
        Path of the video file.
    yolo : object
        Detector providing ``predict(frame, iou_threshold=...,
        score_threshold=...)`` and ``draw_bboxes(frame, bboxes)``.
    groundTruth_df : pandas.DataFrame
        Annotation rows for this video. Rows whose ``timestamp`` equals the
        whole second of the current frame are drawn.
    iou_threshold, score_threshold : float, optional
        Passed through to ``yolo.predict``.
    start_time, end_time : int, optional
        Playback window in seconds of video position. Frames before
        ``start_time`` are skipped; playback stops once the position reaches
        ``end_time + 1``.

    Raises
    ------
    FileNotFoundError
        If ``media_path`` does not exist.
    """
    if not os.path.exists(media_path):
        raise FileNotFoundError("{} does not exist".format(media_path))

    cap = cv2.VideoCapture(media_path)
    try:
        cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
        cv2.namedWindow("origin", cv2.WINDOW_AUTOSIZE)
        cv2.namedWindow("ground_truth", cv2.WINDOW_AUTOSIZE)

        if cap.isOpened():
            while True:
                try:
                    is_success, frame = cap.read()
                except cv2.error:
                    # Best effort: retry the read on a decoder error.
                    continue

                # Check for end of stream BEFORE the start_time skip. The old
                # order kept looping forever when the video ended (or failed
                # to decode) before start_time was reached.
                if not is_success:
                    break

                now_second = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

                if now_second < start_time:
                    continue
                if now_second >= end_time + 1:
                    break

                # The detector receives an RGB view; the BGR frame is kept
                # as-is for drawing and display, saving the conversion back.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                bboxes = yolo.predict(
                    rgb_frame,
                    iou_threshold=iou_threshold,
                    score_threshold=score_threshold,
                )

                current_rows = groundTruth_df[groundTruth_df['timestamp'] == int(now_second)]
                groundTruth_bboxes = getGroundTruthBbox(current_rows)

                image = yolo.draw_bboxes(frame, bboxes)
                if groundTruth_bboxes.size:
                    groundTruth_img = draw_groundTruth_bboxes(frame, groundTruth_bboxes)
                else:
                    # No annotation for this second: show the plain frame.
                    groundTruth_img = frame

                cv2.imshow("result", image)
                cv2.imshow("origin", frame)
                cv2.imshow("ground_truth", groundTruth_img)
                # Optional debug view of the individual detections:
                #draw_objects(frame, bboxes, yolo.classes)

                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Runs on normal exit, on errors and on KeyboardInterrupt, so the
        # capture handle and the windows are never left behind.
        cap.release()
        cv2.destroyAllWindows()

In [149]:
def _load_annotations(csv_path, column_names, id_to_filename):
    """Read one annotation CSV, drop unusable ids and map ids to file names."""
    # NOTE(review): read_csv treats the first row of the file as a header,
    # which is then overwritten by column_names. If the CSV has no header
    # row, the first annotation is lost here - confirm against the files and
    # switch to header=None / names=column_names if so.
    annotations = pd.read_csv(csv_path)
    annotations.columns = column_names
    # Rows whose id is the literal '#NAME?' cannot be matched to a video.
    # Filtering returns a new frame instead of dropping rows in place.
    annotations = annotations[annotations['video_id'] != '#NAME?'].copy()
    # Ids absent from id_to_filename become NaN.
    annotations['video_id'] = annotations['video_id'].map(id_to_filename)
    return annotations


# One file name per line; blank lines are skipped so they cannot raise
# IndexError further down.
with open('ava_videos/ava_file_names_trainval_v2.1.txt', 'r') as f:
    video_names = [line.rstrip().rsplit('.', 1) for line in f if line.strip()]

# Maps the name without its extension to the full file name. Splitting on
# the LAST dot keeps names that contain additional dots intact.
video_names_dict = {parts[0]: '.'.join(parts) for parts in video_names}

columns = ['video_id', 'timestamp', 'x1', 'y1', 'x2', 'y2', 'action_id', 'person_id']

# Both splits go through the same cleaning. Previously only train_df had its
# '#NAME?' rows removed, leaving them as NaN video ids in val_df.
train_df = _load_annotations('ava_videos/ava_train_v2.2.csv', columns, video_names_dict)
val_df = _load_annotations('ava_videos/ava_val_v2.2.csv', columns, video_names_dict)

train_videos = train_df['video_id'].unique()
val_videos = val_df['video_id'].unique()

In [153]:
# Build the detector once; it is reused by the playback cell below.
yolo = buildYoloModel()

# Directories holding the training and validation videos.
train_path, val_path = "ava_videos/train/", "ava_videos/val/"

# Select the second training video and derive its full path.
media_name = train_videos[1]
media_path = train_path + media_name

In [201]:
# Close every OpenCV window that is currently open, e.g. windows left behind
# when a playback call was interrupted before reaching its own cleanup.
cv2.destroyAllWindows()

In [200]:
# Play the selected training video, passing only its own annotation rows.
run(
    media_path=media_path,
    yolo=yolo,
    groundTruth_df=train_df.loc[train_df['video_id'] == media_name],
)

KeyboardInterrupt: 