In [1]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from yolov4.tf import YOLOv4
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
from deep_sort.tools.generate_detections import create_box_encoder
from deep_sort.deep_sort import nn_matching
from deep_sort.deep_sort.detection import Detection
from deep_sort.deep_sort.tracker import Tracker as ds_Tracker
MODEL_CKPT = "./deep_sort/weights/mars-small128.pb"
import action_detection.action_detector as act

In [None]:
class Tracker():
    """Deep SORT based person tracker that keeps a box history per actor.

    Every call to `update_tracker` feeds one frame and its detections to the
    Deep SORT tracker and records, for each confirmed track, a list of
    normalized ``[top, left, bottom, right]`` boxes (one per frame).  The
    histories are later turned into fixed-length ROI "tubes" by
    `generate_all_rois` / `generate_person_tube_roi`.

    Attributes:
        active_actors: actor-info dicts of the tracks updated in the last frame.
        inactive_actors, actor_no: initialised here but not used by the
            methods of this class.
        frame_history: the most recent frames (at most ``2 * timesteps``).
        frame_no: number of frames passed to `update_tracker` so far.
        timesteps: length of the ROI tubes produced for each actor.
        actor_infos: ``track_id -> dict`` with the keys ``all_boxes``,
            ``length``, ``last_updated_frame_no``, ``all_scores``, ``actor_id``.
        score_th: minimum detection score for a detection to be tracked.
    """

    def __init__(self, timesteps=32):
        """Create the appearance encoder and the Deep SORT tracker.

        Args:
            timesteps: number of boxes in each generated ROI tube.
        """
        self.active_actors = []
        self.inactive_actors = []
        self.actor_no = 0
        self.frame_history = []
        self.frame_no = 0
        self.timesteps = timesteps
        self.actor_infos = {}
        # deep sort
        self.encoder = create_box_encoder(MODEL_CKPT, batch_size=16)
        # Positional arguments: metric name, matching threshold, budget.
        metric = nn_matching.NearestNeighborDistanceMetric("cosine", 0.2, None)
        self.tracker = ds_Tracker(metric, max_iou_distance=0.7, max_age=200, n_init=5)
        self.score_th = 0.40

    @staticmethod
    def _center_to_tlwh(box):
        """Convert a pixel (c_x, c_y, w, h) box to an integer [left, top, w, h] list."""
        c_x = int(box[0])
        c_y = int(box[1])
        half_w = int(box[2] / 2)
        half_h = int(box[3] / 2)
        return [c_x - half_w, c_y - half_h, int(box[2]), int(box[3])]

    def update_tracker(self, detection_info, frame):
        ''' Takes the frame and the results from the object detection
            Updates the tracker with the current detections and creates new tracks

            Args:
                detection_info: sequence of rows where ``row[:4]`` is a box
                    ``(c_x, c_y, w, h)`` normalized by the frame size,
                    ``row[4]`` is the class id and ``row[5]`` the score.
                    May be empty.
                frame: image array whose first two dimensions are (H, W).
        '''
        # reshape(-1, 4) keeps the array 2-D even when there are no detections;
        # without it the column indexing below raises IndexError on empty input.
        boxes = np.array([d[:4] for d in detection_info], dtype=float).reshape(-1, 4)
        classes = np.array([d[4] for d in detection_info])
        scores = np.array([d[5] for d in detection_info])
        # filter score threshold and non-person detections (class id 0 is kept)
        keep = np.logical_and(scores > self.score_th, classes == 0)
        filtered_boxes, filtered_scores = boxes[keep], scores[keep]

        H, W = frame.shape[:2]
        filtered_boxes[:, [0, 2]] = filtered_boxes[:, [0, 2]] * W
        filtered_boxes[:, [1, 3]] = filtered_boxes[:, [1, 3]] * H

        # deep sort format boxes (x, y, W, H)
        ds_boxes = [self._center_to_tlwh(box) for box in filtered_boxes]
        features = self.encoder(frame, ds_boxes)
        detection_list = [
            Detection(ds_box, score, feature)
            for ds_box, score, feature in zip(ds_boxes, filtered_scores, features)
        ]

        # update tracker
        self.tracker.predict()
        self.tracker.update(detection_list)

        # Store results.
        actives = []
        for track in self.tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            left, top, width, height = track.to_tlwh()
            # Stored boxes are normalized and ordered [top, left, bottom, right].
            tr_box = [top / float(H), left / float(W), (top+height)/float(H), (left+width)/float(W)]
            actor_id = track.track_id
            # NOTE(review): `last_detection_confidence` is read from the track
            # object; presumably the bundled deep_sort copy provides it - confirm.
            detection_conf = track.last_detection_confidence
            if actor_id in self.actor_infos: # update with the new bbox info
                cur_actor = self.actor_infos[actor_id]
                # Fill the frames since the last update with interpolated boxes
                # so that the history has exactly one box per frame.
                no_interpolate_frames = self.frame_no - cur_actor['last_updated_frame_no']
                interpolated_box_list = bbox_interpolate(cur_actor['all_boxes'][-1], tr_box, no_interpolate_frames)
                cur_actor['all_boxes'].extend(interpolated_box_list[1:])
                cur_actor['last_updated_frame_no'] = self.frame_no
                cur_actor['length'] = len(cur_actor['all_boxes'])
                cur_actor['all_scores'].append(detection_conf)
                actives.append(cur_actor)
            else:
                # A track seen for the first time is registered but only
                # becomes "active" from its next update on.
                new_actor = {'all_boxes': [tr_box], 'length':1, 'last_updated_frame_no': self.frame_no, 'all_scores':[detection_conf], 'actor_id':actor_id}
                self.actor_infos[actor_id] = new_actor

        self.active_actors = actives

        # Keep a bounded window of the most recent frames.
        self.frame_history.append(frame)
        if len(self.frame_history) > 2*self.timesteps:
            del self.frame_history[0]

        self.frame_no += 1

    def generate_all_rois(self):
        """Build the ROI arrays for every currently active actor.

        Returns:
            (rois_np, temporal_rois_np): arrays of shape ``[no_actors, 4]`` and
            ``[no_actors, timesteps, 4]``, in the order of `active_actors`.
        """
        no_actors = len(self.active_actors)
        rois_np = np.zeros([no_actors, 4])
        temporal_rois_np = np.zeros([no_actors, self.timesteps, 4])
        for bb, actor_info in enumerate(self.active_actors):
            norm_roi, full_roi = self.generate_person_tube_roi(actor_info['actor_id'])
            rois_np[bb] = norm_roi
            temporal_rois_np[bb] = full_roi
        return rois_np, temporal_rois_np

    def generate_person_tube_roi(self, actor_id):
        """Build the fixed-length square ROI tube of one active actor.

        Args:
            actor_id: id of an actor present in `active_actors`
                (IndexError otherwise).

        Returns:
            (norm_roi, full_rois_np): the actor box relative to its context
            square as returned by `generate_edge_and_normalized_roi`, and an
            array of shape ``[timesteps, 4]`` with one
            ``[top, left, bottom, right]`` context square per time step.
        """
        actor_info = [info for info in self.active_actors if info['actor_id'] == actor_id][0]
        boxes = actor_info['all_boxes']
        if actor_info['length'] < self.timesteps:
            # Short history: the first box is repeated to pad the tube.
            # NOTE(review): with the "+ 1" the newest box (recent_boxes[-1]) is
            # never selected in this branch; looks like an off-by-one, kept
            # unchanged - confirm the intended padding.
            recent_boxes = boxes
            index_offset = (self.timesteps - actor_info['length'] + 1)
        else:
            recent_boxes = boxes[-self.timesteps:]
            index_offset = 0
        # The tube size is fixed by the box in the middle of the window.
        mid_box = recent_boxes[len(recent_boxes)//2]
        edge, norm_roi = generate_edge_and_normalized_roi(mid_box)

        full_rois = []
        for rr in range(self.timesteps):
            if rr < index_offset:
                cur_box = recent_boxes[0]
            else:
                cur_box = recent_boxes[rr - index_offset]

            # Square of half-size `edge` centred on the current box.
            top, left, bottom, right = cur_box
            cur_center = (top+bottom)/2., (left+right)/2.
            top, bottom = cur_center[0] - edge, cur_center[0] + edge
            left, right = cur_center[1] - edge, cur_center[1] + edge
            full_rois.append([top, left, bottom, right])
        full_rois_np = np.stack(full_rois, axis=0)

        return norm_roi, full_rois_np

In [None]:
def bbox_interpolate(start_box, end_box, no_interpolate_frames):
    """Linearly interpolate between two boxes.

    Args:
        start_box: sequence of box coordinates at the first frame.
        end_box: sequence of box coordinates at the last frame (same length).
        no_interpolate_frames: number of frame steps between the two boxes.

    Returns:
        A list of ``no_interpolate_frames + 1`` boxes (lists of floats) going
        from `start_box` to `end_box` inclusive.  For a step count of 0 only
        the start box is returned; a negative count yields an empty list.
    """
    start = np.asarray(start_box, dtype=float)
    if no_interpolate_frames == 0:
        # Both boxes belong to the same frame; dividing by zero here used to
        # produce a box full of NaNs.
        return [start.tolist()]
    step = (np.asarray(end_box, dtype=float) - start) / float(no_interpolate_frames)
    return [(start + step * ii).tolist() for ii in range(0, no_interpolate_frames + 1)]

In [None]:
def generate_edge_and_normalized_roi(mid_box):
    """Compute the context-square half size and the box relative to that square.

    Args:
        mid_box: ``(top, left, bottom, right)`` box coordinates.

    Returns:
        (edge, norm_roi): `edge` is half the side of a square centred on the
        box and 1.5 times as large as the longer box side; `norm_roi` is
        ``[top, left, bottom, right]`` of the box expressed as fractions of
        that square.
    """
    top, left, bottom, right = mid_box
    box_height = bottom - top
    box_width = right - left

    # change the 1.5 factor to change the size of the tube
    edge = max(box_height, box_width) / 2. * 1.5
    side = 2*edge

    center_y = (top+bottom)/2.
    center_x = (left+right)/2.
    # Top-left corner of the context square.
    origin_y = center_y - edge
    origin_x = center_x - edge

    corners = ((top, origin_y), (left, origin_x), (bottom, origin_y), (right, origin_x))
    norm_roi = [(coord - origin) / side for coord, origin in corners]

    return edge, norm_roi

In [2]:
def buildActionDict():
    """Parse ``ava_videos/action_list.pbtxt`` into an ``{id: name}`` dict.

    Each ``item {`` entry is expected to start with a ``name: "..."`` line
    followed by an ``id: N`` line; the surrounding quotes of the name are
    stripped and the id is converted to ``int``.
    """
    with open("ava_videos/action_list.pbtxt", 'r') as pbtxt:
        raw_text = pbtxt.read()

    actions_dict = {}
    # Everything before the first "item {" is dropped by the [1:] slice.
    for entry in raw_text.split('item {\n  ')[1:]:
        # Only the first two lines of an entry (name, id) are parsed.
        fields = [line.split(': ') for line in entry.split('\n')[:2]]
        action_name = fields[0][1][1:-1]
        action_id = int(fields[1][1])
        actions_dict[action_id] = action_name
    return actions_dict

In [3]:
def getGroundTruthBbox(df):
    """Collect the ground-truth boxes of an annotation frame into an array.

    Args:
        df: DataFrame with the columns ``x1``, ``y1``, ``x2``, ``y2`` and
            ``action_id``.

    Returns:
        An array with one ``[x1, y1, x2, y2, action_id]`` row per DataFrame
        row (an empty 1-D array when `df` has no rows).
    """
    fields = ('x1', 'y1', 'x2', 'y2', 'action_id')
    rows = [[record[name] for name in fields] for _, record in df.iterrows()]
    return np.array(rows)

In [4]:
def draw_groundTruth_bboxes(image, bboxes):
    """Draw labelled ground-truth boxes on a copy of `image`.

    Args:
        image: image array of shape (height, width, channels).
        bboxes: array of ``[x1, y1, x2, y2, action_id]`` rows with coordinates
            normalized to [0, 1]; may be empty.  The array is not modified.

    Returns:
        A copy of `image` with one rectangle and action label per box.
    """
    image = np.copy(image)
    # Work on a private copy: scaling the caller's array in place would
    # corrupt it for any later use (and scale it again on a second call).
    boxes = np.array(bboxes, dtype=float)
    if boxes.size == 0:
        # Nothing to draw; also covers the 1-D empty array produced when a
        # timestamp has no annotations.
        return image

    height, width, _ = image.shape
    boxes[:, [0, 2]] = boxes[:, [0, 2]] * width
    boxes[:, [1, 3]] = boxes[:, [1, 3]] * height
    actions = buildActionDict()

    bbox_color = (255, 0, 255)
    text_color = (255 - bbox_color[0], 255 - bbox_color[1], 255 - bbox_color[2])
    font_size = 0.4
    font_thickness = 1
    for bbox in boxes:
        top_left = (int(bbox[0]), int(bbox[1]))
        bottom_right = (int(bbox[2]), int(bbox[3]))
        cv2.rectangle(image, top_left, bottom_right, bbox_color, 2)
        # The id is stored as a float inside the array; the dict keys are ints.
        bbox_text = actions[int(bbox[4])]
        t_size = cv2.getTextSize(bbox_text, 0, font_size, font_thickness)[0]
        # Filled background for the label, placed just above the box.
        cv2.rectangle(
            image,
            top_left,
            (top_left[0] + t_size[0], top_left[1] - t_size[1] - 3),
            bbox_color,
            -1,
        )
        cv2.putText(
            image,
            bbox_text,
            (top_left[0], top_left[1] - 2),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_size,
            text_color,
            font_thickness,
            lineType=cv2.LINE_AA,
        )
    return image

In [5]:
def draw_objects(image, bboxes, classes):
    """Show every detected person as a cropped image in its own OpenCV window.

    Args:
        image: image array of shape (height, width, channels).
        bboxes: array of ``[c_x, c_y, w, h, class_id, ...]`` rows with
            coordinates normalized to [0, 1].  The array is not modified.
        classes: mapping from class id to a name used in the window title.

    Returns:
        None.  Windows are named ``"<class name>_<running person index>"``.
    """
    image = np.copy(image)
    height, width, _ = image.shape
    # Scale a private copy so the caller's detections stay normalized.
    boxes = np.array(bboxes, dtype=float)
    if boxes.size == 0:
        return
    boxes[:, [0, 2]] = boxes[:, [0, 2]] * width
    boxes[:, [1, 3]] = boxes[:, [1, 3]] * height

    person_count = 0
    for bbox in boxes:
        class_id = int(bbox[4])
        if class_id != 0:
            # Only class id 0 is displayed.
            continue
        person_count += 1

        c_x = int(bbox[0])
        c_y = int(bbox[1])
        half_w = int(bbox[2] / 2)
        half_h = int(bbox[3] / 2)
        # Clamp the crop rectangle to the image borders.
        left = max(c_x - half_w, 0)
        top = max(c_y - half_h, 0)
        right = min(c_x + half_w, width)
        bottom = min(c_y + half_h, height)

        obj = image[top:bottom, left:right, :]
        if obj.size == 0:
            # cv2.imshow rejects images with zero width or height.
            continue
        windowName = "{}_{}".format(classes[class_id], person_count)
        cv2.namedWindow(windowName, cv2.WINDOW_AUTOSIZE)
        cv2.imshow(windowName, obj)

In [6]:
def buildYoloModel():
    """Create a YOLOv4 detector and load its weights.

    The class-name file (``coco.names``) and the 608x608 input size are set
    before the model is built; the weights are then read from
    ``yolov4.weights`` with ``weights_type='yolo'``.

    Returns:
        The configured ``YOLOv4`` instance.
    """
    detector = YOLOv4()
    # Configuration is assigned before make_model() is called.
    detector.classes = "coco.names"
    detector.input_size = (608, 608)
    detector.make_model()
    detector.load_weights("yolov4.weights", weights_type='yolo')
    return detector

In [None]:
def run(media_path, yolo, groundTruth_df, iou_threshold = 0.5, score_threshold = 0.5, start_time = 902, end_time = 1798):
    """Detect, track and classify the actions of people in a video segment.

    Every frame whose timestamp lies in ``[start_time, end_time + 1)`` seconds
    is passed through the YOLO detector and the `Tracker`.  Every 8th processed
    frame the action detector is run on the active actors and the top-5
    actions per actor are printed.  The annotated and the original frame are
    shown in the "result" and "origin" windows; pressing ``q`` stops the loop.

    Args:
        media_path: path of the video file.
        yolo: detector object providing ``predict`` and ``draw_bboxes``.
        groundTruth_df: accepted for interface compatibility; currently unused.
        iou_threshold, score_threshold: forwarded to ``yolo.predict``.
        start_time, end_time: processed time window in seconds.

    Raises:
        FileNotFoundError: if `media_path` does not exist.
    """
    if not os.path.exists(media_path):
        raise FileNotFoundError("{} does not exist".format(media_path))

    cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
    cv2.namedWindow("origin", cv2.WINDOW_AUTOSIZE)

    cap = cv2.VideoCapture(media_path)
    try:
        tracker = Tracker()
        action_freq = 8   # run the action detector every `action_freq` frames
        max_actors = 14   # upper bound on the actors fed to the action detector
        W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        T = tracker.timesteps
        act_detector = act.Action_Detector('soft_attn')
        ckpt_name = 'model_ckpt_soft_attn_pooled_cosine_drop_ava-130'
        memory_size = act_detector.timesteps - action_freq
        updated_frames, temporal_rois, temporal_roi_batch_indices, cropped_frames = act_detector.crop_tubes_in_tf_with_memory([T,H,W,3], memory_size)

        rois, roi_batch_indices, pred_probs = act_detector.define_inference_with_placeholders_noinput(cropped_frames)

        ckpt_path = os.path.join('./', 'action_detection', 'weights', ckpt_name)
        act_detector.restore_model(ckpt_path)

        # actor id -> list of (action name, probability) of the latest inference
        prob_dict = {}

        frame_cnt = 0

        if cap.isOpened():
            while True:
                try:
                    is_success, frame = cap.read()
                except cv2.error:
                    # Best effort: skip frames that fail to decode.
                    continue

                # The end of the stream must be tested before the start-time
                # skip, otherwise a video shorter than `start_time` never
                # leaves the loop.
                if not is_success:
                    break

                now_second = cap.get(cv2.CAP_PROP_POS_MSEC)/1000

                if now_second < start_time:
                    continue
                if now_second >= end_time+1:
                    break

                frame_cnt += 1
                print(frame_cnt)

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                bboxes = yolo.predict(
                    frame,
                    iou_threshold=iou_threshold,
                    score_threshold=score_threshold,
                )
                # Sort the detection rows in place by their first two columns
                # through a structured view of the raw 8-byte values.
                # assumes bboxes is a contiguous float64 (N, 6) array - TODO confirm
                bboxes.view('i8,i8,i8,i8,i8,i8').sort(order=['f0','f1'], axis=0)

                tracker.update_tracker(bboxes, frame)
                no_actors = len(tracker.active_actors)

                if tracker.active_actors and frame_cnt % action_freq == 0:
                    # Start of the timed section (the timer variable used by
                    # the report below was previously never assigned).
                    action_start = time.time()

                    cur_input_sequence = np.expand_dims(np.stack(tracker.frame_history[-action_freq:], axis=0), axis=0)

                    rois_np, temporal_rois_np = tracker.generate_all_rois()
                    if no_actors > max_actors:
                        no_actors = max_actors
                        rois_np = rois_np[:max_actors]
                        temporal_rois_np = temporal_rois_np[:max_actors]

                    feed_dict = {updated_frames:cur_input_sequence, # only update last #action_freq frames
                                 temporal_rois: temporal_rois_np,
                                 temporal_roi_batch_indices: np.zeros(no_actors),
                                 rois:rois_np,
                                 roi_batch_indices:np.arange(no_actors)}
                    run_dict = {'pred_probs': pred_probs}
                    out_dict = act_detector.session.run(run_dict, feed_dict=feed_dict)
                    probs = out_dict['pred_probs']
                    # associate probs with actor ids
                    print_top_k = 5
                    for bb in range(no_actors):
                        act_probs = probs[bb]
                        order = np.argsort(act_probs)[::-1]
                        cur_actor_id = tracker.active_actors[bb]['actor_id']
                        print("Person %i" % cur_actor_id)
                        cur_results = []
                        for pp in range(print_top_k):
                            print('\t %s: %.3f' % (act.ACTION_STRINGS[order[pp]], act_probs[order[pp]]))
                            cur_results.append((act.ACTION_STRINGS[order[pp]], act_probs[order[pp]]))
                        prob_dict[cur_actor_id] = cur_results

                    print('action %.2f seconds' % (time.time() - action_start))

                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                image = yolo.draw_bboxes(frame, bboxes)

                cv2.imshow("result", image)
                cv2.imshow("origin", frame)

                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Release the capture and close the windows even if the loop fails.
        cap.release()
        cv2.destroyAllWindows()

In [40]:
def objectDetection(path, media_name, yolo, iou_threshold = 0.5, score_threshold = 0.5, start_time = 902, end_time = 1798):
    """Run YOLO on a video segment and collect the class-0 detections.

    For every frame whose timestamp lies in ``[start_time, end_time + 1)``
    seconds, each detection with class id 0 is appended as
    ``[media_name, now_second, *detection_row]`` to the module-level list
    ``objs``.

    NOTE: ``objs`` is a global that the caller must create (and reset)
    before calling this function; nothing is returned.

    Args:
        path: directory prefix, concatenated as-is with `media_name`
            (so it must end with a path separator).
        media_name: file name of the video.
        yolo: detector object providing ``predict``.
        iou_threshold, score_threshold: forwarded to ``yolo.predict``.
        start_time, end_time: processed time window in seconds.

    Raises:
        FileNotFoundError: if the video file does not exist.
    """
    media_path = path + media_name

    if not os.path.exists(media_path):
        raise FileNotFoundError("{} does not exist".format(media_path))

    cap = cv2.VideoCapture(media_path)
    try:
        if cap.isOpened():
            while True:
                try:
                    is_success, frame = cap.read()
                except cv2.error:
                    # Best effort: skip frames that fail to decode.
                    continue

                # Test for the end of the stream first; otherwise a video
                # shorter than `start_time` would spin in the skip branch
                # below forever.
                if not is_success:
                    break

                now_second = cap.get(cv2.CAP_PROP_POS_MSEC)/1000

                if now_second < start_time:
                    continue
                if now_second >= end_time+1:
                    break

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                bboxes = yolo.predict(
                    frame,
                    iou_threshold=iou_threshold,
                    score_threshold=score_threshold,
                )
                # Sort the detection rows in place by their first two columns
                # through a structured view of the raw 8-byte values.
                # assumes bboxes is a contiguous float64 (N, 6) array - TODO confirm
                bboxes.view('i8,i8,i8,i8,i8,i8').sort(order=['f0','f1'], axis=0)
                for bb in bboxes:
                    if bb[4] == 0:
                        objs.append([media_name, now_second]+list(bb))
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Release the capture even if detection fails half-way.
        cap.release()

In [9]:
# Map every video id (file name without extension) to its file name.
with open('ava_videos/ava_file_names_trainval_v2.1.txt', 'r') as f:
    video_names = f.readlines()
# Blank lines are skipped; they have no extension part and used to raise
# IndexError when the mapping was built.
video_names = [v.rstrip().split('.') for v in video_names if v.strip()]
video_names_dict = {video[0]: video[0]+'.'+video[1] for video in video_names}

columns = ['video_id', 'timestamp', 'x1', 'y1', 'x2', 'y2', 'action_id', 'person_id']


def _load_ava_annotations(csv_path):
    """Read one AVA annotation CSV and replace video ids by video file names.

    Rows with missing values and rows whose video id is the spreadsheet
    error marker ``#NAME?`` are dropped.  Ids that are absent from
    `video_names_dict` become NaN in the ``video_id`` column.
    """
    # NOTE(review): read_csv treats the first line of the file as a header,
    # which is then overwritten by `columns`; if the CSV has no header row
    # its first annotation is lost - confirm against the files on disk.
    annotations = pd.read_csv(csv_path)
    annotations.columns = columns
    annotations = annotations.dropna()
    annotations = annotations[annotations['video_id'] != '#NAME?']
    return annotations.assign(video_id=annotations['video_id'].map(video_names_dict))


# Both splits go through the same cleaning steps.
train_df = _load_ava_annotations('ava_videos/ava_train_v2.2.csv')
val_df = _load_ava_annotations('ava_videos/ava_val_v2.2.csv')

train_videos = train_df['video_id'].unique()
val_videos = val_df['video_id'].unique()

In [None]:
yolo = buildYoloModel()

train_path = "ava_videos/train/"
val_path = "ava_videos/val/"

# Run person detection on every validation video and write one CSV per video
# (named "<video file name>.csv") into the working directory.
cnt = 0
for cnt, media_name in enumerate(val_videos, start=1):
    start = time.time()
    # objectDetection() appends its rows to this notebook-global list, so it
    # has to be reset before each video.
    objs = []
    print("Video", cnt, ", Processing:", media_name)
    objectDetection(val_path, media_name, yolo)
    print("Done in ", time.time()-start, 'seconds')
    res = pd.DataFrame(
        objs,
        columns=['video_id','timestamp','c_x','c_y','w','h','obj_id','confidence'],
    )
    res.to_csv(media_name+'.csv', index=False)

Video 1 , Processing: 1j20qq1JyX4.mp4


In [11]:
# Close all OpenCV windows (e.g. the ones left open when run() is interrupted).
cv2.destroyAllWindows()

In [16]:
# NOTE(review): `media_path` is not assigned in any visible cell, and
# `media_name` is whatever the validation loop left behind while the ground
# truth is taken from train_df - presumably both were set interactively;
# define them explicitly before relying on Restart & Run All.
run(media_path, yolo, train_df[train_df['video_id']==media_name])

[[0.64625376 0.52953063 0.56066054 0.9593384  0.         0.95671797]]
[[0.64709496 0.52929073 0.55817723 0.96114922 0.         0.96396029]]
[[0.64737463 0.52802891 0.55543369 0.95834234 0.         0.96096146]]
[[0.647035   0.52766768 0.55654144 0.95937453 0.         0.95982504]]
[[0.6469717  0.52754074 0.5589447  0.96170722 0.         0.95807731]]
[[0.64701718 0.52735329 0.55822039 0.96316825 0.         0.95761442]]
[[0.64754409 0.52748617 0.55817181 0.95704492 0.         0.95818079]]
[[0.64795047 0.52818341 0.56000203 0.95656247 0.         0.96046382]]
[[0.64860839 0.52779526 0.55726939 0.9577677  0.         0.96552509]]


KeyboardInterrupt: 

In [18]:
import tensorflow as tf
# Fail fast unless TensorFlow executes eagerly.
# NOTE(review): the message mentions Sonnet v2, which no visible cell uses -
# presumably copied from another notebook.
assert tf.executing_eagerly(), "Sonnet v2 requires TensorFlow 2"

In [10]:
# Display the action id -> action name mapping parsed from the label file.
buildActionDict()

{1: 'bend/bow (at the waist)',
 3: 'crouch/kneel',
 4: 'dance',
 5: 'fall down',
 6: 'get up',
 7: 'jump/leap',
 8: 'lie/sleep',
 9: 'martial art',
 10: 'run/jog',
 11: 'sit',
 12: 'stand',
 13: 'swim',
 14: 'walk',
 15: 'answer phone',
 17: 'carry/hold (an object)',
 20: 'climb (e.g., a mountain)',
 22: 'close (e.g., a door, a box)',
 24: 'cut',
 26: 'dress/put on clothing',
 27: 'drink',
 28: 'drive (e.g., a car, a truck)',
 29: 'eat',
 30: 'enter',
 34: 'hit (an object)',
 36: 'lift/pick up',
 37: 'listen (e.g., to music)',
 38: 'open (e.g., a window, a car door)',
 41: 'play musical instrument',
 43: 'point to (an object)',
 45: 'pull (an object)',
 46: 'push (an object)',
 47: 'put down',
 48: 'read',
 49: 'ride (e.g., a bike, a car, a horse)',
 51: 'sail boat',
 52: 'shoot',
 54: 'smoke',
 56: 'take a photo',
 57: 'text on/look at a cellphone',
 58: 'throw',
 59: 'touch (an object)',
 60: 'turn (e.g., a screwdriver)',
 61: 'watch (e.g., TV)',
 62: 'work on a computer',
 63: 'writ

In [13]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

logging.set_verbosity(logging.ERROR)

# Some modules to help with reading the UCF101 dataset.
import random
import re
import os
import tempfile
import ssl
import cv2
import numpy as np

# Some modules to display an animation using imageio.
import imageio
from IPython import display

from urllib import request  # requires python3

In [37]:
# Utilities to fetch videos from UCF101 dataset
# Base URL of the video listing; individual file names are joined onto it.
UCF_ROOT = "https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/"
# Cache of the parsed listing, filled on the first list_ucf_videos() call.
_VIDEO_LIST = None
# A fresh temporary directory is created every time this cell runs, so
# downloads are not reused across kernel restarts.
_CACHE_DIR = tempfile.mkdtemp()
# As of July 2020, crcv.ucf.edu doesn't use a certificate accepted by the
# default Colab environment anymore.
# SECURITY: this context disables TLS certificate verification (and relies on
# a private ssl helper); downloads made with it are not authenticated.
unverified_context = ssl._create_unverified_context()

def list_ucf_videos():
  """Lists videos available in UCF101 dataset.

  The listing page at UCF_ROOT is downloaded on the first call and its sorted,
  de-duplicated ``v_*.avi`` file names are cached in the module-level
  ``_VIDEO_LIST``.

  Returns:
    A new list with the video file names (safe for the caller to modify).
  """
  global _VIDEO_LIST
  if not _VIDEO_LIST:
    index = request.urlopen(UCF_ROOT, context=unverified_context).read().decode("utf-8")
    # Raw string: "\w" and "\." are regex escapes, not string escapes.
    videos = re.findall(r"(v_[\w_]+\.avi)", index)
    _VIDEO_LIST = sorted(set(videos))
  return list(_VIDEO_LIST)

def fetch_ucf_video(video):
  """Fetches a video and caches it in the local filesystem.

  Args:
    video: file name of the video, relative to UCF_ROOT.

  Returns:
    The path of the cached file inside ``_CACHE_DIR``; the download is
    skipped when that file already exists.
  """
  # Use the documented location of urljoin instead of the name that
  # urllib.request happens to re-export.
  from urllib.parse import urljoin

  cache_path = os.path.join(_CACHE_DIR, video)
  if not os.path.exists(cache_path):
    urlpath = urljoin(UCF_ROOT, video)
    print("Fetching %s => %s" % (urlpath, cache_path))
    data = request.urlopen(urlpath, context=unverified_context).read()
    # The context manager closes (and flushes) the file deterministically.
    with open(cache_path, "wb") as cache_file:
      cache_file.write(data)
  return cache_path

# Utilities to open video files using CV2
def crop_center_square(frame):
  """Return the centred square crop of `frame`.

  The side of the square equals the shorter of the first two dimensions of
  `frame`; any further dimensions are kept unchanged.
  """
  height, width = frame.shape[0:2]
  side = min(height, width)
  top = (height // 2) - (side // 2)
  left = (width // 2) - (side // 2)
  return frame[top:top + side, left:left + side]

def load_video(path, max_frames=0, resize=(224, 224), start_time=902, end_time=907, output_path='test.avi'):
  """Load a time window of a video as a normalized RGB frame array.

  Frames whose timestamp lies in ``[start_time, end_time + 1)`` seconds are
  written unchanged to `output_path` and, after a centre square crop, a
  resize to `resize` and a BGR -> RGB channel swap, collected for the result.

  Args:
    path: path of the video file.
    max_frames: stop after this many collected frames (0 means no limit).
    resize: target ``(width, height)`` passed to ``cv2.resize``.
    start_time, end_time: time window in seconds.
    output_path: file that receives the selected frames (MJPG, 30 fps, source
      frame size); pass None to skip writing.

  Returns:
    Array of the collected frames divided by 255.0.
  """
  cap = cv2.VideoCapture(path)
  writer = None
  frames = []
  try:
    if output_path is not None:
      size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
      writer = cv2.VideoWriter(output_path,
                               cv2.VideoWriter_fourcc(*'MJPG'),
                               30, size)
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      now_second = cap.get(cv2.CAP_PROP_POS_MSEC)/1000
      if now_second < start_time: continue
      if (now_second >= end_time+1): break
      if writer is not None:
        writer.write(frame)
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, resize)
      # Reverse the channel order (BGR -> RGB).
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)

      if len(frames) == max_frames:
        break
  finally:
    cap.release()
    # The writer was previously never released, which can leave the output
    # file unfinished.
    if writer is not None:
      writer.release()
  return np.array(frames) / 255.0

In [15]:
# Load the I3D Kinetics-400 module from TF Hub and keep its 'default' signature.
i3d = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-400/1").signatures['default']

In [16]:
def predict(sample_video):
  """Print the five most probable actions for a video clip.

  Args:
    sample_video: frames of one clip; converted to a float32 tensor and
      given a leading batch axis before being passed to the global `i3d`
      model.  Class names are read from the global `labels` list.
  """
  # Add a batch axis to the sample video.
  batched_clip = tf.constant(sample_video, dtype=tf.float32)[tf.newaxis, ...]

  class_scores = i3d(batched_clip)['default'][0]
  probabilities = tf.nn.softmax(class_scores)

  print("Top 5 actions:")
  # Indices of the five largest probabilities, highest first.
  top_five = np.argsort(probabilities)[::-1][:5]
  for idx in top_five:
    print(f"  {labels[idx]:22}: {probabilities[idx] * 100:5.2f}%")

In [24]:
# NOTE(review): with the fetch below commented out, `video_path` is not
# assigned in any visible cell - presumably set interactively; define it
# before relying on Restart & Run All.
#video_path = fetch_ucf_video("v_CricketShot_g04_c02.avi")
sample_video = load_video(video_path)

In [19]:
# Get the kinetics-400 action labels from the GitHub repository.
KINETICS_URL = "https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt"
# One label per line; `labels` is the global list that predict() indexes, so
# this cell has to run before predict() is called.
with request.urlopen(KINETICS_URL) as obj:
  labels = [line.decode("utf-8").strip() for line in obj.readlines()]
print("Found %d labels." % len(labels))

Found 400 labels.


In [38]:
# NOTE(review): `media_path` is not assigned in any visible cell - presumably
# set interactively; confirm before relying on Restart & Run All.
sample_media = load_video(media_path)

In [35]:
# Print the top-5 I3D predictions for the clip loaded above.
predict(sample_media)

Top 5 actions:
  shaving legs          :  9.57%
  dancing ballet        :  8.78%
  plastering            :  5.42%
  stretching arm        :  4.80%
  yawning               :  4.43%


In [27]:
import inspect
# Print the source of the detector's `inference` method for reference.
lines = inspect.getsource(yolo.inference)
print(lines)

    def inference(
        self,
        media_path,
        is_image: bool = True,
        cv_apiPreference=None,
        cv_frame_size: tuple = None,
        cv_fourcc: str = None,
        cv_waitKey_delay: int = 1,
        iou_threshold: float = 0.3,
        score_threshold: float = 0.25,
    ):
        if not path.exists(media_path):
            raise FileNotFoundError("{} does not exist".format(media_path))

        cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)

        if is_image:
            frame = cv2.imread(media_path)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            start_time = time.time()
            bboxes = self.predict(
                frame,
                iou_threshold=iou_threshold,
                score_threshold=score_threshold,
            )
            exec_time = time.time() - start_time
            print("time: {:.2f} ms".format(exec_time * 1000))

            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            image = self.draw