# Action recognition with an inflated 3D CNN

## Setup

In [1]:
# TF and TF-Hub modules
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed

logging.set_verbosity(logging.ERROR)

In [2]:
# Modules for reading the UCF101 dataset
import random
import re
import os
import tempfile
import cv2
import numpy as np

In [3]:
# Modules for display and animation
import imageio
from IPython import display

from urllib import request  # requires python 3

In [5]:
# Utilities to fetch videos from UCF101 dataset
UCF_ROOT = 'http://crcv.ucf.edu/THUMOS14/UCF101/UCF101/'
_VIDEO_LIST = None
_CACHE_DIR = tempfile.mkdtemp()


def list_ucf_videos():
    """Lists videos available in UCF101 dataset."""
    global _VIDEO_LIST
    if not _VIDEO_LIST:
        index = request.urlopen(UCF_ROOT).read().decode('utf-8')
        videos = re.findall('(v_[\w_]+\.avi)', index)
        _VIDEO_LIST = sorted(set(videos))
    return list(_VIDEO_LIST)


def fetch_ucf_video(video):
    """Fetches a video and caches in local filesystem."""
    cache_path = os.path.join(_CACHE_DIR, video)
    if not os.path.exists(cache_path):
        urlpath = request.urljoin(UCF_ROOT, video)
        print('Fetching %s => %s' % (urlpath, cache_path))
        data = request.urlopen(urlpath).read()
        open(cache_path, 'wb').write(data)
    return cache_path

In [6]:
# Utilities to open video files using CV2
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y:start_y + min_dim,
                 start_x:start_x + min_dim]


def load_video(path, max_frames=0, resize=(224, 224)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # TODO: ?
            frames.append(frame)
            
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames) / 255.0  # TODO: ?


def to_gif(images):
    converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
    imageio.mimsave('./animation.gif', converted_images, fps=25)
    return embed.embed_file('./animation.gif')


In [7]:
# Get the kinetics-400 action labels from the Github repo
KINETICS_URL = 'https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt'
with request.urlopen(KINETICS_URL) as obj:
    labels = [line.decode('utf-8').strip() for line in obj.readlines()]
print('Found %d labels.' % len(labels))

Found 400 labels.


## Using the UCF101 dataset

In [8]:
# Get the list of videos in the dataset.
ucf_videos = list_ucf_videos()

categories = {}
for video in ucf_videos:
    category = video[2:-12]  # TODO: ?
    if category not in categories:
        categories[category] = []
    categories[category].append(video)
print('Found %d videos in %d categories.' % (len(ucf_videos), len(categories)))

for category, sequences in categories.items():
    summary = ', '.join(sequences[:2])
    print('%-20s %4d videos (%s, ...)' % (category, len(sequences), summary))

Found 13320 videos in 101 categories.
ApplyEyeMakeup        145 videos (v_ApplyEyeMakeup_g01_c01.avi, v_ApplyEyeMakeup_g01_c02.avi, ...)
ApplyLipstick         114 videos (v_ApplyLipstick_g01_c01.avi, v_ApplyLipstick_g01_c02.avi, ...)
Archery               145 videos (v_Archery_g01_c01.avi, v_Archery_g01_c02.avi, ...)
BabyCrawling          132 videos (v_BabyCrawling_g01_c01.avi, v_BabyCrawling_g01_c02.avi, ...)
BalanceBeam           108 videos (v_BalanceBeam_g01_c01.avi, v_BalanceBeam_g01_c02.avi, ...)
BandMarching          155 videos (v_BandMarching_g01_c01.avi, v_BandMarching_g01_c02.avi, ...)
BaseballPitch         150 videos (v_BaseballPitch_g01_c01.avi, v_BaseballPitch_g01_c02.avi, ...)
BasketballDunk        131 videos (v_BasketballDunk_g01_c01.avi, v_BasketballDunk_g01_c02.avi, ...)
Basketball            134 videos (v_Basketball_g01_c01.avi, v_Basketball_g01_c02.avi, ...)
BenchPress            160 videos (v_BenchPress_g01_c01.avi, v_BenchPress_g01_c02.avi, ...)
Biking              

In [9]:
# Get a sample cricket (game) video
video_path = fetch_ucf_video('v_CricketShot_g04_c02.avi')
sample_video = load_video(video_path)

Fetching http://crcv.ucf.edu/THUMOS14/UCF101/UCF101/v_CricketShot_g04_c02.avi => /tmp/tmpasv5ah18/v_CricketShot_g04_c02.avi


In [10]:
sample_video.shape

(116, 224, 224, 3)

### Inflated 3D ConvNet

In [11]:
# Fetch the i3d model from TF-Hub
i3d = hub.load('https://tfhub.dev/deepmind/i3d-kinetics-400/1').signatures['default']

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


##### Deprecation warning
```
WARNING:tensorflow:From /home/ivogeorg/anaconda3/envs/action-recognition/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
```

Run i3d model and print the top-5 action predictions:

In [12]:
def predict(sample_video):
    # Add a batch axis to the sample video
    model_input = tf.constant(sample_video,
                              dtype=tf.float32)[tf.newaxis, ...]
    
    logits = i3d(model_input)['default'][0]  # TODO: Grok
    probabilities = tf.nn.softmax(logits)
    
    print('Top 5 actions:')
    for i in np.argsort(probabilities)[::-1][:5]:  # TODO: Parse
        print(f'    {labels[i]:22}: {probabilities[i] * 100:5.2f}%')

In [13]:
predict(sample_video)

Top 5 actions:
    playing cricket       : 97.77%
    skateboarding         :  0.71%
    robot dancing         :  0.56%
    roller skating        :  0.56%
    golf putting          :  0.13%


Test with [a video by Patrick Gillett](https://commons.wikimedia.org/wiki/File:End_of_a_jam.ogv) from [Wikimedia](https://commons.wikimedia.org/wiki/Category:Videos_of_sports):

In [16]:
video_path = 'End_of_a_jam.ogv'
sample_video = load_video(video_path)[:100]
sample_video.shape

(0,)

##### Library error
```
[ERROR:0] global /io/opencv/modules/videoio/src/cap.cpp (116) open VIDEOIO(CV_IMAGES): raised OpenCV exception:

OpenCV(4.2.0) /io/opencv/modules/videoio/src/cap_images.cpp:253: error: (-5:Bad argument) CAP_IMAGES: can't find starting number (in the name of file): End_of_a_jam.ogv in function 'icvExtractPattern'
```