Code to extract C3D Features from the videos in the BDDX Dataset

This code is based on the C3D Implementation found at https://github.com/karolzak/conv3d-video-action-recognition

# C3D Model

In [26]:
# Relative path to the local checkout of the conv3d-video-action-recognition repository.
# Later cells interpolate it into the %run script paths and into the weights-file path.
c3d_dir = "conv3d-video-action-recognition"

In [27]:
# Execute the helper scripts shipped with the C3D repository inside this notebook's namespace,
# so that whatever they define is available to the cells below.
%run {c3d_dir}/python/data_prep.py
%run {c3d_dir}/python/mpypl_pipe_func.py
%run {c3d_dir}/python/mpypl_pipes.py

In [28]:
# C3D_model function
# (presumably this script is what defines get_video_descriptor, used just below -- TODO confirm)
%run {c3d_dir}/python/c3dmodel.py

# Build the feature extractor from the Sports-1M weights file inside the repository checkout.
# Per the log printed by this cell, the loader pops the last 3 layers of the network.
MODEL = get_video_descriptor(weights_path='%s/models/weights_C3D_sports1M_tf.h5'%(c3d_dir))

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Loading Model Weights from conv3d-video-action-recognition/models/weights_C3D_sports1M_tf.h5
Popping last 3 layers


In [5]:
"""
The input to this model is 16 frames, in the shape of (batch,16,112,112,3)
"""

# Print the layer table to check the architecture and each layer's output shape.
MODEL.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1 (Conv3D)               (None, 16, 112, 112, 64)  5248      
_________________________________________________________________
pool1 (MaxPooling3D)         (None, 16, 56, 56, 64)    0         
_________________________________________________________________
conv2 (Conv3D)               (None, 16, 56, 56, 128)   221312    
_________________________________________________________________
pool2 (MaxPooling3D)         (None, 8, 28, 28, 128)    0         
_________________________________________________________________
conv3a (Conv3D)              (None, 8, 28, 28, 256)    884992    
_________________________________________________________________
conv3b (Conv3D)              (None, 8, 28, 28, 256)    1769728   
_________________________________________________________________
pool3 (MaxPooling3D)         (None, 4, 14, 14, 256)    0         
__________

# Get video frames

In [16]:
import moviepy.editor as mp
import torch

def sample_frames(clip,t=0,count=16,clip_len=1):
    """Sample `count` frames at equal intervals from a subclip of `clip`.

    Frame i (0-based) is taken at time ``t + clip_len * i / count``, so the
    samples cover the half-open interval [t, t + clip_len).

    Args:
        clip: object exposing ``get_frame(time)`` that returns one frame as an
            array (e.g. a moviepy ``VideoFileClip``).
        t: time, in seconds, of the first sampled frame.
        count: number of frames to sample.
        clip_len: length, in seconds, of the subclip being sampled.

    Returns:
        A float32 ``torch.Tensor`` holding the frames stacked along a new
        leading axis, i.e. shape ``(count, *frame_shape)``. When ``count`` is
        not positive an empty tensor is returned.
    """
    # Local import: numpy is not imported at notebook level in the cells shown.
    import numpy as np

    if count <= 0:
        return torch.Tensor([])

    # Sample at equal intervals across the subclip
    timestamps = [t + clip_len * (i / count) for i in range(count)]
    # Stack once in NumPy and convert once: building a tensor directly from a
    # Python list of ndarrays converts element by element and is very slow.
    frames = np.stack([clip.get_frame(timestamp) for timestamp in timestamps])
    return torch.from_numpy(frames).float()

def get_frames(clip,clip_start,clip_end,frame_count=16,clip_len=1):
    """Cut a BDD-X sample into consecutive subclips and sample frames from each.

    Args:
        clip: video clip handed through to ``sample_frames``.
        clip_start, clip_end: times of the BDD-X sample; e.g. the first sample
            for video [0] has clip_start=0, clip_end=11. They are passed to
            ``range`` together with ``clip_len``, so all three must be integers.
        frame_count: number of frames sampled from each subclip.
        clip_len: length, in seconds, of each subclip.

    Returns:
        A list with one tensor per subclip start time in
        ``range(clip_start, clip_end, clip_len)``. Each tensor is the result of
        ``sample_frames`` with a leading batch axis of size 1 added.
    """
    return [
        sample_frames(clip,t=subclip_start,count=frame_count,clip_len=clip_len).unsqueeze(0)
        for subclip_start in range(clip_start,clip_end,clip_len)
    ]
        

# Single example video used to demonstrate the frame-sampling helpers.
url = "https://s3-us-west-2.amazonaws.com/sq8geewpqu/samples-1k/06d501fd-a9ffc960.mov"
# Decode at 112x112, the spatial size the model input expects (batch,16,112,112,3).
clip = mp.VideoFileClip(url,target_resolution=[112,112])
# get_frames(.) returns a list, where each element is 16 frames chosen uniformly across a 1-second subclip
#   Each element of nn_inputs should be put through the network and then fed into the Transformer to find an
#   overall representation of the clip
nn_inputs = get_frames(clip,0,11)
# Seconds 0..10 give 11 subclips; each element carries a leading batch axis of size 1.
print(len(nn_inputs))
print(nn_inputs[-1].shape)


11
torch.Size([1, 16, 112, 112, 3])


# Run video through the model

In [7]:
# Run only the first 1-second subclip (a batch of one 16-frame stack) through the network.
output = MODEL.predict(nn_inputs[0])

In [8]:
# Inspect the extracted feature values and confirm the shape of the result.
print(output)
print(output.shape)

[[0.01433312 0.         0.         ... 0.         0.         0.07065101]]
(1, 4096)


# Full pipeline
## Extract C3D Features for each clip in BDDX

In [1]:
# Open CSV file
import pandas as pd
# Path of the annotation CSV, relative to the notebook's working directory.
data_url = "../revisedBDDX.csv"
def load_bddx_data(csv_name):
    """Read the BDD-X annotation CSV into a DataFrame with fixed column names.

    The columns are 'Index' and 'InputVideo', followed by four columns for
    each of the 15 annotation slots k = 1..15: 'kS', 'kE', 'kA', 'kJ'.
    The S/E columns are read elsewhere in this notebook as a segment's start
    and end time; A/J are presumably the action and justification text
    (TODO confirm against the CSV).

    Because explicit names are supplied, pandas does not consume a header
    line: if the file has one, it comes back as the first data row.

    Args:
        csv_name: path or URL of the CSV file.

    Returns:
        The parsed pandas DataFrame.
    """
    slot_suffixes = ('S', 'E', 'A', 'J')
    header = ['Index', 'InputVideo']
    for slot in range(1, 16):
        header.extend('%d%s' % (slot, suffix) for suffix in slot_suffixes)

    return pd.read_csv(csv_name, names=header)

bddx = load_bddx_data(data_url)
# Discard the first row -- presumably the CSV's own header line, which load_bddx_data returns as
# data because it passes explicit column names (TODO confirm against the file).
bddx = bddx.drop(bddx.index[0])

In [33]:
# Extract one (n_subclips, vector_size) C3D feature matrix for every annotated segment of
# every video listed in bddx, appending them to feature_vectors in iteration order.
#
# Author's Note: this has never been run over the full dataset. I'm not sure if the entire
#   feature_vectors list will fit into memory or not. Try this at your own risk.

from math import isnan

import numpy as np

vector_size = 4096
feature_vectors = []

cnt = 0  # Not used in this cell; kept in case another cell reads it.


def _segment_time(value):
    """Return an annotation time as an int, or None when the cell is empty (NaN)."""
    seconds = float(value)  # Accepts both numeric cells and numeric strings such as '11'
    if isnan(seconds):      # Must be tested before int(): int(nan) raises ValueError
        return None
    return int(seconds)


for index,row in bddx.iterrows():
    url = row['InputVideo']
    if not isinstance(url, str): # Empty video cell (NaN): there is nothing to open
        continue
    # Open this row's own video. Relying on a `clip` left over from an earlier cell would
    # silently extract every row's features from the same demo video.
    video_clip = mp.VideoFileClip(url,target_resolution=[112,112])
    try:
        for i in range(1,16):
            start = _segment_time(row['%iS'%(i)])
            finish = _segment_time(row['%iE'%(i)])
            if start is None or finish is None: # Unused annotation slot
                continue
            if finish == start: # Some clips have same start and finish times; avoid 0-second clips
                finish+=1
            if finish < start: # Malformed annotation: no subclips to sample
                continue
            nn_inputs = get_frames(video_clip,start,finish)
            vectors = np.zeros((len(nn_inputs),vector_size))
            for j,nn_input in enumerate(nn_inputs):
                # Hand the network a NumPy array rather than a torch tensor
                vectors[j] = MODEL.predict(nn_input.numpy())
            feature_vectors.append(vectors)
    finally:
        # Release the video reader even if sampling or prediction fails
        video_clip.close()

print(len(feature_vectors))
if feature_vectors: # Avoid an IndexError when nothing was extracted
    print(feature_vectors[0].shape)


(11, 4096)
[[0.01433312 0.         0.         ... 0.         0.         0.07065101]
 [0.01486513 0.         0.         ... 0.         0.         0.07457145]
 [0.01166258 0.         0.         ... 0.         0.         0.10145477]
 ...
 [0.00248984 0.         0.         ... 0.00125599 0.         0.02231222]
 [0.         0.         0.         ... 0.0087162  0.         0.00646217]
 [0.         0.         0.         ... 0.         0.         0.06207535]]
