Code to extract C3D Features from the videos in the BDDX Dataset

This code is based on the C3D Implementation found at https://github.com/karolzak/conv3d-video-action-recognition

# C3D Model

In [None]:
!git clone https://github.com/karolzak/conv3d-video-action-recognition.git

Cloning into 'conv3d-video-action-recognition'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 125 (delta 62), reused 99 (delta 40), pack-reused 0[K
Receiving objects: 100% (125/125), 1.83 MiB | 29.32 MiB/s, done.
Resolving deltas: 100% (62/62), done.


In [None]:
!pip install pipe
!pip install mPyPl

Collecting pipe
  Downloading https://files.pythonhosted.org/packages/50/aa/2c7d8e1131d709d009deb9919c29ee8b1e1b2997034cbd4a440fddbf1d3e/pipe-1.6.0-py2.py3-none-any.whl
Installing collected packages: pipe
Successfully installed pipe-1.6.0
Collecting mPyPl
  Downloading https://files.pythonhosted.org/packages/a5/e0/4338997e7523bb31d664127d912c849f8781e8d5f6e7a5db9aaa38ef98cb/mPyPl-0.0.3.8-py3-none-any.whl
Installing collected packages: mPyPl
Successfully installed mPyPl-0.0.3.8


In [None]:
c3d_dir = "conv3d-video-action-recognition"

In [None]:
%run {c3d_dir}/python/data_prep.py
%run {c3d_dir}/python/mpypl_pipe_func.py
%run {c3d_dir}/python/mpypl_pipes.py

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)3588096/45929032 bytes (7.8%)7536640/45929032 bytes (16.4%)11788288/45929032 bytes (25.7%)15892480/45929032 bytes (34.6%)19865600/45929032 bytes (43.3%)23863296/45929032 bytes (52.0%)27746304/45929032 bytes (60.4%)31686656/45929032 bytes (69.0%)35446784/45929032 bytes (77.2%)39477248/45929032 bytes (86.0%)43491328/45929032 bytes (94.7%)45929032/45929032 bytes (100.0%)
  Done
File saved as /root

In [None]:
# C3D_model function
# Run the upstream model script so that get_video_descriptor (called below) is defined in this notebook.
%run {c3d_dir}/python/c3dmodel.py

# Build the C3D feature extractor from the Sports-1M weights file located inside the cloned repository.
# NOTE(review): the saved output of this cell ends with "OSError: ignored", so the weights file may not
#   have been readable at this path in the recorded run - confirm before relying on MODEL.
MODEL = get_video_descriptor(weights_path='%s/models/weights_C3D_sports1M_tf.h5'%(c3d_dir))

Loading Model Weights from conv3d-video-action-recognition/models/weights_C3D_sports1M_tf.h5


OSError: ignored

In [None]:
"""
The input to this model is 16 frames, in the shape of (batch,16,112,112,3)
"""

MODEL.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1 (Conv3D)               (None, 16, 112, 112, 64)  5248      
_________________________________________________________________
pool1 (MaxPooling3D)         (None, 16, 56, 56, 64)    0         
_________________________________________________________________
conv2 (Conv3D)               (None, 16, 56, 56, 128)   221312    
_________________________________________________________________
pool2 (MaxPooling3D)         (None, 8, 28, 28, 128)    0         
_________________________________________________________________
conv3a (Conv3D)              (None, 8, 28, 28, 256)    884992    
_________________________________________________________________
conv3b (Conv3D)              (None, 8, 28, 28, 256)    1769728   
_________________________________________________________________
pool3 (MaxPooling3D)         (None, 4, 14, 14, 256)    0         
__________

# Get video frames

In [None]:
import moviepy.editor as mp
import torch

def sample_frames(clip,t=0,count=16,clip_len=1):
    """
    Sample `count` frames at equal intervals across a subclip of `clip`.

    Parameters:
        clip     -- object exposing get_frame(time); in this notebook a moviepy VideoFileClip
        t        -- time of the first frame
        count    -- number of frames to sample
        clip_len -- length of the subclip in seconds; frame i is taken at t + clip_len*(i/count)

    Returns:
        A float32 torch tensor with the sampled frames stacked along the first axis
        (an empty tensor when count is 0).
    """
    import numpy as np # Local import: the surrounding cell only imports moviepy and torch

    frames = []
    for i in range(count):
        time = t+clip_len*(i/count) # Sample at equal intervals across the subclip
        frames.append(clip.get_frame(time)) # Grab the frame at the specified time
    # Stack the frames into one array before handing them to torch: building a tensor
    #   directly from a list of arrays converts them element by element and is very slow.
    return torch.from_numpy(np.asarray(frames)).float()

def get_frames(clip,clip_start,clip_end,frame_count=16,clip_len=1):
    """
    Split the span [clip_start, clip_end) of `clip` into consecutive subclips and sample each one.

    clip_start and clip_end are the times of the BDD-X sample, e.g. the first sample
    for video [0] has clip_start=0, clip_end=11. They are passed to range(), together
    with clip_len as the step, so all three must be integers.

    Returns a list with one tensor per subclip: the result of sample_frames() for that
    subclip with an extra leading axis of size 1 added.
    """
    return [
        sample_frames(clip,t=subclip_start,count=frame_count,clip_len=clip_len).unsqueeze(0)
        for subclip_start in range(clip_start,clip_end,clip_len)
    ]


11
torch.Size([1, 16, 112, 112, 3])


In [None]:
url = "https://s3-us-west-2.amazonaws.com/sq8geewpqu/samples-1k/06d501fd-a9ffc960.mov"
clip = mp.VideoFileClip(url,target_resolution=[112,112])
try:
    # get_frames(.) returns a list, where each element is 16 frames chosen uniformly across a 1-second subclip
    #   Each element of nn_inputs should be put through the network and then fed into the Transformer to find an
    #   overall representation of the clip
    nn_inputs = get_frames(clip,0,11)
finally:
    # Always release the video reader, even if frame extraction fails part-way through
    clip.close()
print(len(nn_inputs))
print(nn_inputs[-1].shape)


# Run video through the model

In [None]:
output = MODEL.predict(nn_inputs[0])

In [None]:
print(output)
print(output.shape)

[[0.01433312 0.         0.         ... 0.         0.         0.07065101]]
(1, 4096)


# Text Features 

In [1]:
import pandas as pd # Pandas library enables data manipulation
# NOTE(review): this URL embeds a `token=` query parameter, which looks like an access credential for the
#   repository - confirm, and prefer supplying it at run time (e.g. an environment variable) over committing it.
data_url = "https://raw.githubusercontent.com/linklab-uva/Scenario2Vector/master/BDDX.csv?token=AH3QIX4XTFNXLYCT2FEQHILAEBCMS"
def load_bddx_data(csv_name):
    """
    Read the BDD-X CSV into a DataFrame with fixed column names.

    The columns are 'Index' and 'InputVideo', followed by four columns for each of
    the clip slots 1..15: '<n>S', '<n>E', '<n>A' and '<n>J'.

    csv_name is passed straight to pandas.read_csv (a path, URL or file-like object).
    Because the names are supplied explicitly, every line of the file is read as data.
    """
    column_names = ['Index', 'InputVideo']
    for clip_no in range(1, 16):
        column_names.extend('%i%s' % (clip_no, field) for field in ('S', 'E', 'A', 'J'))

    return pd.read_csv(csv_name, names=column_names)
bddx = load_bddx_data(data_url)
clip_numbers = range(1, 16) # The CSV has 15 clip slots per video

# The start/end timestamps are not needed for the text features
bddx = bddx.drop(['%i%s' % (n, field) for n in clip_numbers for field in ('S', 'E')], axis=1)
bddx = bddx.fillna("")

# Build one caption per clip slot: "<n>A" and "<n>J" joined by a space.
#   An unused slot therefore becomes the single-space string ' '.
for n in clip_numbers:
    bddx['%iAJ' % n] = bddx[['%iA' % n, '%iJ' % n]].agg(' '.join, axis=1)

# Keep only the video URL and the joined captions
bddx = bddx.drop(['Index'] + ['%i%s' % (n, field) for n in clip_numbers for field in ('A', 'J')], axis=1)
# Drop the first row; presumably it is the CSV's own header line, which is read as data
#   because load_bddx_data supplies the column names explicitly - confirm against the file
bddx = bddx.drop(bddx.index[0])

In [2]:
x = bddx.iloc[0]['6AJ']
print(x==' ')

True


In [3]:
# Map each video URL to the list of its non-empty captions, in clip-slot order
urls_captions = {}

for _, video_row in bddx.iterrows():
    slot_captions = [video_row[str(slot)+'AJ'] for slot in range(1,16)]
    # A slot with no caption holds exactly ' ' (two empty strings joined by a space)
    urls_captions[video_row['InputVideo']] = [text for text in slot_captions if text != ' ']
            


In [4]:
import itertools
# captions_dict is a list holding one caption list per video (despite its name);
#   captions_list flattens it into a single list of captions
captions_dict = [*urls_captions.values()]
captions_list = list(itertools.chain(*captions_dict))

In [5]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 35.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=1ce10fe65156a

In [6]:
from transformers import BertTokenizer, BertModel
import torch
captions_encoded = []
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval() # Feature extraction only: make sure dropout is disabled

# Encode the first five captions. The forward passes run without autograd so the stored
#   outputs do not keep a computation graph (and its memory) alive.
with torch.no_grad():
    for i in captions_list[:5]:
        inputs = tokenizer(i, return_tensors="pt")
        outputs = model(**inputs)
        captions_encoded.append(outputs)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [8]:
'''
last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)) 
Sequence of hidden-states at the output of the last layer of the model.
'''
print(captions_encoded[3].last_hidden_state.shape)

torch.Size([1, 12, 768])


# Video Features

In [51]:
# Note - we need to use PCA to reduce dims from 4096 to 512; I just used this dummy tensor to test if the model would work
# NOTE(review): `model` is the BertModel loaded above and this is an integer tensor, so it is presumably interpreted
#   as a sequence of 512 token IDs rather than as one 512-dimensional feature vector (the saved output shape
#   [1, 512, 768] is consistent with that) - confirm this is the intended check.
video_empty = torch.zeros([1,512], dtype=torch.long)
output_video = model(video_empty)
output_video.last_hidden_state.shape

torch.Size([1, 512, 768])

In [49]:
# Try a kernel-size-1 Conv1d as a learned 4096 -> 512 projection on random data.
m = torch.nn.Conv1d(4096, 512, 1)
# The test tensor must not be called `input`: that would shadow the builtin input(),
#   which later cells call to ask for confirmation.
dummy_features = torch.randn(1, 1, 4096).transpose(1,2)
# NOTE(review): `output` is also the name of the C3D features computed earlier; it is kept
#   here because the next cell displays output.shape.
output = m(dummy_features)

In [50]:
output.shape

torch.Size([1, 512, 1])

# Full pipeline
## Extract C3D Features for each clip in BDDX

In [None]:
# Open CSV file
import pandas as pd
data_url = "../revisedBDDX.csv"
def load_bddx_data(csv_name):
    """
    Load the BDD-X CSV as a DataFrame using a fixed set of column names.

    Columns: 'Index', 'InputVideo', then for every clip slot n in 1..15 the four
    columns '<n>S', '<n>E', '<n>A', '<n>J'.

    csv_name is handed unchanged to pandas.read_csv. Since the names are given
    explicitly, every line of the file is read as a data row.
    """
    per_clip_columns = [str(clip_no)+suffix for clip_no in range(1,16) for suffix in 'SEAJ']

    return pd.read_csv(csv_name, names=['Index', 'InputVideo']+per_clip_columns)

bddx = load_bddx_data(data_url)
bddx = bddx.drop(bddx.index[0])

In [None]:
from math import isnan
import pickle as pkl
import struct
import os

import numpy as np # Used for the per-clip buffer below; otherwise only imported by a later cell

vector_size = 4096
fname = "vectors2.txt"

# Setup vectors file
if os.path.isfile(fname):
    # Name the file that is actually about to be modified
    print("WARNING: %s already exists! Over(W)rite, (A)ppend, or (Q)uit?"%(fname))
    char = input("(W/A/Q) > ")
    if char.lower() == "w":
        with open(fname,"wb") as _out:
            pass # Empty the file to overwrite the vectors
    # Append is the default behavior for the pipeline, so answering "a" needs no action
    elif char.lower() != "a":
        raise KeyboardInterrupt # Interrupt so as not to run the following code

milestone = 5600 # When to stop running the pipeline to allow the code to be run in pieces. Set to -1 if you don't want to stop
prev_milestone = 5300 # The last milestone that the code was stopped at
print("starting vectorization")
# Extract vectors
for index,row in bddx.iterrows():
    if index <= prev_milestone: # Pick up where I last left off
        continue
    if index%10 == 0: # Monitor progression
        print(index)
    if milestone >= 0 and index > milestone: # Halt execution so I'm not running it all at once
        break
    url = row['InputVideo']
    if index == 4882: # This video is read from a local copy instead of its URL
        url = "./4882.mov"
    clip = mp.VideoFileClip(url,target_resolution=[112,112])
    try:
        for i in range(1,16):
            start = float(row['%iS'%(i)])
            finish = float(row['%iE'%(i)])
            if isnan(start) or isnan(finish): # Unused clip slot
                continue
            start = int(start)
            finish = int(finish)
            if finish == start: # Some clips have same start and finish times; avoid 0-second clips
                finish+=1
            elif start > finish:
                continue
            elif finish > 100: # Take care of some erroneous finish times
                while finish > 100:
                    finish = int(finish/10)
            nn_inputs = get_frames(clip,start,finish) # One input per second of the clip
            if not nn_inputs:
                # The correction above can leave finish <= start. Slicing a shared buffer with a
                #   non-positive length would write stale rows from an earlier clip, so skip instead.
                continue
            # Size the buffer to this clip so clips longer than 60 seconds cannot overflow it
            vectors = np.zeros((len(nn_inputs),vector_size))
            for j,nn_input in enumerate(nn_inputs):
                vectors[j] = MODEL.predict(nn_input)
            with open(fname,"ab") as _out:
                # float64 values in native byte order, row after row: the same bytes as
                #   packing every value with struct format 'd'
                _out.write(vectors.tobytes())
    finally:
        clip.close() # Release the video reader even if extraction fails

print("vectors completed")


(11, 4096)
[[0.01433312 0.         0.         ... 0.         0.         0.07065101]
 [0.01486513 0.         0.         ... 0.         0.         0.07457145]
 [0.01166258 0.         0.         ... 0.         0.         0.10145477]
 ...
 [0.00248984 0.         0.         ... 0.00125599 0.         0.02231222]
 [0.         0.         0.         ... 0.0087162  0.         0.00646217]
 [0.         0.         0.         ... 0.         0.         0.06207535]]


In [None]:
from math import isnan
import pickle as pkl # Needed for pkl.dump below; otherwise only imported by other cells

# keys_7x15[video][clip slot] and keys_26x1[flat clip index] both hold
#   (index of the clip's first vector in the vectors file, number of vectors);
#   clip slots without vectors stay None in keys_7x15.
keys_7x15 = []
keys_26x1 = []
vcount = 0 # Running total of vectors, i.e. where the next clip starts

for index,row in bddx.iterrows():
    if index%500 == 0:
        print(index)
    # Fill in the entry for this video directly instead of looking it up through the
    #   index label, which only works while the labels are exactly 1..n
    video_keys = [None]*15
    keys_7x15.append(video_keys)
    for i in range(1,16):
        start = float(row['%iS'%(i)])
        finish = float(row['%iE'%(i)])
        if isnan(start) or isnan(finish): # Unused clip slot
            continue
        start = int(start)
        finish = int(finish)
        if finish == start: # Some clips have same start and finish times; avoid 0-second clips
            finish+=1
        elif start > finish: # Some clips have start after finish; ignore these
            continue
        elif finish > 100: # Take care of some erroneous finish times
            while finish > 100:
                finish = int(finish/10)
        vsize = finish-start
        if vsize <= 0:
            # The correction above can leave finish <= start; such a clip has no vectors
            #   and must not move the running offset backwards
            continue
        video_keys[i-1] = (vcount,vsize)
        keys_26x1.append((vcount,vsize))
        vcount+=vsize

with open("vector_keys.pkl","wb") as _out:
    pkl.dump((keys_7x15,keys_26x1),_out)


# Code to Read Vectors from File

There are two ways to read vectors from file: get_vectors_by_video() and get_vectors_by_clip().

get_vectors_by_video() should be used if you want a specific clip: e.g., clip 0 of video 5 (caption: "The car slows to a stop because the light is red.")

get_vectors_by_clip() should be used if you want to iterate over all vectors, e.g.

    for i in range(len(keys1d)):
    
        v = get_vectors_by_clip(i)

In [None]:
import pickle as pkl
import numpy as np
import struct

vector_size = 4096 # Number of values in each stored vector
double_size = 8 # Bytes per stored value; the reader below unpacks values with struct format 'd'

# Load the lookup tables written by the key-building cell: keys2d is indexed [video][clip] and keys1d by flat
#   clip index. Entries are (index of first vector, number of vectors); keys2d holds None for clips without vectors.
with open("vector_keys.pkl","rb") as _in:
    keys2d,keys1d = pkl.load(_in)

def get_vector_by_info(vector_info, path="vectors.txt"):
    """
    Read the vectors described by vector_info from the binary vectors file.

    This function should only be called by get_vectors_by_video and get_vectors_by_clip

    Inputs:
        vector_info -- (index of the first vector, number of vectors)
        path        -- binary vectors file to read; defaults to "vectors.txt"
    Returns: numpy array of shape (number of vectors, vector_size)
    Raises: struct.error if the file holds fewer bytes than requested at that position
    """
    first_vector = vector_info[0]
    value_count = vector_info[1]*vector_size
    with open(path,"rb") as _in:
        # Every value is stored as a double_size-byte double, vector after vector
        _in.seek(first_vector*vector_size*double_size,0)
        vectors_bytes = _in.read(double_size*value_count)
    vectors_flat = struct.unpack('d'*value_count,vectors_bytes)
    vectors = np.reshape(vectors_flat,(vector_info[1],vector_size))
    return vectors
    
def get_vectors_by_video(video_id,clip_id):
    """
    Inputs: video ID and clip ID. E.g., to get the first clip from the second video, which
    has the caption "The car is stopped. The car is at an intersection with a red light.",
    use get_vectors_by_video(1,0)

    Returns: the clip's vectors as an array of shape (number of vectors, vector_size),
    or None if no vectors are recorded for that clip.
    """
    vector_info = keys2d[video_id][clip_id]
    if vector_info is None:
        return None
    return get_vector_by_info(vector_info)

def get_vectors_by_clip(clip_id):
    """
    Input: clip ID in a flat format. E.g., to get the first clip from the second video, which
    has the caption "The car is stopped. The car is at an intersection with a red light.",
    use get_vectors_by_clip(5). Uses the same id format as phrase_lookup from Ranking.ipynb
    ### WARNING: Due to changes in processing caused by samples with start times after end times,
    ###   these indices may no longer match up with phrase_lookup

    Returns: the clip's vectors as an array of shape (number of vectors, vector_size),
    or None if the entry for clip_id is None.
    """
    vector_info = keys1d[clip_id]
    if vector_info is None:
        return None
    return get_vector_by_info(vector_info)


In [None]:
v = get_vectors_by_video(6995,0)
print(v.shape)
print(v[0][0:10])

# Utils
Utility code used while generating vectors.txt. You shouldn't need any of this code; it is kept here only for reference.

In [None]:
# Util: check if the saved vector is the same as the generated one.
# Prints the first 20 values of both, then "Unequal" and the position of the first
#   value that differs by more than eps (nothing more is printed if all 4096 agree).
eps = 1e-5
print(output[0][0:20])
print(v[0][0:20])
first_mismatch = next((pos for pos in range(4096) if abs(output[0][pos]-v[0][pos]) > eps), None)
if first_mismatch is not None:
    print("Unequal")
    print(first_mismatch)

In [None]:
# Util: Check bddx for instances of start time after finish time

# Print (row index, start, finish) for every clip slot whose start time is after its finish time
for index,row in bddx.iterrows():
    for clip_no in range(1,16):
        start, finish = (float(row['%i%s'%(clip_no,suffix)]) for suffix in 'SE')
        # Slots with a missing time are skipped
        if not (isnan(start) or isnan(finish)) and start > finish:
            print(index,start,finish)


In [None]:
# Util: Remove (corrupted) vectors in the middle of the file

def excise(s,e):
    """
    Copy vectors.txt to v2.txt, leaving out the vectors with indices s up to (but not including) e.

    Indices count whole vectors (vector_size values of double_size bytes each, both taken
    from the notebook's globals). Only the first `fend` vectors of the input are processed.
    If e <= s nothing is left out. Prints the running index whenever it is a multiple of 10000.
    """
    fend = 96284 # Number of vectors to process from vectors.txt
    # Create/empty the output file; it is appended to below
    with open("v2.txt","wb") as _out:
        pass
    with open("vectors.txt","rb") as _in:
        a = 0 # Index of the next vector to be read
        batch = 10 # Vectors copied per step when far enough from s and fend
        while a < fend:
            if a%10000 == 0:
                print(a)
            if a+batch < s:
                # A whole batch fits before the excised range: copy it in one go
                vector_bytes = _in.read(double_size*vector_size*batch)
                with open("v2.txt","ab") as _out:
                    _out.write(vector_bytes)
                a+=batch
            elif a < s:
                # Close to s: copy one vector at a time so the batch cannot run past s
                vector_bytes = _in.read(double_size*vector_size)
                with open("v2.txt","ab") as _out:
                    _out.write(vector_bytes)
                a+=1
            elif a < e:
                # Inside the excised range: read the vector and discard it
                _in.read(double_size*vector_size)
                a+=1
            elif a+batch < fend and a%batch != 0:
                # Past the excised range: copy single vectors until the index is a multiple of batch again
                vector_bytes = _in.read(double_size*vector_size)
                with open("v2.txt","ab") as _out:
                    _out.write(vector_bytes)
                a+=1
            elif a+batch < fend:
                # Aligned and a whole batch fits before fend: copy it in one go
                vector_bytes = _in.read(double_size*vector_size*batch)
                with open("v2.txt","ab") as _out:
                    _out.write(vector_bytes)
                a+=batch
            elif a < fend:
                # Close to fend: copy the remaining vectors one at a time
                vector_bytes = _in.read(double_size*vector_size)
                with open("v2.txt","ab") as _out:
                    _out.write(vector_bytes)
                a+=1

# Range of vectors to remove: starts at vector index s and is 60-(t1-t2) vectors long.
# NOTE(review): 60 is also the row count of the `vectors` buffer in the extraction cell, so this is presumably
#   the number of stale rows written for a clip whose length t2-t1 was negative - confirm.
# NOTE(review): with t1 = 93 and t2 = 6 the length is negative, so e < s and excise() leaves nothing out.
t1 = 93
t2 = 6
s = 88628 # 35577, 41540, 47459
e = s+(60-(t1-t2))

# Ask for confirmation before v2.txt is rewritten
a = input("Are you sure? ")
if a == 'y':
    excise(s,e)


In [None]:
# Util: Delete (corrupted) vectors at end of file

# Copy the first `fend` vectors of vectors.txt into v2.txt, dropping everything after them.
# vector_size and double_size come from the vector-reading cell above.
fend = 90008 # Number of vectors to keep
batch = 100 # Vectors copied per step while a whole batch still fits before fend
# Open both files once instead of reopening the output for every write
with open("vectors.txt","rb") as _in, open("v2.txt","wb") as _out:
    a = 0 # Index of the next vector to be read
    while a < fend:
        if a%10000 == 0:
            print(a)
        # Near the end, fall back to single vectors so the copy stops exactly at fend
        step = batch if a+batch < fend else 1
        _out.write(_in.read(double_size*vector_size*step))
        a+=step


In [None]:
# Util: join two vector binary files

base_file = "vectors.txt" # File that receives the extra vectors
next_file = "vectors_5600+.txt" # File whose vectors are appended to base_file

double_size = 8
vector_size = 4096
chunk_size = double_size*vector_size # Bytes in one stored vector

print("start")
# Open both files once and append one whole vector at a time. Copying stops at the
#   first incomplete read, so a truncated vector at the end of next_file is not copied.
with open(next_file,"rb") as _in, open(base_file,"ab") as _out:
    a = _in.read(chunk_size)
    while len(a) == chunk_size:
        _out.write(a)
        a = _in.read(chunk_size)

print("done")