In [1]:
import glob
import json
import numpy as np
import os
import pickle
import torch

from detector import Detector
from parser import parse
from transformers import LxmertModel, LxmertTokenizer

PyTorch version 1.6.0 available.


In [2]:
def read_json(path='output.json'):
    """
    Check for valid JSON format and read content
    path: path to JSON file
    """
    file = open(path)
    line = file.read().replace('\n', ' ')
    file.close()
    try:
        parsed_json = json.loads(line)
    except:
        assert False, 'Invalid JSON'
    return parsed_json

def get_vid_ext(vid_id, video_dir):
    """
    Returns video file extension
    vid_id: video id
    video_dir: directory path to video files
    """
    vid_prefix = os.path.join(video_dir, vid_id)
    if os.path.exists(vid_prefix+'.mp4'):
        return '.mp4'
    elif os.path.exists(vid_prefix+'.mkv'):
        return '.mkv'
    elif os.path.exists(vid_prefix+'.webm'):
        return '.webm'
    

In [3]:
def download_video(vid_id, video_dir):
    """
    Download video
    vid_id: video id
    video_dir: directory path to video files
    """
    # download the video
    vid_url = 'www.youtube.com/watch?v='+vid_id
    vid_prefix = os.path.join(video_dir, vid_id) 
    os.system(' '.join(("youtube-dl -o", vid_prefix, vid_url)))


def sample_frames(vid_id, video_dir, frame_dir, fps=5):
    """
    Sample video into frames at fixed fps
    vid_id: video id
    video_dir: directory path to video files
    frame_dir: directory path to video frames
    fps: fps for frame extraction
    """
    if not os.path.isdir(os.path.join(frame_dir, vid_id)):
        os.mkdir(os.path.join(frame_dir, vid_id))
    vid_ext = get_vid_ext(vid_id, video_dir)
    ff_command = 'ffmpeg -i {}/{}{} -y -an -qscale 0 -vf fps={} {}/{}/%06d.jpg'.format(video_dir, vid_id, vid_ext, fps, frame_dir, vid_id)
    os.system(ff_command)


def remove_video(vid_id, video_dir):
    """
    Delete video
    vid_id: video id
    video_dir: directory path to video files
    """
    vid_prefix = os.path.join(video_dir, vid_id)
    vid_ext = get_vid_ext(vid_id, video_dir)
    os.remove(vid_prefix+vid_ext)


def select_frames(actions, vid_id, num_frames_per_step):
    """
    Returns representative frames for actions
    actions: list of action annotations from YCII annotations
    vid_id: video id
    num_frames_per_step: number of frames per action step
    Returns required_frames: set contataining names of representative frames
    """
    required_frames = set()
    for action in actions:
        action_start = action['segment'][0]
        action_end = action['segment'][1]
        action_delta = (action_end - action_start) / (num_frames_per_step + 1)    # need num_frames_per_step+1 intervals for num_frames_per_step inner frames
        for i in range(num_frames_per_step):
            frame_time = action_start + action_delta * (i+1)    # in seconds
            frame_id = int( frame_time*(num_frames_per_step + 1) )
            frame_name = '{}.jpg'.format(str(frame_id).zfill(6))
            required_frames.add(frame_name)
    return required_frames


def remove_frames(vid_id, frame_dir, required_frames):
    """
    Remove unused frames
    vid_id: video id
    frame_dir: directory path to video frames
    required_frames: set contataining names of representative frames
    """
    if os.path.isdir(os.path.join(frame_dir, vid_id)):
        curr_frames = os.listdir(os.path.join(frame_dir, vid_id))
        for frame in curr_frames:
            if frame not in required_frames:
                os.remove(os.path.join(frame_dir, vid_id, frame))


def get_actions(actions):
    """
    Returns list of actions text for video
    actions: list of action annotations from YCII annotations
    Returns actions_text: list of actions text for video
    """
    actions_text = []
    for action in actions:
        actions_text.append(action['sentence'])
    return actions_text


def pickle_data(data, pickles_dir, vid_id, fname):
    """
    Pickle data into bytestreams
    data: data to be pickled
    pickles_dir: directory path to pickled data
    vid_id: video id
    fname: name of pickled file
    """
    if not os.path.isdir(os.path.join(pickles_dir, vid_id)):
        os.mkdir(os.path.join(pickles_dir, vid_id))
    pickle_out = open(os.path.join(pickles_dir, vid_id, fname+'.pickle'), 'wb')
    pickle.dump(data, pickle_out)
    pickle_out.close()


def depickle_data(pickles_dir, vid_id, fname):
    """
    Depickle data from bytestreams
    pickles_dir: directory path to pickled data
    vid_id: video id
    fname: name of pickled file
    """
    pickle_path = os.path.join(pickles_dir, vid_id, fname+'.pickle')
    if os.path.exists(pickle_path):
        pickle_in = open(pickle_path, 'rb')
        candidates = pickle.load(pickle_in)
        return candidates
    return []



In [4]:
def prepare_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', num_frames_per_step=4):
    """
    Download and prepare dataset files
    dataset_root: directory path to dataset base
    num_frames_per_step: number of frames per action step
    """

    annotations = read_json(os.path.join(dataset_root, 'annotations', 'ycii_annotations_trainval.json'))['database']
    
    videos_root = os.path.join(dataset_root, 'ycii_videos')
    if not os.path.isdir(videos_root):
        os.mkdir(videos_root)
    frames_root = os.path.join(dataset_root, 'ycii_frames')
    if not os.path.isdir(frames_root):
        os.mkdir(frames_root)
    pickles_root = os.path.join(dataset_root, 'ycii_pickles')
    if not os.path.isdir(pickles_root):
        os.mkdir(pickles_root)

    missing_vid_list = []

    detector = Detector()
    
    with open(os.path.join(dataset_root, 'vid_list', 'vid_list_ycii_val_short.txt')) as f:
        lines = f.readlines()
        for line in lines:
            rcp_type,vid_id = line.replace('\n','').split('/')
            print('[INFO] Processing video {}'.format(vid_id))
            
            # download the video
#             download_video(vid_id, videos_root)
            vid_url = 'www.youtube.com/watch?v='+vid_id
            vid_prefix = os.path.join(videos_root, vid_id) 
            os.system(' '.join(("youtube-dl -o", vid_prefix, vid_url)))

            # check if the video is available
            if os.path.exists(vid_prefix+'.mp4') or os.path.exists(vid_prefix+'.mkv') or os.path.exists(vid_prefix+'.webm'):
                print('[INFO] Downloaded video {}'.format(vid_id))
            else:
                missing_vid_list.append(line)
                print('[INFO] Cannot download video {}'.format(vid_id))
                continue

            # sample frames at fixed fps
            sample_frames(vid_id, videos_root, frames_root, fps=5)
            print('[INFO] Sampled frames for video {}'.format(vid_id))

            # remove sampled video file (optional)
            remove_video(vid_id, videos_root)
            print('[INFO] Removed video {}'.format(vid_id))
            
            # select representative frames for actions
            actions = annotations[vid_id]['annotations']
            selected_frames = select_frames(actions, vid_id, num_frames_per_step)
            print('[INFO] Selected frames for video {}'.format(vid_id))

            # remove unsued frames
            remove_frames(vid_id, frames_root, selected_frames)
            print('[INFO] Removed unused frames for video {}'.format(vid_id))

            # get candidates for images
            frames = sorted(glob.glob(os.path.join(frames_root, vid_id, '*.*')))
            candidates = [detector.inference(frame, max_detections=5) for frame in frames]
            print('[INFO] Extracted candidates for video {}'.format(vid_id))

            # save pickeled files for candidates
            pickle_data(candidates, pickles_root, vid_id, 'candidates')
            print('[INFO] Saved candidates for video {}'.format(vid_id))
            
            # get annotations list
            actions_list = get_actions(actions)
            print('[INFO] Extracted actions for video {}'.format(vid_id))
            
            # save pickled files for annotations list
            pickle_data(actions_list, pickles_root, vid_id, 'actions')
            print('[INFO] Saved candidates for video {}'.format(vid_id))


    # write the missing videos to file
    missing_vid = open(os.path.join(dataset_root, 'vid_list', 'missing_videos.txt'), 'w')
    for line in missing_vid_list:
        missing_vid.write(line)

    # sanitize and remove the intermediate files
    # os.system("find {} -name '*.part*' -delete".format(dataset_root))
    os.system("find {} -name '*.f*' -delete".format(dataset_root))

In [5]:
def load_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets'):
    """
    Load dataset values from saved files
    dataset_root: directory path to dataset base
    """
    pickles_root = os.path.join(dataset_root, 'ycii_pickles')
    
    all_candidates = []
    all_actions = []
    with open(os.path.join(dataset_root, 'vid_list', 'vid_list_ycii_val_short.txt')) as f:
        lines = f.readlines()
        for line in lines:
            rcp_type,vid_id = line.replace('\n','').split('/')
            print('[INFO] Loading data for video {}'.format(vid_id))
            
            # load candidates data
            candidates = depickle_data(pickles_root, vid_id, 'candidates')
            if candidates:
                all_candidates.extend(candidates)
                print('[INFO] Loaded candidates for video {}'.format(vid_id))
            else:
                print('[INFO] Cannot load candidates for video {}'.format(vid_id))

            # load actions data
            actions = depickle_data(pickles_root, vid_id, 'actions')
            if actions:
                all_actions.append(actions)
                print('[INFO] Loaded actions for video {}'.format(vid_id))
            else:
                print('[INFO] Cannot load actions for video {}'.format(vid_id))

    return all_candidates, all_actions

In [6]:
# USAGE: Run this just once to prepare and save data on disk
prepare_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', num_frames_per_step=1)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /h/mkhan/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.
[INFO] Processing video sdB8qBlLS2E
[INFO] Cannot download video sdB8qBlLS2E
[INFO] Processing video fn9anlEL4FI
[INFO] Downloaded video fn9anlEL4FI
[INFO] Sampled frames for video fn9anlEL4FI
[INFO] Removed video fn9anlEL4FI
[INFO] Selected frames for video fn9anlEL4FI
[INFO] Removed unused frames for video fn9anlEL4FI
[INFO] Extracted candidates for

In [7]:
# Required Cell

# USAGE: Run this to load all candidate and actions data from disk
all_candidates, all_actions = load_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets')

[INFO] Loading data for video sdB8qBlLS2E
[INFO] Cannot load candidates for video sdB8qBlLS2E
[INFO] Cannot load actions for video sdB8qBlLS2E
[INFO] Loading data for video fn9anlEL4FI
[INFO] Loaded candidates for video fn9anlEL4FI
[INFO] Loaded actions for video fn9anlEL4FI
[INFO] Loading data for video RnSl1LVrItI
[INFO] Loaded candidates for video RnSl1LVrItI
[INFO] Loaded actions for video RnSl1LVrItI
[INFO] Loading data for video vVZsj1t9R70
[INFO] Loaded candidates for video vVZsj1t9R70
[INFO] Loaded actions for video vVZsj1t9R70


In [8]:
# Required Cell
# Handles building steps and entity_count


# for actions in all_actions:
#     for i, action in enumerate(actions):
#         print(str(i) + ': ' + action)


# Strip all whitespaces and periods
# Note, periods will be added back later; need to temporarily remove periods before passing into parser
all_actions = [[action.strip('.') for action in actions] for actions in all_actions]
    
NULL = '[unused1]'
PAD = '[unused2]'
ENTITY = '[unused3]'
ACTION = '[SEP]'

# TODO: iterate through all annotations to find max_step_length
MAX_STEP_LENGTH = 30

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased", pad_token=PAD)
tokenizer.add_special_tokens({"additional_special_tokens": [NULL, PAD, ENTITY]})
tokenizer.encode([NULL, PAD, ENTITY], add_special_tokens=True)

# TODO: iterate through all annotations to find max_step_length
max_step_length = 20    

entity_count = []
steps = []

for idx in range(len(all_actions)):
    
    entities, indices = parse(all_actions[idx], max_step_length=max_step_length)
    entities.append([NULL])
    entity_count.append([len(en) for en in entities])
    
    # insert in reverse so preceeding word indices can still be used for modified actions
    for ind in reversed(indices):
        action_idx = ind[0]//max_step_length
        entity_idx = ind[0]%max_step_length
        words = all_actions[idx][action_idx].split()
        words.insert(entity_idx, ENTITY)
        all_actions[idx][action_idx] = ' '.join(words)

        all_actions[idx] = [action + '.' if not action.endswith('.') else action for action in all_actions[idx]]

    tokens_steps = tokenizer(
                    all_actions[idx],
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    add_special_tokens=True,
                    padding="max_length",
                    max_length=MAX_STEP_LENGTH + 2,
                    return_tensors="pt"
                )
    tokens_steps = tokens_steps['input_ids'].flatten()
    tokens_steps = tokens_steps[tokens_steps != 101]
    tokens_steps = tokenizer.decode(tokens_steps) + ' ' + NULL
    steps.append(tokens_steps)


# for actions in all_actions:
#     for i, action in enumerate(actions):
#         print(str(i) + ': ' + action)

In [9]:
# Required Cell
# Handles building boxes and features


# Reshape all of the candidate bounding box and feature tensors.
boxes = torch.tensor([candidate[0].numpy() for candidate in all_candidates]).squeeze(1)
features = torch.tensor([candidate[1].numpy() for candidate in all_candidates]).squeeze(1)

boxes = boxes.flatten(start_dim=0, end_dim=1)
features = features.flatten(start_dim=0, end_dim=1)

boxes = torch.stack((boxes, boxes))
features = torch.stack((features, features))

In [10]:
print(steps)

['add [unused3] garram masala seeds and a bay leaf to [unused3] the oil. [SEP] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] add [unused3] the lamb to [unused3] the pot. [SEP] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] add [unused3] garlic ginger paste and chopped onions to the pot. [SEP] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] add [unused3] chili tumeric coriander cumin and salt. [SEP] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] [unused2] add [unused3] water to [unused3] the pot. [SEP] [

In [11]:
print(features)

tensor([[[5.2232e-01, 2.9430e-01, 3.0040e-02,  ..., 0.0000e+00,
          4.8199e+00, 0.0000e+00],
         [6.7331e-01, 0.0000e+00, 5.0316e-03,  ..., 2.4784e-01,
          2.2119e+00, 1.4384e-02],
         [7.0783e-02, 1.1483e-01, 0.0000e+00,  ..., 1.7114e-02,
          2.2685e-01, 1.2317e-01],
         ...,
         [4.2445e-02, 4.1700e+00, 7.6074e-03,  ..., 7.0933e-02,
          5.6186e+00, 7.6773e-02],
         [3.9388e-02, 8.3052e-01, 0.0000e+00,  ..., 0.0000e+00,
          4.6912e-02, 3.2373e+00],
         [7.0633e-03, 3.3752e+00, 1.2944e-01,  ..., 9.5188e-03,
          1.8473e+00, 2.0603e+00]],

        [[5.2232e-01, 2.9430e-01, 3.0040e-02,  ..., 0.0000e+00,
          4.8199e+00, 0.0000e+00],
         [6.7331e-01, 0.0000e+00, 5.0316e-03,  ..., 2.4784e-01,
          2.2119e+00, 1.4384e-02],
         [7.0783e-02, 1.1483e-01, 0.0000e+00,  ..., 1.7114e-02,
          2.2685e-01, 1.2317e-01],
         ...,
         [4.2445e-02, 4.1700e+00, 7.6074e-03,  ..., 7.0933e-02,
          5.618

In [12]:
print(boxes)

tensor([[[4.9113e-04, 3.0941e-01, 2.9766e-02, 3.8976e-01],
         [6.5312e-01, 1.2063e-03, 9.9088e-01, 4.3768e-01],
         [6.6566e-01, 1.9197e-01, 9.9766e-01, 9.5529e-01],
         ...,
         [6.3899e-01, 0.0000e+00, 9.8486e-01, 1.7960e-01],
         [2.3436e-01, 4.4089e-02, 5.2754e-01, 2.8973e-01],
         [6.5031e-01, 2.2177e-03, 9.9128e-01, 3.5090e-01]],

        [[4.9113e-04, 3.0941e-01, 2.9766e-02, 3.8976e-01],
         [6.5312e-01, 1.2063e-03, 9.9088e-01, 4.3768e-01],
         [6.6566e-01, 1.9197e-01, 9.9766e-01, 9.5529e-01],
         ...,
         [6.3899e-01, 0.0000e+00, 9.8486e-01, 1.7960e-01],
         [2.3436e-01, 4.4089e-02, 5.2754e-01, 2.8973e-01],
         [6.5031e-01, 2.2177e-03, 9.9128e-01, 3.5090e-01]]])


In [13]:
print(entity_count)

[[2, 2, 1, 1, 2, 2, 2, 2, 1], [3, 2, 2, 2, 1, 2, 1, 2, 1], [2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1]]


In [14]:
# NOTE: Do not execute the cells below, they are rough code for testing stuff out

In [None]:
# print(all_actions)

for actions in all_actions:
    for action in actions:
        print(action)
# actions = all_actions[0]
# print(actions)
# actions = [action + '.' if not action.endswith('.') else action for action in actions]
# print(actions)

# print(all_actions)
for actions in all_actions:
    for action in actions:
        print(action)


# for actions in all_actions:
#     for action in actions:
# #         print(action)
#         if not action.endswith('.'):
#             action += '.'

# print(all_actions)

In [None]:
# Ensure all actions terminate with periods (Note, make sure NULL action is not yet added to actions lists)
# all_actions = [[action + '.' if not action.endswith('.') else action for action in actions] for actions in all_actions]

In [None]:

idx = 0
# print(all_actions[idx])
print(all_actions[idx][0])
words = all_actions[idx][0].split()
print(words)

indices = [[1, 2, 3], [9, 10]]

for ind in reversed(indices):
    print(ind[0])
    words.insert(ind[0], '[unused3]')

all_actions[idx][0] = ' '.join(words)
print(all_actions[idx][0])

# sentence = 'add  garram masala seeds and a bay leaf to the oil.'
# print(sentence)
# words = sentence.split()
# print(words)



In [None]:
# print(all_actions)

steps = all_actions[0]
steps1 = [step.strip() for step in steps]

# print(steps)
# print(steps1)
for s in steps1:
    print(s)


In [None]:
entities, indices = parse(steps, max_step_length=20)
entities.append([NULL])

In [None]:
print(len(steps))
print(len(entities))
print(len(indices))

print(steps)
print(entities)
print(indices)

In [None]:
# entities.append(NULL)
ent_len = [len(e) for e in entities]
# ent_len.append(1)    # For appened NULL (alternatively, do entities.append(null))
print(ent_len)

In [None]:
steps1 = []
steps1.append('Grill [unused3] the tomatoes in [unused3] a pan.')
steps1.append('Add [unused3] oil into [unused3] the pan.')
steps1.append('Add [unused3] oil into [unused3] the pan.')
steps1.append('Cook [unused3] the bacon.')
steps1.append('Spread [unused3] some mayonnaise onto [unused3] the bread.')
steps1.append('Place [unused3] a piece of [unused3] lettuce onto [unused3] it.')
steps1.append('Place [unused3] the tomatoes over [unused3] it.')
steps1.append('Sprinkle [unused3] some salt and pepper onto [unused3] it.')
steps1.append('Place [unused3] the bacon at [unused3] the top.')
steps1.append('Place the [unused3] piece of bread at the top.')


In [None]:
NULL = '[unused1]'
PAD = '[unused2]'
ENTITY = '[unused3]'
ACTION = '[SEP]'

MAX_STEP_LENGTH = 30

In [None]:
from transformers import LxmertModel, LxmertTokenizer

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased", pad_token=PAD)
tokenizer.add_special_tokens({"additional_special_tokens": [NULL, PAD, ENTITY]})
tokenizer.encode([NULL, PAD, ENTITY], add_special_tokens=True)

tokens_steps1 = tokenizer(
                    steps1,
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    add_special_tokens=True,
                    padding="max_length",
                    max_length=MAX_STEP_LENGTH + 2,
                    return_tensors="pt"
                )


In [None]:
print(tokens_steps1['input_ids'])

In [None]:
steps1_flat = tokens_steps1['input_ids'].flatten()
# print(steps1_flat)
steps1_flat = steps1_flat[steps1_flat != 101]
# print(steps1_flat)
steps1_flat = tokenizer.decode(steps1_flat) + ' ' + NULL
print(steps1_flat)

In [None]:
asdf = 'hello'
qwer = 'world'
print(type([asdf,qwer]))


In [None]:
print(steps1_flat)

In [None]:
# print(tokens_steps1['input_ids'])

steps1_flat = tokens_steps1['input_ids'].flatten()
print(steps1_flat)
steps1_flat = steps1_flat[steps1_flat != 101]
print(steps1_flat)


In [None]:
annotations = read_json(os.path.join('/h/mkhan/ece496-capstone/datasets', 'annotations', 'ycii_annotations_trainval.json'))['database']

In [None]:
actions = annotations['fn9anlEL4FI']['annotations']

In [None]:
print(actions)

In [None]:
i = 0
for item in annotations:
    if annotations[item]['subset']=='validation':
        print(item)
#         print(annotations[item])
        print(annotations[item]['duration'])
#         print(annotations[item]['annotations'])
        segments = annotations[item]['annotations']
        for segment in segments:
            print(segment)
#             start = segment['segment'][0]
            end = segment['segment'][1]
#             print(str(start) + " " + str(end))
        
        print(end)

        i += 1
        if i==3:
            break