In [60]:
# This notebook processes the video files specified in the paper
# "Finding 'It': Weakly-Supervised Reference-Aware
# Visual Grounding in Instructional Videos" (FI),
# and extracts key frames from the annotated test dataset.

In [2]:
import sys
sys.path.append("/h/sagar/ece496-capstone/train/utils")

In [3]:
import glob
import json
import numpy as np
import os
import pickle
import torch
import webvtt

from detector import Detector
from parser import parse
from transformers import LxmertModel, LxmertTokenizer

In [4]:
# youtube_dl_custom.py
# Embedded youtube-dl script adapted from:
# https://github.com/ytdl-org/youtube-dl/blob/master/README.md
# Needed to download FI dataset with specific youtube-dl opts

from __future__ import unicode_literals
import youtube_dl
import os

class MyLogger(object):
    """Quiet logger for youtube-dl: debug and warning output is dropped, errors are printed."""

    def debug(self, msg):
        """Discard debug messages."""
        return None

    def warning(self, msg):
        """Discard warning messages."""
        return None

    def error(self, msg):
        """Print a fixed notice followed by the error message itself."""
        for text in ('[INFO] Cannot download video', msg):
            print(text)

def my_hook(d):
    """
    Progress hook registered with youtube-dl
    d: status dictionary; only its 'status' entry is read
    """
    if d['status'] != 'finished':
        return
    # Download finished: intentionally silent

def youtube_dl_wrapper(vid_id, video_dir):
    """
    Download a single YouTube video with youtube-dl
    vid_id: video id
    video_dir: directory path to video files
    """
    # Options adapted from:
    # https://finding-it.github.io/README.txt
    output_template = os.path.join(video_dir, '%(id)s.%(ext)s')
    options = dict(
        ignoreerrors=True,
        writesubtitles=False,
        format='18',
        noplaylist=True,
        outtmpl=output_template,
        logger=MyLogger(),
        progress_hooks=[my_hook],
    )
    with youtube_dl.YoutubeDL(options) as downloader:
        video_url = 'https://www.youtube.com/watch?v=' + vid_id
        downloader.download([video_url])


In [5]:
def read_json(path='output.json'):
    """
    Check for valid JSON format and read content
    path: path to JSON file
    Return parsed_json: decoded JSON content
    Raises AssertionError('Invalid JSON') if the content cannot be decoded
    """
    # The context manager closes the file even if reading fails
    with open(path) as file:
        line = file.read().replace('\n', ' ')
    try:
        parsed_json = json.loads(line)
    except ValueError as err:    # json.JSONDecodeError is a subclass of ValueError
        # Raised explicitly (instead of `assert False`) so the check survives `python -O`;
        # exception type and message are unchanged for existing callers
        raise AssertionError('Invalid JSON') from err
    return parsed_json

def get_vid_ext(vid_id, video_dir):
    """
    Look up which video file exists for a video id
    vid_id: video id
    video_dir: directory path to video files
    Return: '.mp4', '.mkv' or '.webm' (checked in that order), or None if no such file exists
    """
    base_path = os.path.join(video_dir, vid_id)
    for ext in ('.mp4', '.mkv', '.webm'):
        if os.path.exists(base_path + ext):
            return ext
    return None

In [6]:
def sample_frames(vid_id, video_dir, frame_dir, fps=5):
    """
    Sample video into frames at fixed fps
    vid_id: video id
    video_dir: directory path to video files
    frame_dir: directory path to video frames
    fps: fps for frame extraction
    """
    import shlex    # local import: only needed to quote arguments for the shell command

    # makedirs also creates missing parent directories and tolerates an existing frame_dir
    os.makedirs(frame_dir, exist_ok=True)
    vid_ext = get_vid_ext(vid_id, video_dir)
    if vid_ext is None:
        # No downloaded file for this id; previously ffmpeg was run on '<vid_id>None' and failed
        print('[INFO] Cannot sample frames, no video file found for {}'.format(vid_id))
        return
    video_path = os.path.join(video_dir, vid_id + vid_ext)
    frame_pattern = os.path.join(frame_dir, '%06d.jpg')
    # Quote every interpolated value so spaces or shell metacharacters cannot break or alter the command
    ff_command = 'ffmpeg -i {} -y -an -qscale 0 -vf fps={} {}'.format(
        shlex.quote(video_path), shlex.quote(str(fps)), shlex.quote(frame_pattern))
    os.system(ff_command)


def remove_video(vid_id, video_dir):
    """
    Delete the downloaded file of a video
    vid_id: video id
    video_dir: directory path to video files
    """
    video_stem = os.path.join(video_dir, vid_id)
    os.remove(video_stem + get_vid_ext(vid_id, video_dir))


# # Note: Different for FI and YCII datasets
# def select_frames(vid_id, frame_dir, vg_annotations):
#     """
#     Returns representative frames for actions and removes unused frames
#     vid_id: video id
#     frame_dir: directory path to video frames
#     vg_annotations: dictionary of visual grounding annotations
#     Returns required_frames: set containing names of representative frames
#     """
#     required_frames = set()

#     for entity in vg_annotations[vid_id]:
#         for bb in vg_annotations[vid_id][entity]['bboxes']:
#             required_frames.add(bb['img'].split('/')[1].split('.')[0] + '.jpg')

#     curr_frame_list = os.listdir(os.path.join(frame_dir, vid_id))
#     for frame in curr_frame_list:
#         if frame not in required_frames:
#             os.remove(os.path.join(frame_dir, vid_id, frame))
#     return required_frames


def select_frames(timestamps, num_frames_per_step, fps=None):
    """
    Return representative frames for actions
    timestamps: list of timestamp tuples (start, end), in seconds, for each action step
    num_frames_per_step: number of frames per action step
    fps: rate at which the frames on disk were sampled; defaults to num_frames_per_step,
         which is the rate this function previously assumed implicitly
    Return required_frames: list of lists of strings containing names of representative frames for each step
    Return required_frames_set: set containing names of representative frames
    """
    if fps is None:
        fps = num_frames_per_step

    required_frames_set = set()
    required_frames = []
    for timestamp in timestamps:
        action_start = timestamp[0]
        action_end = timestamp[1]
        # Outer frames are taken for consistency with the FI dataset
        # (frame--interval--frame--interval--frame), i.e. num_frames_per_step - 1 intervals.
        # With a single frame per step there is no interval, so only the start frame is used
        # (this used to raise ZeroDivisionError).
        if num_frames_per_step > 1:
            action_delta = (action_end - action_start) / (num_frames_per_step - 1)
        else:
            action_delta = 0

        step_frames = []
        for i in range(num_frames_per_step):
            frame_time = action_start + action_delta * i    # in seconds
            # +1 because the frame file names are 1-based (000001.jpg is the first frame)
            frame_id = int(frame_time * fps) + 1
            frame_name = '{}.jpg'.format(str(frame_id).zfill(6))
            step_frames.append(frame_name)
            required_frames_set.add(frame_name)
        required_frames.append(step_frames)
    return required_frames, required_frames_set


def remove_frames(frame_dir, required_frames):
    """
    Delete every file in frame_dir whose name is not listed as required
    frame_dir: directory path to video frames
    required_frames: set containing names of representative frames
    """
    if not os.path.isdir(frame_dir):
        return
    for name in os.listdir(frame_dir):
        if name in required_frames:
            continue
        os.remove(os.path.join(frame_dir, name))


def pickle_data(data, pickles_dir, fname):
    """
    Pickle data into bytestreams
    data: data to be pickled
    pickles_dir: directory path to pickled data (created, including parents, if missing)
    fname: name of pickled file (without the '.pickle' extension)
    """
    # makedirs handles nested paths and an already existing directory
    os.makedirs(pickles_dir, exist_ok=True)
    # The context manager closes the file even if pickling fails
    with open(os.path.join(pickles_dir, fname + '.pickle'), 'wb') as pickle_out:
        pickle.dump(data, pickle_out)


def depickle_data(pickles_dir, fname):
    """
    Depickle data from bytestreams
    pickles_dir: directory path to pickled data
    fname: name of pickled file (without the '.pickle' extension)
    Return data: depickled data, or [] if the pickle file does not exist
    """
    pickle_path = os.path.join(pickles_dir, fname + '.pickle')
    if not os.path.exists(pickle_path):
        return []
    # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline
    # The context manager closes the file, which was previously left open
    with open(pickle_path, 'rb') as pickle_in:
        return pickle.load(pickle_in)

In [7]:
def generate_fi_vidlists(dataset_root='/h/mkhan/ece496-capstone/datasets'):
    """
    Write one video-id list per FI dataset split, based on subsets.json
    dataset_root: directory path to dataset base
    """
    fi_root = os.path.join(dataset_root, 'fi_datasets')

    # Subsets: {all:58,simple:23,medium:12,hard:23}
    ycii_subsets = read_json(os.path.join(fi_root, 'subsets.json'))['ycii']

    # Every subset is looked up before any list file is opened for writing
    outputs = [
        ('fi_all.txt', ycii_subsets['eval_recipes_all']),
        ('fi_simple.txt', ycii_subsets['eval_recipes_simple']),
        ('fi_medium.txt', ycii_subsets['eval_recipes_medium']),
        ('fi_hard.txt', ycii_subsets['eval_recipes_hard']),
    ]

    rr_base = os.path.join(fi_root, 'YCII/RR')  # directory path to reference resolution annotations

    for list_name, recipes in outputs:
        with open(os.path.join(fi_root, list_name), 'w') as f:
            for sample in recipes:
                # transcript_path: <rr_base>/<sample>/<vid_id>.en.vtt
                first_entry = os.listdir(os.path.join(rr_base, sample))[0]
                f.write(first_entry.split('.')[0] + '\n')


def generate_ycii_vidlists(dataset_root='/h/mkhan/ece496-capstone/datasets'):
    """
    Write one video-id list per YCII dataset split
    dataset_root: directory path to dataset base
    """
    vidlist_root = os.path.join(dataset_root, 'vid_list')

    # (input list, output list) per split; Subsets: {train:1333,val:457,test:210}
    split_files = (
        ('train_list.txt', 'ycii_train.txt'),
        ('val_list.txt', 'ycii_val.txt'),
        ('test_list.txt', 'ycii_test.txt'),
    )

    for source_name, target_name in split_files:
        # Each input line has the form <recipe type>/<video id>
        vid_ids = []
        with open(os.path.join(vidlist_root, source_name), 'r') as f:
            for line in f.readlines():
                _recipe_type, vid_id = line.replace('\n', '').split('/')
                vid_ids.append(vid_id)

        with open(os.path.join(vidlist_root, target_name), 'w') as f:
            f.writelines(vid_id + '\n' for vid_id in vid_ids)


def remove_duplicates(fpath, orig_set):
    """
    Remove duplicates from fpath file that are within orig_set
    fpath: file path from where to remove lines (rewritten in place)
    orig_set: set of original lines that should not occur in fpath
    Prints the number of lines before and after the clean-up
    """
    with open(fpath, 'r') as f:
        all_items = [line.replace('\n', '') for line in f.readlines()]
    print("Length before: " + str(len(all_items)))

    kept_items = [item for item in all_items if item not in orig_set]
    with open(fpath, 'w') as f:
        for item in kept_items:
            f.write(item + '\n')
    # Every kept item is written as exactly one line, so the count is known without
    # re-opening the file (the previous re-read also left that file handle unclosed)
    print("Length after: " + str(len(kept_items)))


def clean_vidlists(dataset_root='/h/mkhan/ece496-capstone/datasets'):
    """
    Drop the videos shared between the FI set and the YCII dataset splits
    dataset_root: directory path to dataset base
    """
    fi_root = os.path.join(dataset_root, 'fi_datasets')
    vidlist_root = os.path.join(dataset_root, 'vid_list')

    fi_list_path = os.path.join(fi_root, 'fi_all.txt')
    with open(fi_list_path, 'r') as f:
        fi_all = {line.replace('\n', '') for line in f.readlines()}

    # Clean YCII train and val sets: 1333 --> 1290 and 457 --> 444
    for split_file in ('ycii_train.txt', 'ycii_val.txt'):
        remove_duplicates(os.path.join(vidlist_root, split_file), fi_all)

    # Note: since YCII test split does not provide annotations,
    # the overlapping samples should be removed from FI dataset instead of from the YCII split
    with open(os.path.join(vidlist_root, 'ycii_test.txt'), 'r') as f:
        ycii_test = {line.replace('\n', '') for line in f.readlines()}
    remove_duplicates(fi_list_path, ycii_test)  # 58 --> 56

In [8]:
import difflib
import re

from ling_obj_classes import *

# Generate entity objects from (entity text, action ID) pairs.
def make_entities(raw_entities, entity_type):
    """
    Generate entity objects from (entity text, action ID) pairs
    raw_entities: comma-separated string of raw entities
    entity_type: type label passed through to every Entity (parse_rr_vtt uses 'DOBJ' and 'PP')
    Return entities: list with one Entity per comma-separated item
    """
    raw_entities = [raw_entity.strip() for raw_entity in raw_entities.split(',')]

    entities = []

    for raw_entity in raw_entities:
        split = raw_entity.rsplit(' ', 1)
        # Raw string literal: '\(' and '\d' are invalid escapes in a normal string.
        # The compiled pattern is unchanged.
        action_id = re.search(r'\((.*\d*)\)', raw_entity)

        # Sometimes entity texts or action IDs are null.
        if action_id and len(split) == 2:
            # text before the last space + parenthesised action id
            entities.append(Entity(split[0], entity_type, action_id.group(1)))
        elif action_id:
            # only an action id, no entity text
            entities.append(Entity(None, entity_type, action_id.group(1)))
        else:
            # no action id: the whole item is the entity text
            entities.append(Entity(raw_entity, entity_type, None))

    return entities

#given the reference resolution vtt file name, parses into step text list + 
#action_step list (of type ActionStep, containing extracted entities) 
def parse_rr_vtt(rr_vtt_file_name):
    """
    Parse a reference resolution vtt file
    rr_vtt_file_name: path to the vtt file, e.g. <rr_base>/<sample>/<vid_id>.en.vtt
    Return annotations: list of raw annotation texts, one per caption
    Return action_steps: list of ActionStep objects, one per caption
    """
    annotations = []
    action_steps = []

    # Generate action steps.
    for caption in webvtt.read(rr_vtt_file_name):
        # Each of these fields is expected in every caption (a missing one raises AttributeError)
        annotation = re.search('annot: (.*)\n?', caption.text).group(1)
        action_id = re.search('ACTID: (.*)\n?', caption.text).group(1)
        predicate = re.search('PRED: (.*)\n?', caption.text).group(1)

        # Number of direct objects ∈ [0, inf).
        # Raw string literals: '\[' and '\]' are invalid escapes in a normal string.
        # Stays None (no match) when the caption has no DOBJ line.
        direct_objects = re.search(r'\[DOBJ, .*\] (.*)?', caption.text)
        if direct_objects:
            direct_objects = make_entities(','.join(direct_objects.groups()), 'DOBJ')

        # Number of prepositional phrases ∈ [0, inf).
        # Stays None (no match) when the caption has no PP line.
        prop_phrases = re.search(r'\[PP, .*\] (.*)?', caption.text)
        if prop_phrases:
            prop_phrases = make_entities(','.join(prop_phrases.groups()), 'PP')

        annotations.append(annotation)
        action_steps.append(ActionStep(action_id, predicate, direct_objects, prop_phrases))

    # Return both raw annotation texts and ground truth action steps.
    return annotations, action_steps

def get_entity_list_from_action_steps(action_steps):
    """
    Collect the entity texts of every action step
    action_steps: list of action steps, each providing dobj_list and pp_list
    Return: list (one item per step) of lists of ent_text values, direct objects first,
            skipping falsy entries of the entity lists
    """
    return [
        [ent.ent_text for ent in step.dobj_list + step.pp_list if ent]
        for step in action_steps
    ]

In [9]:
def prepare_testset(dataset_root='/h/mkhan/ece496-capstone/datasets', max_detections=5, start_index=34):
    """
    Download and prepare FI dataset files
    dataset_root: directory path to dataset base
    max_detections: number of detections per frame
    start_index: position in the FI video list at which processing starts; the default keeps
                 the offset that used to be hard-coded, pass 0 to process every video
    """

    num_frames_per_step = 5    # for FI dataset, this should be fixed

    fi_root = os.path.join(dataset_root, 'fi')
    if not os.path.isdir(fi_root):
        os.mkdir(fi_root)

    videos_root = os.path.join(dataset_root, 'fi_videos')
    if not os.path.isdir(videos_root):
        os.mkdir(videos_root)

    fi_dataset_root = os.path.join(dataset_root, 'fi_datasets')
    annot_root = os.path.join(fi_dataset_root, 'annotations')
    if not os.path.isdir(annot_root):
        os.mkdir(annot_root)

    missing_vid_list = []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    detector = Detector(device)

    vg_path = os.path.join(fi_dataset_root, 'YCII/VG/gnding_annot_all.json')  # file path to visual grounding annotations
    # NOTE: vg_annotations is not used below; the call is kept because it still checks
    # that the annotation file exists and is valid JSON
    vg_annotations = read_json(vg_path)

    # copy all vtt files to annot_root and name them by vid_id
    rr_base = os.path.join(fi_dataset_root, 'YCII/RR')  # directory path to reference resolution annotations
    annotation_paths = os.listdir(rr_base)
    for annotation_path in annotation_paths:
        if annotation_path == 'README.md':
            continue
        annot_path = os.listdir(os.path.join(rr_base, annotation_path))[0]
        os.system('cp {} {}'.format(os.path.join(rr_base, annotation_path, annot_path), os.path.join(annot_root, annot_path)))

    # generate files containing list of video id's for different dataset
    generate_fi_vidlists(dataset_root)
    generate_ycii_vidlists(dataset_root)
    clean_vidlists(dataset_root)
    print('[INFO] Generated FI video lists')

    # load FI video list
    fi_all = []
    fpath = os.path.join(fi_dataset_root, 'fi_all.txt')
    with open(fpath, 'r') as f:
        lines = f.readlines()
        for line in lines:
            vid_id = line.replace('\n','')
            fi_all.append(vid_id)
    # len(fi_all): 58 --> 56 after cleaning

    # main video-wise loop
    for vid_id in fi_all[start_index:]:
        print('[INFO] Processing video {}'.format(vid_id))

        # download the video
        youtube_dl_wrapper(vid_id, videos_root)

        # check if the video is available
        vid_prefix = os.path.join(videos_root, vid_id)
        if os.path.exists(vid_prefix+'.mp4') or os.path.exists(vid_prefix+'.mkv') or os.path.exists(vid_prefix+'.webm'):
            print('[INFO] Downloaded video {}'.format(vid_id))
        else:
            # Record the id of THIS video (the stale `line` variable from the list-reading
            # loop above used to be appended here, i.e. always the last entry of fi_all.txt)
            missing_vid_list.append(vid_id)
            print('[INFO] Cannot download video {}'.format(vid_id))
            continue

        # get annotations list (and action count)
        transcript_path = os.path.join(annot_root, (vid_id + '.en.vtt'))
        with open(transcript_path, 'r') as f:
            data = f.read().splitlines(True)
        if (data[0] == '\n'):    # remove first line of webvtt files if it's empty
            with open(transcript_path, 'w') as f:
                f.writelines(data[1:])

        actions_list, action_steps = parse_rr_vtt(transcript_path)    # list of action annotations for a single video
        actions_count = len(actions_list)

        print('[INFO] Extracted {} actions for video {}'.format(actions_count, vid_id))

        # setup directories: samples are bucketed by their number of actions
        parent_root = os.path.join(fi_root, str(actions_count))
        if not os.path.isdir(parent_root):
            os.mkdir(parent_root)

        sample_index = 0    # change this to 1 to ensure 1-indexing for samples
        samples_list = os.listdir(parent_root)    # list of samples of same actions_count
        if samples_list:
            sample_index = max([int(index) for index in samples_list]) + 1    # set sample counter to next available integer
        sample_index = str(sample_index).zfill(5)    # required to ensure sortability
        sample_root = os.path.join(parent_root, sample_index)    # all data for this video will be stored under here
        if not os.path.isdir(sample_root):
            os.mkdir(sample_root)

        frames_root = os.path.join(sample_root, 'frames')    # all sampled images for this video will be under here
        if not os.path.isdir(frames_root):
            os.mkdir(frames_root)
        pickles_root = os.path.join(sample_root, 'pickles')    # all raw data for this video will be under here (stored by variable names)
        if not os.path.isdir(pickles_root):
            os.mkdir(pickles_root)

        # sample frames at fixed fps (select_frames below assumes this same rate)
        sample_frames(vid_id, videos_root, frames_root, fps=num_frames_per_step)
        print('[INFO] Sampled frames for video {}'.format(vid_id))

        # remove sampled video file (optional)
        remove_video(vid_id, videos_root)
        print('[INFO] Removed video {}'.format(vid_id))

        # select representative frames for actions
        annotation = webvtt.read(transcript_path)
        timestamps = []
        for annot in annotation:
            # HH:MM:SS.mmm --> whole seconds (the fractional part is dropped)
            start_time = sum(factor * int(coef) for factor, coef in zip([3600, 60, 1], annot.start.split('.')[0].split(":")))
            end_time = sum(factor * int(coef) for factor, coef in zip([3600, 60, 1], annot.end.split('.')[0].split(":")))
            timestamps.append((start_time, end_time))
        required_frames, required_frames_set = select_frames(timestamps, num_frames_per_step=num_frames_per_step)
        print('[INFO] Selected frames for video {}'.format(vid_id))

        # remove unused frames
        remove_frames(frames_root, required_frames_set)
        print('[INFO] Removed unused frames for video {}'.format(vid_id))

        # get candidates for images (frames are flattened step by step, in step order)
        frame_paths = [os.path.join(frames_root, frame) for action_frames in required_frames for frame in action_frames]
        candidates = [detector.inference(frame, max_detections=max_detections) for frame in frame_paths]
        print('[INFO] Extracted candidates for video {}'.format(vid_id))

        # save pickled files for vid_id
        pickle_data(vid_id, pickles_root, 'vid_id')
        print('[INFO] Saved vid_id for video {}'.format(vid_id))

        # save pickled files for candidates
        pickle_data(candidates, pickles_root, 'candidates')
        print('[INFO] Saved candidates for video {}'.format(vid_id))

        # save pickled files for frame paths
        pickle_data(frame_paths, pickles_root, 'frame_paths')
        print('[INFO] Saved frame_paths for video {}'.format(vid_id))

        # save pickled files for annotations list
        pickle_data(actions_list, pickles_root, 'actions_list')
        print('[INFO] Saved actions_list for video {}'.format(vid_id))

        # save pickled files for action steps
        pickle_data(action_steps, pickles_root, 'action_steps')
        print('[INFO] Saved action_steps for video {}'.format(vid_id))

    # write the missing videos to file, one id per line (the file is now closed afterwards)
    with open(os.path.join(fi_dataset_root, 'missing_videos.txt'), 'w') as missing_vid:
        for missing_id in missing_vid_list:
            missing_vid.write(missing_id + '\n')

In [10]:
def load_sample(dataset_root='/h/mkhan/ece496-capstone/datasets', actions_count=10, sample_index=0):
    """
    Read the pickled data of one prepared FI sample
    dataset_root: directory path to dataset base
    actions_count: number of actions in sample (bucket id of sample)
    sample_index: index of sample within the bucket
    Return vid_id: video id
    Return candidates: list of candidate data (bboxes, features) for a single video
    Return actions_list: list of action annotations for a single video
    Return action_steps: list of action steps for a single video
    If the sample directory does not exist, a message is printed and ('', [], [], []) is returned
    """
    sample_dir = str(sample_index).zfill(5)
    pickles_root = os.path.join(dataset_root, 'fi', str(actions_count), sample_dir, 'pickles')
    if os.path.isdir(pickles_root):
        names = ('vid_id', 'candidates', 'actions_list', 'action_steps')
        return tuple(depickle_data(pickles_root, name) for name in names)
    print('[INFO] Cannot load data for {}\'th sample with {} action(s)'.format(sample_index, actions_count))
    return '', [], [], []

In [11]:
# USAGE: Run this just once to prepare and save FI data on disk
# NOTE: this downloads videos, samples frames with ffmpeg and runs the detector, and every
# run stores each video under the next free sample index, so re-running adds new sample directories
prepare_testset(dataset_root='/h/sagar/ece496-capstone/datasets', max_detections=5)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /h/sagar/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.
Length before: 1333
Length after: 1290
Length before: 457
Length after: 444
Length before: 58
Length after: 56
[INFO] Generated FI video lists
[INFO] Processing video a3ZvOvo49WE
[INFO] Downloaded video a3ZvOvo49WE
[INFO] Extracted 15 actions for video a3ZvOvo49WE
[INFO] Sampled frames for video a3ZvOvo49WE
[INFO] Removed video a3ZvOvo49WE
[INFO] Sele

[INFO] Downloaded video VSrY0ORD394
[INFO] Extracted 13 actions for video VSrY0ORD394
[INFO] Sampled frames for video VSrY0ORD394
[INFO] Removed video VSrY0ORD394
[INFO] Selected frames for video VSrY0ORD394
[INFO] Removed unused frames for video VSrY0ORD394
[INFO] Extracted candidates for video VSrY0ORD394
[INFO] Saved vid_id for video VSrY0ORD394
[INFO] Saved candidates for video VSrY0ORD394
[INFO] Saved frame_paths for video VSrY0ORD394
[INFO] Saved actions_list for video VSrY0ORD394
[INFO] Saved action_steps for video VSrY0ORD394
[INFO] Processing video t9ry6TkD598
[INFO] Downloaded video t9ry6TkD598
[INFO] Extracted 4 actions for video t9ry6TkD598
[INFO] Sampled frames for video t9ry6TkD598
[INFO] Removed video t9ry6TkD598
[INFO] Selected frames for video t9ry6TkD598
[INFO] Removed unused frames for video t9ry6TkD598
[INFO] Extracted candidates for video t9ry6TkD598
[INFO] Saved vid_id for video t9ry6TkD598
[INFO] Saved candidates for video t9ry6TkD598
[INFO] Saved frame_paths for

In [73]:
# Required Cell

# USAGE: Run this to load the sample_index'th sample with actions_count actions from saved files for FI dataset
# (load_sample prints a message and returns '' plus empty lists if that sample is not on disk)
vid_id, candidates, actions_list, action_steps = load_sample(dataset_root='/h/sagar/ece496-capstone/datasets', actions_count=25, sample_index=0)

In [25]:
def tokenize_sample(pickles_root, actions_list=[], action_steps=[]):
    """
    Tokenize actions_list and pickle the results under pickles_root
    pickles_root: directory path to pickled data
    actions_list: list of action annotations for a single video (not modified)
    action_steps: list of action steps for a single video (not modified)
    Return steps: a single tokenized string representing all steps annotations for a single video
    Return entity_count: list of entity counts per action step of a single video
    Return entities: list of list of entities within each action step of a single video
    Return indices: list of lists of indices indicating entity spans within each action step of a single video
    Return max_step_length: word count of the longest pre-tokenized action step for a single video
    """
    # Strip leading/trailing periods
    # Note, periods will be added back later
    actions = [action.strip('.') for action in actions_list]

    # Special tokens ('[unused1]' was reserved for NULL but is not needed here)
    ENTITY = '[unused2]'
    ACTION = '[unused3]'

    # maximum word count in a single action step; default=0 covers an empty actions_list,
    # for which the previous max(0, max([...])) raised ValueError
    max_step_length = max([len(action.split()) for action in actions], default=0)

    entities = get_entity_list_from_action_steps(action_steps)
    indices = []

    for action_idx, step_entities in enumerate(entities):
        step_indices = []
        # Entities are visited back to front; the collected spans are reversed again below
        for entity in reversed(step_entities):
            if entity:    # ignore if entity is None
                char_idx = actions[action_idx].find(entity)
                if char_idx < 0:
                    # entity text does not occur verbatim in the action: report and drop it
                    print("NO MATCH:  " + entity)
                    entities[action_idx].remove(entity)
                    continue
                else:
                    # word position of the entity = number of words in front of it
                    word_idx = len(actions[action_idx][:char_idx].split())
                    words = actions[action_idx].split()
                    words.insert(word_idx, ENTITY)    # mark the entity with the ENTITY token
                    actions[action_idx] = ' '.join(words)
                    # every step is offset by max_step_length * action_idx
                    step_indices.append(list(range(max_step_length*action_idx + word_idx, max_step_length*action_idx + word_idx + len(entity.split()))))
        for ind in step_indices[::-1]:
            indices.append(ind)

    # drop the None entities that were skipped above
    for idx in range(len(entities)):
        while None in entities[idx]:
            entities[idx].remove(None)

    entity_count = [len(entity) for entity in entities]

    actions = [action + '.' if not action.endswith('.') else action for action in actions]
    steps = ''
    for action in actions:
        steps = steps + action + ' ' + ACTION + ' '
    steps = steps + ACTION    # TODO: check if this is correct

    pickle_data(steps, pickles_root, 'steps')
    pickle_data(entity_count, pickles_root, 'entity_count')
    pickle_data(entities, pickles_root, 'entities')
    pickle_data(indices, pickles_root, 'indices')
    pickle_data(max_step_length, pickles_root, 'max_step_length')

    return steps, entity_count, entities, indices, max_step_length

In [26]:
import os

# Tokenize every prepared FI sample; layout: <fi root>/<actions_count>/<sample_index>/pickles
fi_root = "/h/sagar/ece496-capstone/datasets/fi/"
num_action_directories = os.listdir(fi_root)

for num_action in num_action_directories:
    videos = os.listdir("{}{}/".format(fi_root, num_action))
    print(videos)
    for video in videos:
        pickles_root = "{}{}/{}/pickles".format(fi_root, num_action, video)
        vid_id, candidates, actions_list, action_steps = load_sample(
            dataset_root='/h/sagar/ece496-capstone/datasets',
            actions_count=num_action,
            sample_index=int(video),
        )
        # results are pickled next to the loaded data, under pickles_root
        steps, entity_count, entities, indices, max_step_length = tokenize_sample(
            pickles_root, actions_list, action_steps)
        

['00001', '00000']
NO MATCH:  the bottom of the other side
['00003', '00000', '00002', '00001']
NO MATCH:  a food processor 
['00000']
['00000']
['00000', '00001']
['00000']
['00000']
['00000']
['00000']
['00000']
NO MATCH:  1/4 cup of plain yoghurt
['00000']
['00001', '00002', '00000', '00003']
['00000', '00001']
NO MATCH:  1/4 inch slices


In [19]:
# Required Cell

# USAGE: Run this to tokenize actions_list of a single video and save data on disk
# pickles_root points at the same sample that load_sample reads below (4 actions, sample 0)
pickles_root = '/h/sagar/ece496-capstone/datasets/fi/4/00000/pickles'
vid_id, candidates, actions_list, action_steps = load_sample(dataset_root='/h/sagar/ece496-capstone/datasets', actions_count=4, sample_index=0)


In [20]:
# Tokenize the loaded sample; the results are also pickled under pickles_root
steps, entity_count, entities, indices, max_step_length = tokenize_sample(pickles_root, actions_list, action_steps)

In [21]:
# Display the tokenized steps string of the sample
steps

'cut [unused2] the salmon into [unused2] slices. [unused3] form [unused2] the rice into [unused2] a ball. [unused3] place [unused2] the rice on [unused2] the fish slice. [unused3] Flip [unused2] the piece of sushi over. [unused3] [unused3]'