In [1]:
import glob
import json
import os
import pickle
import shlex

import numpy as np
import torch
from transformers import LxmertModel, LxmertTokenizer

from detector import Detector
from parser import parse

In [2]:
def read_json(path='output.json'):
    """
    Check for valid JSON format and read content
    path: path to JSON file
    Return parsed_json: decoded JSON content
    Raises AssertionError('Invalid JSON') when the content cannot be decoded
    """
    # the context manager closes the handle even if reading fails
    with open(path) as file:
        # newlines are flattened to spaces before decoding (unchanged behaviour)
        line = file.read().replace('\n', ' ')
    try:
        return json.loads(line)
    except (ValueError, RecursionError) as err:    # json.JSONDecodeError subclasses ValueError
        # raised explicitly (instead of `assert False`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged
        raise AssertionError('Invalid JSON') from err

def get_vid_ext(vid_id, video_dir):
    """
    Find which container format a video was saved in
    vid_id: video id
    video_dir: directory path to video files
    Return: '.mp4', '.mkv' or '.webm' (checked in that order), or None if no such file exists
    """
    base_path = os.path.join(video_dir, vid_id)
    for candidate_ext in ('.mp4', '.mkv', '.webm'):
        if os.path.exists(base_path + candidate_ext):
            return candidate_ext
    return None

In [3]:
def download_video(vid_id, video_dir):
    """
    Download video with the youtube-dl command line tool
    vid_id: video id
    video_dir: directory path to video files
    """
    vid_url = 'www.youtube.com/watch?v='+vid_id
    vid_prefix = os.path.join(video_dir, vid_id)
    # Quote both arguments: the URL contains '?', which the shell treats as a
    # glob character, and ids/paths with spaces or ';' would otherwise split
    # or inject into the command. Plain values are left unchanged by quote().
    os.system(' '.join(("youtube-dl -o", shlex.quote(vid_prefix), shlex.quote(vid_url))))


def sample_frames(vid_id, video_dir, frame_dir, fps=5):
    """
    Sample video into frames at fixed fps using ffmpeg
    vid_id: video id
    video_dir: directory path to video files
    frame_dir: directory path to video frames (created if missing)
    fps: fps for frame extraction
    Frames are written as frame_dir/%06d.jpg. If no video file is found for
    vid_id, a message is printed and ffmpeg is not invoked.
    """
    # makedirs also creates missing parent directories and tolerates an existing one
    os.makedirs(frame_dir, exist_ok=True)
    vid_ext = get_vid_ext(vid_id, video_dir)
    if vid_ext is None:
        # previously the literal string 'None' ended up in the ffmpeg input path
        print('[INFO] Cannot find video {} in {}'.format(vid_id, video_dir))
        return
    vid_path = '{}/{}{}'.format(video_dir, vid_id, vid_ext)
    frame_pattern = '{}/%06d.jpg'.format(frame_dir)
    # quote the paths so spaces or shell metacharacters cannot break the command
    ff_command = 'ffmpeg -i {} -y -an -qscale 0 -vf fps={} {}'.format(shlex.quote(vid_path), fps, shlex.quote(frame_pattern))
    os.system(ff_command)


def remove_video(vid_id, video_dir):
    """
    Delete video
    vid_id: video id
    video_dir: directory path to video files
    Does nothing when no video file with a known extension exists for vid_id.
    """
    vid_ext = get_vid_ext(vid_id, video_dir)
    if vid_ext is None:
        # nothing to delete; avoids the TypeError from concatenating str and None
        return
    os.remove(os.path.join(video_dir, vid_id) + vid_ext)


# def select_frames(actions, num_frames_per_step):
#     """
#     Return representative frames for actions
#     actions: list of action annotations from YCII annotations
#     num_frames_per_step: number of frames per action step
#     Return required_frames: set containing names of representative frames
#     """
#     required_frames = set()
#     for action in actions:
#         action_start = action['segment'][0]
#         action_end = action['segment'][1]
#         action_delta = (action_end - action_start) / (num_frames_per_step + 1)    # need num_frames_per_step+1 intervals for num_frames_per_step inner frames
#         for i in range(num_frames_per_step):
#             frame_time = action_start + action_delta * (i+1)    # in seconds
#             frame_id = int( frame_time*(num_frames_per_step + 1) )
#             frame_name = '{}.jpg'.format(str(frame_id).zfill(6))
#             required_frames.add(frame_name)
#     return required_frames


def select_frames(actions, num_frames_per_step, fps=None):
    """
    Return representative frames for actions
    actions: list of action annotations from YCII annotations; each entry provides
             action['segment'] = [start, end] (times in seconds)
    num_frames_per_step: number of frames per action step
    fps: rate at which the frames were sampled; defaults to num_frames_per_step,
         which is the rate the original implementation assumed
    Return required_frames: list of lists of strings containing names of representative frames for each step
    Return required_frames_set: set containing names of representative frames
    """
    if fps is None:
        fps = num_frames_per_step
    # Outer frames are taken for consistency with FI dataset
    # (frame--interval--frame--interval--frame), so there are
    # num_frames_per_step - 1 intervals. With a single frame only i == 0 is
    # used, so the divisor is clamped to avoid a ZeroDivisionError and the
    # selected frame is the one at the start of the segment.
    num_intervals = max(num_frames_per_step - 1, 1)

    required_frames_set = set()
    required_frames = []
    for action in actions:
        action_start, action_end = action['segment'][0], action['segment'][1]
        action_delta = (action_end - action_start) / num_intervals
        step_frames = []
        for i in range(num_frames_per_step):
            frame_time = action_start + action_delta * i    # in seconds
            # sampled frame files are numbered from 1, hence the +1
            frame_id = int(frame_time * fps) + 1
            step_frames.append('{}.jpg'.format(str(frame_id).zfill(6)))
        required_frames.append(step_frames)
        required_frames_set.update(step_frames)
    return required_frames, required_frames_set


def remove_frames(frame_dir, required_frames):
    """
    Delete every file in frame_dir that is not a required frame
    frame_dir: directory path to video frames
    required_frames: set containing names of representative frames
    Does nothing when frame_dir is not an existing directory.
    """
    if not os.path.isdir(frame_dir):
        return
    for fname in os.listdir(frame_dir):
        if fname in required_frames:
            continue
        os.remove(os.path.join(frame_dir, fname))


def get_actions(actions):
    """
    Collect the sentence of every action annotation of a video
    actions: list of action annotations from YCII annotations
    Return actions_text: list of actions text for video (same order as actions)
    """
    return [entry['sentence'] for entry in actions]


def pickle_data(data, pickles_dir, fname):
    """
    Pickle data into bytestreams
    data: data to be pickled
    pickles_dir: directory path to pickled data (created if missing, including parents)
    fname: name of pickled file (written as pickles_dir/fname.pickle, overwriting any existing file)
    """
    os.makedirs(pickles_dir, exist_ok=True)
    # the context manager guarantees the file is flushed and closed even if dump() raises
    with open(os.path.join(pickles_dir, fname+'.pickle'), 'wb') as pickle_out:
        pickle.dump(data, pickle_out)


def depickle_data(pickles_dir, fname):
    """
    Depickle data from bytestreams
    pickles_dir: directory path to pickled data
    fname: name of pickled file (read from pickles_dir/fname.pickle)
    Return data: depickled data, or [] when the file does not exist
    """
    pickle_path = os.path.join(pickles_dir, fname+'.pickle')
    if not os.path.exists(pickle_path):
        return []
    # NOTE: pickle.load can execute arbitrary code; only load files this project wrote itself
    with open(pickle_path, 'rb') as pickle_in:    # closes the handle (it used to be leaked)
        return pickle.load(pickle_in)

In [4]:
def prepare_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', vid_list='/h/mkhan/ece496-capstone/datasets/vid_list/vid_list_ycii_val_short.txt', num_frames_per_step=4, max_detections=5):
    """
    Download and prepare YCII dataset files
    dataset_root: directory path to dataset base
    vid_list: path to a text file with one '<recipe type>/<video id>' entry per line
    num_frames_per_step: number of frames per action step
    max_detections: number of detections per frame

    For every listed video: download it, sample frames, keep only the frames
    selected for its action steps, run the detector on them and pickle the
    results under dataset_root/ycii/<actions_count>/<sample_index>/.
    Videos that could not be downloaded are written to
    dataset_root/vid_list/missing_videos.txt.
    """

    annotations = read_json(os.path.join(dataset_root, 'annotations', 'ycii_annotations_trainval.json'))['database']

    ycii_root = os.path.join(dataset_root, 'ycii')
    os.makedirs(ycii_root, exist_ok=True)

    videos_root = os.path.join(dataset_root, 'ycii_videos')
    os.makedirs(videos_root, exist_ok=True)

    missing_vid_list = []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    detector = Detector(device)

    # read the whole list first so the file is not held open during the long-running loop
    with open(vid_list) as f:
        lines = f.readlines()

    for line in lines:
        entry = line.strip()
        if not entry:
            # blank lines (e.g. a trailing newline) used to crash the unpacking below
            continue
        rcp_type, vid_id = entry.split('/')
        print('[INFO] Processing video {}'.format(vid_id))

        # download the video
        download_video(vid_id, videos_root)

        # check if the video is available
        if get_vid_ext(vid_id, videos_root) is not None:
            print('[INFO] Downloaded video {}'.format(vid_id))
        else:
            # always newline-terminated so entries never run together in the output file
            missing_vid_list.append(entry + '\n')
            print('[INFO] Cannot download video {}'.format(vid_id))
            continue

        # get annotations list (and action count)
        # NOTE(review): raises KeyError if vid_id has no entry in the annotations file
        actions = annotations[vid_id]['annotations']
        actions_list = get_actions(actions)    # list of action annotations for a single video
        actions_count = len(actions_list)
        print('[INFO] Extracted {} actions for video {}'.format(actions_count, vid_id))

        # setup directories
        parent_root = os.path.join(ycii_root, str(actions_count))
        os.makedirs(parent_root, exist_ok=True)

        # set sample counter to next available integer (starts at 0; start at 1 for 1-indexing);
        # non-numeric entries such as '.ipynb_checkpoints' are ignored instead of crashing int()
        existing_indices = [int(name) for name in os.listdir(parent_root) if name.isdigit()]
        sample_index = max(existing_indices) + 1 if existing_indices else 0
        sample_index = str(sample_index).zfill(5)    # required to ensure sortability
        sample_root = os.path.join(parent_root, sample_index)    # all data for this video will be stored under here
        os.makedirs(sample_root, exist_ok=True)

        frames_root = os.path.join(sample_root, 'frames')    # all sampled images for this video will be under here
        os.makedirs(frames_root, exist_ok=True)
        pickles_root = os.path.join(sample_root, 'pickles')    # all raw data for this video will be under here (stored by variable names)
        os.makedirs(pickles_root, exist_ok=True)

        # sample frames at fixed fps (select_frames assumes fps == num_frames_per_step)
        sample_frames(vid_id, videos_root, frames_root, fps=num_frames_per_step)
        print('[INFO] Sampled frames for video {}'.format(vid_id))

        # remove sampled video file (optional)
        remove_video(vid_id, videos_root)
        print('[INFO] Removed video {}'.format(vid_id))

        # select representative frames for actions
        required_frames, required_frames_set = select_frames(actions, num_frames_per_step)
        print('[INFO] Selected frames for video {}'.format(vid_id))

        # remove unused frames
        remove_frames(frames_root, required_frames_set)
        print('[INFO] Removed unused frames for video {}'.format(vid_id))

        # get candidates for images (one detector call per selected frame, in step order)
        frame_paths = [os.path.join(frames_root, frame) for action_frames in required_frames for frame in action_frames]
        candidates = [detector.inference(frame, max_detections=max_detections) for frame in frame_paths]
        print('[INFO] Extracted candidates for video {}'.format(vid_id))

        # save pickled files for vid_id
        pickle_data(vid_id, pickles_root, 'vid_id')
        print('[INFO] Saved vid_id for video {}'.format(vid_id))

        # save pickled files for candidates
        pickle_data(candidates, pickles_root, 'candidates')
        print('[INFO] Saved candidates for video {}'.format(vid_id))

        # save pickled files for frame paths
        pickle_data(frame_paths, pickles_root, 'frame_paths')
        print('[INFO] Saved frame_paths for video {}'.format(vid_id))

        # save pickled files for annotations list
        pickle_data(actions_list, pickles_root, 'actions_list')
        print('[INFO] Saved actions_list for video {}'.format(vid_id))

    # write the missing videos to file; the context manager flushes and closes it
    missing_path = os.path.join(dataset_root, 'vid_list', 'missing_videos.txt')
    os.makedirs(os.path.dirname(missing_path), exist_ok=True)
    with open(missing_path, 'w') as missing_vid:
        missing_vid.writelines(missing_vid_list)

    # sanitize and remove the intermediate files
    # os.system("find {} -name '*.part*' -delete".format(dataset_root))
    os.system("find {} -name '*.f*' -delete".format(dataset_root))

In [36]:
def load_sample(dataset_root='/h/mkhan/ece496-capstone/datasets/ycii', actions_count=10, sample_index=0):
    """
    Read back the pickled data of one prepared sample
    dataset_root: directory path to dataset base
    actions_count: number of actions in sample (bucket id of sample)
    sample_index: index of sample within the bucket
    Return vid_id: video id
    Return candidates: list of candidate data (bboxes, features) for a single video
    Return actions_list: list of action annotations for a single video
    When the sample's pickles directory does not exist, a message is printed
    and ('', [], []) is returned.
    """
    bucket_name = str(actions_count)
    sample_name = str(sample_index).zfill(5)    # sample directories use 5-digit zero-padded names
    pickles_root = os.path.join(dataset_root, bucket_name, sample_name, 'pickles')
    if os.path.isdir(pickles_root):
        return tuple(depickle_data(pickles_root, name) for name in ('vid_id', 'candidates', 'actions_list'))
    print("[INFO] Cannot load data for {}'th sample with {} action(s)".format(sample_index, actions_count))
    return '', [], []

In [40]:
def tokenize_sample(pickles_root, actions_list=None):
    """
    Tokenize actions_list
    pickles_root: directory path to pickled data
    actions_list: list of action annotations for a single video (None is treated as an empty list)
    Return steps: a single tokenized string representing all steps annotations for a single video
    Return entity_count: list of entity counts per action step of a single video
    Return entities: list of list of entities within each action step of a single video
    Return indices: list of lists of indices indicating entity spans within each action step of a single video
    Return max_step_length: word count of the longest pre-tokenized action step for a single video
    All five results are also pickled into pickles_root under their own names.
    """
    if actions_list is None:
        actions_list = []

    # Strip surrounding whitespace and periods
    # Note, periods will be added back later; need to temporarily remove periods before passing into parser
    actions = [action.strip().strip('.').strip() for action in actions_list]

    ENTITY = '[unused2]'    # marker inserted in front of every entity span
    ACTION = '[unused3]'    # marker appended after every action step

    # maximum word count in a single action step; default=0 covers an empty
    # actions_list, for which the previous max(0, max([...])) raised ValueError
    max_step_length = max((len(action.split()) for action in actions), default=0)

    print(actions)
    entities, indices = parse(actions, max_step_length=max_step_length)
    entity_count = [len(entity) for entity in entities]

    # insert in reverse so preceding word indices can still be used for modified actions
    for ind in reversed(indices):
        if not ind:
            continue

        # the first index of a span is decoded here as
        # action_idx * max_step_length + word position within that action
        action_idx, entity_idx = divmod(ind[0], max_step_length)
        words = actions[action_idx].split()
        words.insert(entity_idx, ENTITY)
        actions[action_idx] = ' '.join(words)

    # add the periods back
    actions = [action if action.endswith('.') else action + '.' for action in actions]

    # every step is followed by an ACTION marker, plus one final ACTION marker
    steps = ''.join(action + ' ' + ACTION + ' ' for action in actions) + ACTION

    pickle_data(steps, pickles_root, 'steps')
    pickle_data(entity_count, pickles_root, 'entity_count')
    pickle_data(entities, pickles_root, 'entities')
    pickle_data(indices, pickles_root, 'indices')
    pickle_data(max_step_length, pickles_root, 'max_step_length')

    return steps, entity_count, entities, indices, max_step_length

In [41]:
# Development helper: reload the `parser` module (presumably the project's own
# parser.py — confirm) so that edits to it take effect without restarting the
# kernel, then rebind `parse` to the freshly reloaded implementation.
import importlib
import parser
importlib.reload(parser)
from parser import parse

In [51]:
import os

# Tokenize the action annotations of every sample in the FI dataset.
# Layout walked below: <fi_root>/<number of actions>/<sample index>/pickles
fi_root = "/h/sagar/ece496-capstone/datasets/fi_20"

# sorted for a reproducible processing order (os.listdir order is arbitrary)
num_action_directories = sorted(os.listdir(fi_root))

for num_action in num_action_directories:
    bucket_root = os.path.join(fi_root, num_action)
    if not os.path.isdir(bucket_root):
        continue    # stray file next to the bucket directories
    videos = sorted(os.listdir(bucket_root))
    for video in videos:
        if not video.isdigit():
            continue    # e.g. '.ipynb_checkpoints'; int(video) would raise ValueError
        pickles_root = os.path.join(bucket_root, video, "pickles")
        print(pickles_root)
        _, _, actions_list = load_sample(dataset_root=fi_root, actions_count=num_action, sample_index=int(video))
        if not actions_list:
            continue    # sample could not be loaded (load_sample already reported it)
        steps, entity_count, entities, indices, max_step_length = tokenize_sample(pickles_root, actions_list)
        print(max_step_length, indices, entities)
        

/h/sagar/ece496-capstone/datasets/fi_20/21/00000/pickles
['Bring the vegetable broth to boil', 'remove from heat', 'add shiitake mushrooms', 'cover it', 'rest it for twenty mins', 'Grate the carrot and ginger', 'Tear the seaweed into 1 inch pieces', 'Remove the mushrooms from the broth', 'cut the stems', 'discard them', 'slice the caps', 'Add the tofu, seaweed to the vegetable broth', 'simmer for 3 min', 'Add the broccoli, mushroom and carrots to the broth,', 'cover it', 'simmer for another min', 'Remove one cup of broth', 'add it to a small bowl', 'gently stir it with miso until it is dissolved', 'Add the dissolved miso into the soup', 'stir']
9 [[1, 2, 3], [11], [19, 20], [28], [37], [39, 40], [46, 47], [55, 56], [58, 59, 60], [64, 65], [67, 68], [73, 74], [82], [91, 92], [100], [110, 111], [118], [124], [127], [137, 138], [145, 146], [148], [154], [156, 157, 158], [164], [166], [172, 173, 174], [176, 177], []] [['the vegetable broth'], ['heat'], ['shiitake mushrooms'], ['it'], ['it'

10 [[1], [6, 7], [11], [21, 22], [31, 32], [43, 44], [51, 52, 53], [55, 56], [61, 62], [64, 65], [71, 72, 73], [75], [81], [91, 92], [101, 102], [104], [111, 112], [121, 122], [124, 125], [132, 133]] [['some milk', 'a cup'], ['them'], ['a little salt'], ['sour cream'], ['a spoon'], ['some crushed walnuts', 'the mixture'], ['an apple', 'small cubes'], ['some green grapes', 'it'], ['black grapes'], ['the fruits'], ['the sauce', 'it'], ['the fruits'], ['the salad', 'fresh lettuce'], ['fresh apple']]
/h/sagar/ece496-capstone/datasets/fi_20/14/00002/pickles
['season the bread crumbs with black pepper and salt', 'coating sliced chicken breasts with flour mixture', 'coating of eggs and bread crumbs', 'put oil in the pan', 'fry the chicken for 10 mins', 'Pour some sauce on the plate', 'put the chicken on it', 'put more sauce on top of it', 'Bake the chicken in the oven at 400 degrees for 15 mins', 'Put some mozzarella cheese on the chicken', 'melt the mozzarella by putting the chicken in the o

9 [[3, 4], [15, 16], [19, 20], [22, 23, 24], [28, 29], [37, 38, 39], [41, 42], [52, 53], [55, 56], [58, 59], [64, 65], [67, 68], [73], [75, 76], [82, 83], [85, 86]] [['the garlic'], ['a bowl'], ['the tomoatos', 'the oil mixture'], ['the basal'], ['the oil mixture', 'the dough'], ['the dough'], ['the basal', 'the pizza'], ['the tomatos', 'the pizza'], ['cheese', 'the pizza'], ['the pizza', 'an oven']]
/h/sagar/ece496-capstone/datasets/fi_20/10/00005/pickles
['Put some drained soaked chickpeas in a food processor', 'add some garlic oil', 'Add chopped onion, parsley, 1/2 spoon baking powder, coriander and sea salt', 'Add some sauce of your favourite', 'Blend everything in the food processor', 'Take the falafel paste out in a bowl', 'Add some gluten free flour', 'mix it together', 'Shape it into small balls by hand', 'fry on a baking tray with oil']
12 [[1, 2, 3, 4], [6, 7, 8], [13, 14, 15], [], [37, 38], [49], [51, 52, 53], [61, 62], [63], [66, 67], [73, 74, 75, 76], [85], [97], [99, 100]

['Preheat the oven at 425 degree', 'drizzle little bit of olive oil on both the sides of 4 bread slices', 'Lay the bacon slices on a broiler pan', 'place both bread and bacon in the oven', 'cook for 10-15 minutes', 'cut the avocado into half', 'take off the seed', 'scoop the pulp of half into a bowl', 'Squeeze half of a lemon juice', 'add a pinch of salt and some fresh parsley', 'mash them all together with a fork to get a paste', 'Slice the other half of the avocado', 'take out the bacon and bread toast from the oven', 'To assemble bottom layer of sandwich', 'take 2 slices of bread', 'spread the avocado mixture over', 'Place some tomato slices on the avocado spread and season with little salt and pepper', 'season with little salt and pepper', 'Drizzle a little bit of olive oil over it', 'Top it with the bacon slices', 'on the other slices of bread, place the sliced avocadoes', 'Season avocado slices with little bit of salt', 'top the avocadoes with lettuce leaves', 'Place the avocado 

13 [[], [14, 15, 16], [18, 19], [28], [40, 41, 42], [44, 45], [57, 58, 59], [66, 67], [81], [83, 84], [96, 97, 98], [100, 101], [105, 106], [108, 109, 110], [118, 119], [123, 124], [131, 132, 133, 134], [136, 137], [], [157, 158, 159, 160], [170], [172, 173], [183, 184, 185], [187, 188, 189], [191, 192], [196, 197], [199, 200, 201], [203, 204], [209, 210], [222], [224, 225], [235, 236], [238, 239], [241, 242], [248, 249], [251, 252], [254, 255], [261, 262, 263], [274], [288, 289, 290], [300, 301, 302], [317, 318, 319], [327, 328], [339, 340], [342], [348, 349]] [[], ['the apple pieces', 'a plate'], ['salt'], ['some duck fat', 'a pan'], ['both the sides'], ['little bit'], ['them', 'a plate'], ['3 ounces slice', 'foie gras'], ['the vein', 'the foie gras'], ['some salt', 'both sides'], ['the seasoned foie gras', 'the pan'], [], ['the cooked foie gras'], ['it', 'a towel'], ['the foie gras', 'the bottom slice', 'the apple'], ['one slice', 'the cep mushroom', 'duck fat'], ['the sandwich'], [

9 [[3, 4], [6], [12, 13], [19, 20], [22, 23], [28], [42, 43], [], [55], [61, 62], [], [73, 74], [76, 77], [82], [86], [91], [93], [101, 102]] [['a pan', 'oil'], ['a pan'], ['the flour', 'the butter'], ['milk'], ['the sauce'], [], ['the pasta', 'a pot'], [], ['the pasta', 'a dish'], ['breadcrumbs', 'top'], ['oil', 'top'], ['the oven']]
/h/sagar/ece496-capstone/datasets/fi_20/12/00003/pickles
['Add oil to a pan', 'Add mushrooms and oil to a pan', 'mix soy sauce, oyster sauce, sugar, ponzu, miso, and sake', 'Add red pepper and onion to the pan', 'Add ginger, garlic, and green onion to the pan', 'cut the green onion into large pieces', 'add to the pan', 'Add the sauce and udon noodles to the pan', 'stir', 'place watercress on the dish', 'place the stir-fry on top of the watercress', 'Add carrots, green onion, cilantro, and watercress on top']
10 [[1], [3, 4], [15, 16], [21], [31, 32], [36, 37], [47, 48], [51, 52, 53], [55, 56], [62, 63], [71, 72], [77, 78], [], [93, 94], [101], [104], [106

13 [[1, 2], [14], [16], [27, 28], [30, 31], [33, 34], [36], [40, 41], [43, 44, 45], [53, 54], [66, 67], [79, 80], [82, 84], [], [], [118, 119], [131, 132], [144, 145], [147, 148], [157, 158], [171, 172], [174], [183, 184, 185], [196], [209, 210], [212, 213], [222, 223], [225, 226], [228, 229, 230], [235], [248, 249, 250], [261, 262], [264], [274, 275], [277, 278], [288, 289], [291, 292]] [['a onion'], ['it', 'pieces'], ['the onion', 'a pot', 'olive oil', 'it'], ['some beef', 'the same pot'], ['some chili powder'], ['the ingredients'], ['one tin', 'chopped tomato'], [], [], ['hot water'], ['the ingredients'], ['one tin', 'kidney beans'], ['the beans'], ['10 minutes', 'lid'], ['one red pepper'], ['core'], ['pepper slices', 'the pot'], ['the ingredients', 'the middle', 'a tortilla wrap'], ['it'], ['a large pan'], ['some oil', 'it'], ['the burrito', 'the pan'], ['the burrito', 'the pan']]
/h/sagar/ece496-capstone/datasets/fi_20/11/00003/pickles
['mix the chicken broth, bean paste, soy sauc