In [1]:
import glob
import json
import numpy as np
import os
import pickle
import torch

from detector import Detector
from parser import parse
from transformers import LxmertModel, LxmertTokenizer

PyTorch version 1.6.0 available.


In [2]:
def read_json(path='output.json'):
    """
    Check for valid JSON format and read content
    path: path to JSON file
    Return parsed_json: the decoded JSON document
    Raises AssertionError('Invalid JSON') when the file content cannot be decoded
    """
    # the context manager closes the handle even if reading fails part-way
    with open(path) as file:
        line = file.read().replace('\n', ' ')
    try:
        parsed_json = json.loads(line)
    except ValueError as err:    # json.JSONDecodeError is a subclass of ValueError
        # raise explicitly: an `assert False` statement is removed when Python runs with -O,
        # which would let invalid input fall through to the return below
        raise AssertionError('Invalid JSON') from err
    return parsed_json

def get_vid_ext(vid_id, video_dir):
    """
    Find the file extension under which a video was saved
    vid_id: video id
    video_dir: directory path to video files
    Return: '.mp4', '.mkv' or '.webm' (probed in that order) for the first
            file <video_dir>/<vid_id><ext> that exists, or None if none exists
    """
    base_path = os.path.join(video_dir, vid_id)
    for candidate_ext in ('.mp4', '.mkv', '.webm'):
        if os.path.exists(base_path + candidate_ext):
            return candidate_ext
    # no matching file on disk
    return None

In [3]:
def download_video(vid_id, video_dir):
    """
    Download video with the youtube-dl command line tool
    vid_id: video id
    video_dir: directory path to video files
    The output template passed to youtube-dl is <video_dir>/<vid_id> (no extension).
    The exit status of the command is not checked.
    """
    import shlex    # local import: only needed to quote the shell arguments below

    vid_url = 'www.youtube.com/watch?v=' + vid_id
    vid_prefix = os.path.join(video_dir, vid_id)
    # Quote both arguments before handing them to the shell: the URL contains '?'
    # (a glob character) and the directory may contain spaces or other shell
    # metacharacters, any of which would otherwise change the command.
    os.system(' '.join(("youtube-dl -o", shlex.quote(vid_prefix), shlex.quote(vid_url))))


def sample_frames(vid_id, video_dir, frame_dir, fps=5):
    """
    Sample video into frames at fixed fps using the ffmpeg command line tool
    vid_id: video id
    video_dir: directory path to video files
    frame_dir: directory path to video frames (created if missing)
    fps: fps for frame extraction
    Frames are written as <frame_dir>/%06d.jpg. The exit status of ffmpeg is not checked.
    """
    import shlex    # local import: only needed to quote the shell arguments below

    # makedirs also creates missing parent directories
    os.makedirs(frame_dir, exist_ok=True)

    vid_ext = get_vid_ext(vid_id, video_dir)
    if vid_ext is None:
        # previously the literal text "None" was formatted into the input path
        # and ffmpeg failed on a non-existent file; report it and stop instead
        print('[INFO] Cannot find video file for {} in {}'.format(vid_id, video_dir))
        return

    video_path = os.path.join(video_dir, vid_id + vid_ext)
    frame_pattern = os.path.join(frame_dir, '%06d.jpg')
    # quote the paths so directories containing spaces or shell metacharacters work
    ff_command = 'ffmpeg -i {} -y -an -qscale 0 -vf fps={} {}'.format(shlex.quote(video_path), fps, shlex.quote(frame_pattern))
    os.system(ff_command)


def remove_video(vid_id, video_dir):
    """
    Delete the video file stored for a video id
    vid_id: video id
    video_dir: directory path to video files
    The extension is looked up with get_vid_ext.
    """
    video_path = os.path.join(video_dir, vid_id) + get_vid_ext(vid_id, video_dir)
    os.remove(video_path)


# def select_frames(actions, num_frames_per_step):
#     """
#     Return representative frames for actions
#     actions: list of action annotations from YCII annotations
#     num_frames_per_step: number of frames per action step
#     Return required_frames: set containing names of representative frames
#     """
#     required_frames = set()
#     for action in actions:
#         action_start = action['segment'][0]
#         action_end = action['segment'][1]
#         action_delta = (action_end - action_start) / (num_frames_per_step + 1)    # need num_frames_per_step+1 intervals for num_frames_per_step inner frames
#         for i in range(num_frames_per_step):
#             frame_time = action_start + action_delta * (i+1)    # in seconds
#             frame_id = int( frame_time*(num_frames_per_step + 1) )
#             frame_name = '{}.jpg'.format(str(frame_id).zfill(6))
#             required_frames.add(frame_name)
#     return required_frames


def select_frames(actions, num_frames_per_step):
    """
    Return representative frames for actions
    actions: list of action annotations from YCII annotations; each entry needs a
             'segment' value whose first two items are the start and end time in seconds
    num_frames_per_step: number of frames per action step
    Return required_frames: list of lists of strings containing names of representative frames for each step
    Return required_frames_set: set containing names of representative frames

    The frames of a step are evenly spaced from the segment start to the segment
    end, both included (frame--interval--frame--interval--frame), for consistency
    with the FI dataset. With num_frames_per_step == 1 the single frame is taken
    at the segment start.
    """
    required_frames_set = set()
    required_frames = []
    # num_frames_per_step frames are separated by num_frames_per_step - 1 intervals
    num_intervals = num_frames_per_step - 1
    for action in actions:
        action_start = action['segment'][0]
        action_end = action['segment'][1]
        # a single frame has no interval: avoid dividing by zero and stay at the start
        action_delta = (action_end - action_start) / num_intervals if num_intervals else 0
        step_frames = []
        for i in range(num_frames_per_step):
            frame_time = action_start + action_delta * i    # in seconds
            # time -> frame number, using num_frames_per_step as the frames-per-second rate;
            # the +1 presumably matches ffmpeg numbering its output files from 000001 -- TODO confirm
            frame_id = int(frame_time * num_frames_per_step) + 1
            frame_name = '{}.jpg'.format(str(frame_id).zfill(6))
            step_frames.append(frame_name)
            required_frames_set.add(frame_name)
        required_frames.append(step_frames)
    return required_frames, required_frames_set


def remove_frames(frame_dir, required_frames):
    """
    Delete every entry of frame_dir whose name is not listed as required
    frame_dir: directory path to video frames
    required_frames: set containing names of representative frames
    Does nothing when frame_dir is not an existing directory.
    """
    if not os.path.isdir(frame_dir):
        return
    for frame_name in os.listdir(frame_dir):
        if frame_name in required_frames:
            continue    # keep representative frames
        os.remove(os.path.join(frame_dir, frame_name))


def get_actions(actions):
    """
    Collect the annotation sentence of every action of a video
    actions: list of action annotations from YCII annotations
    Return actions_text: list with the 'sentence' value of each action, in order
    """
    return [annotation['sentence'] for annotation in actions]


def pickle_data(data, pickles_dir, fname):
    """
    Pickle data into bytestreams
    data: data to be pickled
    pickles_dir: directory path to pickled data (created, including parents, if missing)
    fname: name of pickled file (without the '.pickle' extension, which is appended)
    """
    # makedirs also creates missing parent directories, which os.mkdir cannot do
    os.makedirs(pickles_dir, exist_ok=True)
    # the context manager closes (and flushes) the file even if pickling raises
    with open(os.path.join(pickles_dir, fname + '.pickle'), 'wb') as pickle_out:
        pickle.dump(data, pickle_out)


def depickle_data(pickles_dir, fname):
    """
    Depickle data from bytestreams
    pickles_dir: directory path to pickled data
    fname: name of pickled file (without the '.pickle' extension, which is appended)
    Return data: depickled data, or an empty list when the file does not exist
    """
    pickle_path = os.path.join(pickles_dir, fname + '.pickle')
    if not os.path.exists(pickle_path):
        return []
    # NOTE: pickle.load can execute arbitrary code; only load files this project wrote itself
    # the context manager closes the handle, which was previously left open
    with open(pickle_path, 'rb') as pickle_in:
        return pickle.load(pickle_in)

In [4]:
def prepare_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', vid_list='/h/mkhan/ece496-capstone/datasets/vid_list/vid_list_ycii_val_short.txt', num_frames_per_step=4, max_detections=5):
    """
    Download and prepare YCII dataset files
    dataset_root: directory path to dataset base
    vid_list: path to a text file with one '<recipe type>/<video id>' entry per line
    num_frames_per_step: number of frames per action step (also used as the fps for frame sampling)
    max_detections: number of detections per frame

    For every video that can be downloaded, the data is stored under
    <dataset_root>/ycii/<number of actions>/<sample index>/ in two sub-directories:
    'frames' (representative frames) and 'pickles' (vid_id, candidates, frame_paths,
    actions_list). Entries that could not be downloaded are written to
    <dataset_root>/vid_list/missing_videos.txt.
    """

    annotations = read_json(os.path.join(dataset_root, 'annotations', 'ycii_annotations_trainval.json'))['database']

    ycii_root = os.path.join(dataset_root, 'ycii')
    os.makedirs(ycii_root, exist_ok=True)

    videos_root = os.path.join(dataset_root, 'ycii_videos')
    os.makedirs(videos_root, exist_ok=True)

    missing_vid_list = []

    detector = Detector()

    with open(vid_list) as f:
        lines = f.readlines()

    for line in lines:
        entry = line.strip()
        if not entry:
            continue    # tolerate blank lines (e.g. a trailing newline at the end of the list)
        rcp_type, vid_id = entry.split('/')    # rcp_type is not used below
        print('[INFO] Processing video {}'.format(vid_id))

        # download the video
        download_video(vid_id, videos_root)

        # check if the video is available
        if get_vid_ext(vid_id, videos_root) is not None:
            print('[INFO] Downloaded video {}'.format(vid_id))
        else:
            missing_vid_list.append(entry)
            print('[INFO] Cannot download video {}'.format(vid_id))
            continue

        # get annotations list (and action count)
        actions = annotations[vid_id]['annotations']
        actions_list = get_actions(actions)    # list of action annotations for a single video
        actions_count = len(actions_list)
        print('[INFO] Extracted {} actions for video {}'.format(actions_count, vid_id))

        # setup directories
        parent_root = os.path.join(ycii_root, str(actions_count))
        os.makedirs(parent_root, exist_ok=True)

        # set sample counter to next available integer; entries that are not plain
        # numbers (stray files, hidden files) are ignored instead of crashing int()
        taken_indices = [int(name) for name in os.listdir(parent_root) if name.isdigit()]
        sample_index = max(taken_indices) + 1 if taken_indices else 0    # change the 0 to 1 to ensure 1-indexing for samples
        sample_index = str(sample_index).zfill(5)    # required to ensure sortability
        sample_root = os.path.join(parent_root, sample_index)    # all data for this video will be stored under here
        os.makedirs(sample_root, exist_ok=True)

        frames_root = os.path.join(sample_root, 'frames')    # all sampled images for this video will be under here
        os.makedirs(frames_root, exist_ok=True)
        pickles_root = os.path.join(sample_root, 'pickles')    # all raw data for this video will be under here (stored by variable names)
        os.makedirs(pickles_root, exist_ok=True)

        # sample frames at fixed fps
        sample_frames(vid_id, videos_root, frames_root, fps=num_frames_per_step)
        print('[INFO] Sampled frames for video {}'.format(vid_id))

        # remove sampled video file (optional)
        remove_video(vid_id, videos_root)
        print('[INFO] Removed video {}'.format(vid_id))

        # select representative frames for actions
        required_frames, required_frames_set = select_frames(actions, num_frames_per_step)
        print('[INFO] Selected frames for video {}'.format(vid_id))

        # remove unused frames
        remove_frames(frames_root, required_frames_set)
        print('[INFO] Removed unused frames for video {}'.format(vid_id))

        # get candidates for images (one detector call per representative frame, in step order)
        frame_paths = [os.path.join(frames_root, frame) for action_frames in required_frames for frame in action_frames]
        candidates = [detector.inference(frame, max_detections=max_detections) for frame in frame_paths]
        print('[INFO] Extracted candidates for video {}'.format(vid_id))

        # save pickled files for vid_id
        pickle_data(vid_id, pickles_root, 'vid_id')
        print('[INFO] Saved vid_id for video {}'.format(vid_id))

        # save pickled files for candidates
        pickle_data(candidates, pickles_root, 'candidates')
        print('[INFO] Saved candidates for video {}'.format(vid_id))

        # save pickled files for frame paths
        pickle_data(frame_paths, pickles_root, 'frame_paths')
        print('[INFO] Saved frame_paths for video {}'.format(vid_id))

        # save pickled files for annotations list
        pickle_data(actions_list, pickles_root, 'actions_list')
        print('[INFO] Saved actions_list for video {}'.format(vid_id))

    # write the missing videos to file; the context manager makes sure the list is flushed and closed
    with open(os.path.join(dataset_root, 'vid_list', 'missing_videos.txt'), 'w') as missing_vid:
        for entry in missing_vid_list:
            missing_vid.write(entry + '\n')

    # sanitize and remove the intermediate files
    # os.system("find {} -name '*.part*' -delete".format(dataset_root))
    os.system("find {} -name '*.f*' -delete".format(dataset_root))

In [5]:
def load_sample(dataset_root='/h/mkhan/ece496-capstone/datasets', actions_count=10, sample_index=0):
    """
    Read the pickled data of one prepared sample back from disk
    dataset_root: directory path to dataset base
    actions_count: number of actions in sample (bucket id of sample)
    sample_index: index of sample within the bucket
    Return vid_id: video id
    Return candidates: list of candidate data (bboxes, features) for a single video
    Return actions_list: list of action annotations for a single video
    When the sample's pickles directory does not exist, a message is printed
    and ('', [], []) is returned.
    """
    sample_dir_name = str(sample_index).zfill(5)
    pickles_root = os.path.join(dataset_root, 'ycii', str(actions_count), sample_dir_name, 'pickles')

    if os.path.isdir(pickles_root):
        vid_id = depickle_data(pickles_root, 'vid_id')
        candidates = depickle_data(pickles_root, 'candidates')
        actions_list = depickle_data(pickles_root, 'actions_list')
        return vid_id, candidates, actions_list

    # nothing stored for this bucket/index combination
    print('[INFO] Cannot load data for {}\'th sample with {} action(s)'.format(sample_index, actions_count))
    return '', [], []

In [6]:
# USAGE: Run this just once to prepare and save data on disk
# NOTE: every run downloads each listed video again and stores it under the next
# free sample index, so running this cell a second time adds duplicate samples.
# This call overrides the function defaults (num_frames_per_step=4, max_detections=5).
prepare_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', vid_list='/h/mkhan/ece496-capstone/datasets/vid_list/vid_list_ycii_val_short.txt', num_frames_per_step=5, max_detections=20)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /h/mkhan/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.
[INFO] Processing video sdB8qBlLS2E
[INFO] Cannot download video sdB8qBlLS2E
[INFO] Processing video fn9anlEL4FI
[INFO] Downloaded video fn9anlEL4FI
[INFO] Extracted 8 actions for video fn9anlEL4FI
[INFO] Sampled frames for video fn9anlEL4FI
[INFO] Removed video fn9anlEL4FI
[INFO] Selected frames for video fn9anlEL4FI
[INFO] Removed unused frames for 

In [7]:
# Required Cell

# USAGE: Run this to load the sample_index'th sample with actions_count actions from saved files
# actions_count=8 and sample_index=0 select <dataset_root>/ycii/8/00000/pickles.
# If that directory does not exist, load_sample prints a message and returns ('', [], []).
vid_id, candidates, actions_list = load_sample(dataset_root='/h/mkhan/ece496-capstone/datasets', actions_count=8, sample_index=0)

In [8]:
# Inspect the values returned by load_sample.
# NOTE(review): print(candidates) writes every stored tensor to the cell output;
# consider printing len(candidates) or a single entry instead.
print(vid_id)
print(candidates)
print(actions_list)

fn9anlEL4FI
[(tensor([[[6.5771e-01, 7.9962e-02, 9.9711e-01, 8.7974e-01],
         [6.4448e-01, 2.2019e-01, 9.9700e-01, 9.9185e-01],
         [6.3769e-01, 3.0864e-02, 9.3216e-01, 9.7160e-01],
         [5.6988e-01, 1.0561e-01, 9.7087e-01, 9.2660e-01],
         [5.2192e-01, 2.3066e-01, 9.4790e-01, 9.9412e-01],
         [2.9683e-01, 1.6157e-01, 9.1868e-01, 8.2801e-01],
         [7.8615e-01, 0.0000e+00, 9.9944e-01, 8.4471e-01],
         [5.8736e-01, 1.7402e-01, 8.9894e-01, 9.9217e-01],
         [1.4740e-01, 4.1034e-01, 8.3588e-01, 9.9516e-01],
         [3.0148e-01, 3.5527e-01, 9.3962e-01, 9.9401e-01],
         [4.6350e-01, 8.9982e-02, 9.2833e-01, 9.1800e-01],
         [2.2532e-01, 2.7684e-01, 8.7477e-01, 9.4955e-01],
         [3.8670e-01, 1.0931e-01, 1.0000e+00, 8.5437e-01],
         [4.1013e-01, 1.8069e-01, 8.7457e-01, 8.9694e-01],
         [5.7178e-02, 1.6893e-01, 7.0676e-01, 8.2941e-01],
         [6.4896e-01, 8.1653e-04, 9.9217e-01, 3.7995e-01],
         [3.6432e-01, 2.5123e-01, 9.9852e-

In [9]:
# # Old Cell (do NOT run)

# # USAGE: Run this to load all candidate and actions data from disk
# all_candidates, all_actions = load_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets')

In [17]:
def tokenize_sample(pickles_root, actions_list=None):
    """
    Tokenize actions_list
    pickles_root: directory path to pickled data
    actions_list: list of action annotations for a single video (None is treated as an empty list)
    Return steps: a single tokenized string representing all steps annotations for a single video
    Return entity_count: list of entity counts per action step of a single video
    Return entities: list of list of entities within each action step of a single video
    Return indices: list of lists of indices indicating entity spans within each action step of a single video
    Return max_step_length: word count of the longest pre-tokenized action step for a single video

    All five return values are also pickled into pickles_root under their own names.
    """
    # Strip leading/trailing periods (whitespace is left untouched)
    # Note, periods will be added back later; need to temporarily remove periods before passing into parser
    actions = [action.strip('.') for action in (actions_list or [])]

    NULL = '[unused1]'
    ENTITY = '[unused2]'
    ACTION = '[unused3]'

    # maximum word count in a single action step; default=0 covers an empty
    # actions list, for which a bare max() raises ValueError
    max_step_length = max((len(action.split()) for action in actions), default=0)

    # NOTE(review): parse is imported from the project's parser module; its behaviour
    # for an empty actions list is not visible here -- verify before relying on it
    entities, indices = parse(actions, max_step_length=max_step_length)
    entities.append([NULL])
    entity_count = [len(entity) for entity in entities]

    # Each entry of indices starts with a flat word position that is decoded below as
    # action_idx * max_step_length + word position inside that action.
    # insert in reverse so preceding word indices can still be used for modified actions
    for ind in reversed(indices):
        action_idx, entity_idx = divmod(ind[0], max_step_length)
        words = actions[action_idx].split()
        words.insert(entity_idx, ENTITY)
        actions[action_idx] = ' '.join(words)

    # add the periods back
    actions = [action + '.' if not action.endswith('.') else action for action in actions]

    # every step is followed by an ACTION marker; the sequence ends with NULL ACTION
    steps = ''.join(action + ' ' + ACTION + ' ' for action in actions)
    steps = steps + NULL + ' ' + ACTION    # TODO: check if this is correct

    pickle_data(steps, pickles_root, 'steps')
    pickle_data(entity_count, pickles_root, 'entity_count')
    pickle_data(entities, pickles_root, 'entities')
    pickle_data(indices, pickles_root, 'indices')
    pickle_data(max_step_length, pickles_root, 'max_step_length')

    return steps, entity_count, entities, indices, max_step_length

In [18]:
# # Old Cell (do NOT run)
# def tokenize_sample(pickles_root, actions_list=[]):
#     """
#     Tokenize actions_list
#     pickles_root: directory path to pickled data
#     actions_list: list of action annotations for a single video
#     Return steps: a single tokenized string representing all steps annotations for a single video
#     Return entity_count: list of entity counts per action step of a single video
#     Return entities: list of list of entities within each action step of a single video
#     Return indices: list of lists of indices indicating entity spans within each action step of a single video
#     """
#     # Strip all whitespaces and periods
#     # Note, periods will be added back later; need to temporarily remove periods before passing into parser
    
#     actions = [action.strip('.') for action in actions_list]
#     print(actions)
    
#     NULL = '[unused1]'
#     ENTITY = '[unused2]'
#     ACTION = '[unused3]'
    
#     max_step_length = max(0, max([len(action.split()) for action in actions]))    # maximum word count in a single action step
#     print(max_step_length)
    
#     tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
#     tokenizer.add_special_tokens({"additional_special_tokens": [NULL, ENTITY]})   # TODO: check if list should contain ACTION as well
#     tokenizer.encode([NULL, ENTITY], add_special_tokens=True)    # TODO: check if this tokenization is required
    
#     entities, indices = parse(actions, max_step_length=max_step_length)
#     entities.append([NULL])
#     entity_count = [len(entity) for entity in entities]
    
#     # insert in reverse so preceding word indices can still be used for modified actions
#     for ind in reversed(indices):
#         action_idx = ind[0]//max_step_length
#         entity_idx = ind[0]%max_step_length
#         words = actions[action_idx].split()
#         words.insert(entity_idx, ENTITY)
#         actions[action_idx] = ' '.join(words)

#     actions = [action + '.' if not action.endswith('.') else action for action in actions]

#     tokens_steps = tokenizer(
#                     actions,
#                     return_token_type_ids=False,
#                     return_attention_mask=False,
# #                     add_special_tokens=True,
#                     add_special_tokens=False,
# #                     padding="max_length",
#                     max_length=max_step_length + 2,
#                     return_tensors="pt"
#                 )
#     tokens_steps = tokens_steps['input_ids'].flatten()
#     tokens_steps = tokens_steps[tokens_steps != 101]
#     tokens_steps = tokenizer.decode(tokens_steps) + ' ' + NULL + ' ' + ACTION    # TODO: check if this is correct
#     steps = tokens_steps
    
#     pickle_data(steps, pickles_root, 'steps')
#     pickle_data(entity_count, pickles_root, 'entity_count')
#     pickle_data(entities, pickles_root, 'entities')
#     pickle_data(indices, pickles_root, 'indices')
    
#     return steps, entity_count, entities, indices



In [19]:
# Required Cell

# USAGE: Run this to tokenize actions_list of a single video and save data on disk
# pickles_root must point at the same sample that actions_list was loaded from
# (the load_sample call above uses actions_count=8, sample_index=0 -> ycii/8/00000/pickles).
pickles_root = '/h/mkhan/ece496-capstone/datasets/ycii/8/00000/pickles'
# steps, entity_count, entities, indices = tokenize_sample(pickles_root, actions_list)
steps, entity_count, entities, indices, max_step_length = tokenize_sample(pickles_root, actions_list)

In [20]:
print(actions_list)

['add garram masala seeds and a bay leaf to the oil', 'add the lamb to the pot', 'add garlic ginger paste and chopped onions to the pot', 'add chili tumeric coriander cumin and salt', 'add water to the pot', 'add potatos to the pot', 'add the tomatos to the pot', 'add chili to the pot']


In [21]:
print(steps)
print(entity_count)
print(entities)
print(indices)
print(max_step_length)

add [unused2] garram masala seeds and a bay leaf to [unused2] the oil. [unused3] add [unused2] the lamb to [unused2] the pot. [unused3] add [unused2] garlic ginger paste and chopped onions to the pot. [unused3] add [unused2] chili tumeric coriander cumin and salt. [unused3] add [unused2] water to [unused2] the pot. [unused3] add [unused2] potatos to [unused2] the pot. [unused3] add [unused2] the tomatos to [unused2] the pot. [unused3] add [unused2] chili to [unused2] the pot. [unused3] [unused1] [unused3]
[2, 2, 1, 1, 2, 2, 2, 2, 1]
[['garram masala seeds', 'the oil'], ['the lamb', 'the pot'], ['garlic ginger paste'], ['chili tumeric coriander cumin'], ['water', 'the pot'], ['potatos', 'the pot'], ['the tomatos', 'the pot'], ['chili', 'the pot'], ['[unused1]']]
[[1, 2, 3], [9, 10], [12, 13], [15, 16], [23, 24, 25], [34, 35, 36, 37], [45], [47, 48], [56], [58, 59], [67, 68], [70, 71], [78], [80, 81]]
11


In [None]:
# # Old Cell (do NOT run)
# # Handles building steps and entity_count


# # for actions in all_actions:
# #     for i, action in enumerate(actions):
# #         print(str(i) + ': ' + action)


# # Strip all whitespaces and periods
# # Note, periods will be added back later; need to temporarily remove periods before passing into parser
# all_actions = [[action.strip('.') for action in actions] for actions in all_actions]

# NULL = '[unused1]'
# ENTITY = '[unused2]'
# ACTION = '[unused3]'

# # TODO: iterate through all annotations to find max_step_length
# MAX_STEP_LENGTH = 30

# tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased", pad_token=PAD)
# tokenizer.add_special_tokens({"additional_special_tokens": [NULL, PAD, ENTITY]})
# tokenizer.encode([NULL, PAD, ENTITY], add_special_tokens=True)

# # TODO: iterate through all annotations to find max_step_length
# max_step_length = 30

# entity_count = []
# steps = []

# for idx in range(len(all_actions)):
    
#     entities, indices = parse(all_actions[idx], max_step_length=max_step_length)
#     entities.append([NULL])
#     entity_count.append([len(en) for en in entities])
    
#     # insert in reverse so preceding word indices can still be used for modified actions
#     for ind in reversed(indices):
#         action_idx = ind[0]//max_step_length
#         entity_idx = ind[0]%max_step_length
#         words = all_actions[idx][action_idx].split()
#         words.insert(entity_idx, ENTITY)
#         all_actions[idx][action_idx] = ' '.join(words)

#         all_actions[idx] = [action + '.' if not action.endswith('.') else action for action in all_actions[idx]]

#     tokens_steps = tokenizer(
#                     all_actions[idx],
#                     return_token_type_ids=False,
#                     return_attention_mask=False,
#                     add_special_tokens=True,
#                     padding="max_length",
#                     max_length=MAX_STEP_LENGTH + 2,
#                     return_tensors="pt"
#                 )
#     tokens_steps = tokens_steps['input_ids'].flatten()
#     tokens_steps = tokens_steps[tokens_steps != 101]
#     tokens_steps = tokenizer.decode(tokens_steps) + ' ' + NULL
#     steps.append(tokens_steps)


# # for actions in all_actions:
# #     for i, action in enumerate(actions):
# #         print(str(i) + ': ' + action)

In [None]:
# # Old Cell (do NOT run)
#
# for idx in range(len(all_actions)):
#     print(idx)
#     print(len(all_actions[idx]))
#     for action in all_actions[idx]:
#         print(action)

In [None]:
# WIP Cell: previously, all_candidates contained candidates from multiple videos.
# This needs to be adjusted given a single video input
# If a single video is used at a time,
# try replacing "candidate[0].numpy() for candidate in all_candidates"
# with "candidates.numpy()"
# And check if dimensionality is as expected for model input
#
# NOTE(review): all_candidates is not assigned by any active cell in this notebook
# (its only assignment is in a commented-out "Old Cell"), so this cell raises
# NameError on a fresh kernel. The candidates value returned by load_sample is a
# Python list, so "candidates.numpy()" as suggested above would not work as
# written either -- TODO decide on the intended single-video input.

# Reshape all of the candidate bounding box and feature tensors.
# Element [0] of each candidate is used as the boxes and element [1] as the features.
boxes = torch.tensor([candidate[0].numpy() for candidate in all_candidates]).squeeze(1)
features = torch.tensor([candidate[1].numpy() for candidate in all_candidates]).squeeze(1)

# merge the first two dimensions into one
boxes = boxes.flatten(start_dim=0, end_dim=1)
features = features.flatten(start_dim=0, end_dim=1)

# TODO: please check this stacking here. This was originally adapted from the train notebook.
# But I am not sure why/whether this is needed
# torch.stack adds a new leading dimension of size 2 that holds two copies of the same tensor.
boxes = torch.stack((boxes, boxes))
features = torch.stack((features, features))

In [None]:
print(steps)

In [None]:
print(features)

In [None]:
print(boxes)

In [None]:
print(entity_count)

In [None]:
print(entities)

In [None]:
print(indices)

In [None]:
%matplotlib inline

In [None]:
import matplotlib.image as mpimg
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import os

from PIL import Image

In [None]:
def get_first_frame(action_id, action_start, action_end, num_frames_per_step):
    """
    Locate the first representative frame of an action segment
    action_id: id of action within video (not used in the computation)
    action_start: start time (in sec) of action
    action_end: end time (in sec) of action
    num_frames_per_step: number of frames per action step
    Returns frame_name: name of first frame for an action
    Returns action_delta: time delay into the segment when the frame appears (in sec)
    """
    # num_frames_per_step inner frames split the segment into num_frames_per_step+1 intervals
    interval_count = num_frames_per_step + 1
    action_delta = (action_end - action_start) / interval_count
    first_frame_time = action_start + action_delta    # in seconds
    # the interval count doubles as the frames-per-second factor for the frame number
    frame_number = int(first_frame_time * interval_count)
    return '{:06d}.jpg'.format(frame_number), action_delta

In [None]:
def visualize_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', num_frames_per_step=4):
    """
    Visualize the first frame of each step in dataset videos
    dataset_root: directory path to dataset base
    num_frames_per_step: number of frames per action step

    For every video directory under <dataset_root>/ycii_frames and every annotated
    segment of that video, one figure is shown with the ground truth boxes from
    yc2_bb_val_annotations.json drawn in green and the detector candidates
    (max_detections=10) drawn in red. Nothing is returned.
    """

    # one sub-directory per video id is expected under ycii_frames
    vids_list = os.listdir(os.path.join(dataset_root, 'ycii_frames'))
    annotations_bb_all = read_json(os.path.join(dataset_root, 'yc2_bb/annotations', 'yc2_bb_val_annotations.json'))['database']
    
    detector = Detector()
#     visualize_candidates = []
    
    for vid_id in vids_list:
        print('vid_id: ' + vid_id)

        annotations_bb_vid = annotations_bb_all[vid_id]
        vid_segments = annotations_bb_vid['segments']

        # image size the bounding boxes were annotated at; used below to rescale them
        annot_width = annotations_bb_vid['rwidth']
        annot_height = annotations_bb_vid['rheight']
        print('annotated image size: ' + str(annot_width) + ', ' + str(annot_height))

        for segment_id in sorted(vid_segments):
#             print(segment_id)    # 'segment_id'th segment of the video 'vid_id'
#             print(vid_segments[segment_id]['segment'])    # displays start and end time (in sec) of the segment

            frame_name, segment_delta = get_first_frame(segment_id, vid_segments[segment_id]['segment'][0], vid_segments[segment_id]['segment'][1], num_frames_per_step)
            print('frame_name: ' + frame_name)
            print('segment_start: ' + str(vid_segments[segment_id]['segment'][0]))
            print('segment_delta: ' + str(segment_delta))            
            
            # index into each object's 'boxes' list
            # NOTE(review): using the rounded delay in seconds as the index presumes one
            # annotated box per second of the segment -- TODO confirm against the annotation format
            segment_frame_id = int(round(segment_delta))
            
            # read the image from disk
            img = Image.open(os.path.join(dataset_root, 'ycii_frames', vid_id, frame_name))

            # get saved image dimensions
            saved_width = img.size[0]
            saved_height = img.size[1]
            print('saved image size: ' + str(saved_width) + ', ' + str(saved_height))
            
            # create figure and axes
            fig, ax = plt.subplots()
            
            # display the image on the plot
            ax.imshow(img)

            # add ground truth bounding boxes
            for entity in vid_segments[segment_id]['objects']:
                label = entity['label']
                bboxes = entity['boxes']
                bbox = bboxes[segment_frame_id]
                
                # boxes flagged as occluded are not drawn
                if (bbox['occluded']==1):
                    continue
                
#                 print(bbox)
                print('label: ' + label)
                
                # rescale the corner coordinates from the annotated size to the saved image size
                left = int(bbox['xtl'] / annot_width * saved_width)
                top = int(bbox['ytl'] / annot_height * saved_height)
                width = int( (bbox['xbr'] - bbox['xtl']) / annot_width * saved_width)
                height = int( (bbox['ybr'] - bbox['ytl']) / annot_height * saved_height)
                

                # Create a Rectangle patch ( (x, y), width, height )
                rect = patches.Rectangle((left, top), width, height, linewidth=2, edgecolor='g', facecolor='none')
                # Add the patch to the Axes
                ax.add_patch(rect)
            
            
            # get candidates for images
            candidates = detector.inference(os.path.join(dataset_root, 'ycii_frames', vid_id, frame_name), max_detections=10)
#             print(candidates)
#             visualize_candidates.append(candidates)
            # element [0] of the detector result is used as the boxes; the reshaping below
            # turns it into a plain list of boxes
            boxes = torch.tensor([candidates[0].numpy()]).squeeze(1)
            boxes = boxes.flatten(start_dim=0, end_dim=1)
            boxes = boxes.numpy().tolist()
            
            # add detector candidate bounding boxes
            for candidate_box in boxes:
                print(candidate_box)
                
                # the four box values are treated as fractions of the image size,
                # in the order left, top, right, bottom
                left = int(round(candidate_box[0] * saved_width))
                top = int(round(candidate_box[1] * saved_height))
                width = int( ( candidate_box[2] - candidate_box[0] ) * saved_width )
                height = int( ( candidate_box[3] - candidate_box[1] ) * saved_height )
                
                # Create a Rectangle patch ( (x, y), width, height )
                rect = patches.Rectangle((left, top), width, height, linewidth=1, edgecolor='r', facecolor='none')
                # Add the patch to the Axes
                ax.add_patch(rect)

            # display the plot
            plt.show()
#             return visualize_candidates



In [None]:
# Optional Cell
# USAGE: Run this to display first frame of each step in dataset videos
# NOTE(review): visualize_dataset reads frames from <dataset_root>/ycii_frames/<vid_id>/,
# while prepare_dataset in this notebook writes frames to
# <dataset_root>/ycii/<actions_count>/<sample_index>/frames/ -- the ycii_frames layout
# is not produced by any active cell here.
visualize_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', num_frames_per_step=4)
# candidates = visualize_dataset(dataset_root='/h/mkhan/ece496-capstone/datasets', num_frames_per_step=4)

In [None]:
# NOTE: Do not execute the cells below, they are rough code for testing stuff out

In [None]:
print(boxes.size())
print(features.size())

In [None]:
for vid in os.listdir(os.path.join('/h/mkhan/ece496-capstone/datasets/ycii_frames/', '')):
    print(os.listdir(os.path.join('/h/mkhan/ece496-capstone/datasets/ycii_frames/', vid)))
    print('\n')

In [None]:
8+11+8

In [None]:
import collections

# Summary statistics for the training split: number of steps per video,
# number of words per step sentence, and the shortest annotated step.
dataset_root = '/h/mkhan/ece496-capstone/datasets'
annotations = read_json(os.path.join(dataset_root, 'annotations', 'ycii_annotations_trainval.json'))['database']

actions_lenths = []    # number of annotated steps, one entry per video
steps_lenths = []      # whitespace-separated word count, one entry per step
step_durations = []    # segment end minus segment start, one entry per step
vid_count = 0

with open(os.path.join(dataset_root, 'vid_list', 'vid_list_ycii_train.txt')) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the two-way unpack below.
            continue
        # Each entry has the form '<recipe type>/<video id>'; only the id is used.
        rcp_type, vid_id = line.split('/')
        actions = annotations[vid_id]['annotations']
        actions_lenths.append(len(actions))
        for action in actions:
            steps_lenths.append(len(action['sentence'].split()))
            # Record every step's duration, not only the first step of the
            # video, so the reported minimum really is the minimum step duration.
            step_durations.append(action['segment'][1] - action['segment'][0])
        vid_count += 1

# Derive the extremes from the collected values rather than from sentinel
# starting points (a fixed 1000 / 100 start silently caps the reported minimum).
max_len = max(actions_lenths, default=0)
min_len = min(actions_lenths, default=0)
min_duration = min(step_durations, default=None)

freq = collections.Counter(actions_lenths)
word_len_freq = collections.Counter(steps_lenths)

# Histogram: step count -> number of videos
for key in sorted(freq):
    print("%s: %s" % (key, freq[key]))
print('\n')
# Histogram: words in a step -> number of steps
for key in sorted(word_len_freq):
    print("%s: %s" % (key, word_len_freq[key]))

print('# of Videos: ' + str(vid_count))
print('Max step count: ' + str(max_len))
print('Min step count: ' + str(min_len))
print('Min step duration: ' + str(min_duration))


In [None]:
# img = mpimg.imread('/h/mkhan/ece496-capstone/datasets/ycii_frames/fn9anlEL4FI/000069.jpg')

In [None]:
# imgplot = plt.imshow(img)

In [None]:
# Open one extracted frame of video fn9anlEL4FI; `img` is displayed by the plotting cell below.
# NOTE(review): `Image` is not imported in the notebook's first import cell —
# presumably PIL.Image brought in by another cell; confirm before running.
img = Image.open('/h/mkhan/ece496-capstone/datasets/ycii_frames/fn9anlEL4FI/000159.jpg')

In [None]:

# Show the sample frame with one hand-picked box drawn on top of it.
fig, ax = plt.subplots()
ax.imshow(img)

# Box anchored at (50, 100), 40 wide and 30 tall: red outline, no fill.
rect = patches.Rectangle(
    xy=(50, 100),
    width=40,
    height=30,
    linewidth=1,
    edgecolor='r',
    facecolor='none',
)
ax.add_patch(rect)

plt.show()

In [None]:
# Inspect the first element of the first entry of `boxes`.
# NOTE(review): `boxes` is not assigned at notebook scope in the visible
# cells; this relies on a value left in the kernel by an earlier run.
# print(boxes.size())
# print(boxes[0].size())
print(boxes[0][0])

In [None]:
# Load the 'database' section of the train/val annotations and show the
# entry for one sample video.
dataset_root = '/h/mkhan/ece496-capstone/datasets/'

annotations_path = os.path.join(dataset_root, 'annotations', 'ycii_annotations_trainval.json')
annotations = read_json(annotations_path)['database']
print(annotations['fn9anlEL4FI'])

In [None]:
# Load the 'database' section of the validation bounding-box annotations and
# list the top-level keys stored for one sample video.
dataset_root = '/h/mkhan/ece496-capstone/datasets'
bb_annotations_path = os.path.join(dataset_root, 'yc2_bb/annotations', 'yc2_bb_val_annotations.json')
annotations_bb = read_json(bb_annotations_path)['database']

sample_vid = annotations_bb['fn9anlEL4FI']
for key in sample_vid:
    print(key)


In [None]:
# Drill into segment '1' of the sample video: print its 'segment' entry, then
# every box recorded for the first object in that segment.
segment_key = '1'
sample_segment = sample_vid['segments'][segment_key]

print(sample_segment['segment'])

first_object = sample_segment['objects'][0]
for bbox in first_object['boxes']:
    print(bbox)


In [None]:
# NOTE: Do not execute the cells below, they are rough code for testing stuff out

In [None]:
# Print every action of every video, twice in a row. The two passes were
# originally written out separately around a disabled experiment that appended
# a trailing '.' to actions lacking one; with that experiment disabled both
# passes print the same thing.
for _pass in range(2):
    for actions in all_actions:
        for action in actions:
            print(action)


# for actions in all_actions:
#     for action in actions:
# #         print(action)
#         if not action.endswith('.'):
#             action += '.'

# print(all_actions)

In [None]:
# Ensure all actions terminate with periods (Note, make sure NULL action is not yet added to actions lists)
# all_actions = [[action + '.' if not action.endswith('.') else action for action in actions] for actions in all_actions]

In [None]:

idx = 0
first_step = all_actions[idx][0]
print(first_step)
words = first_step.split()
print(words)

# Hand-written index groups; only the first index of each group is used below.
indices = [[1, 2, 3], [9, 10]]

# Go through the groups back-to-front so that an insertion does not shift the
# positions that are still to be processed.
for ind in indices[::-1]:
    insert_at = ind[0]
    print(insert_at)
    words.insert(insert_at, '[unused3]')

# Write the marked-up sentence back into all_actions (re-running this cell
# therefore inserts the markers again).
all_actions[idx][0] = ' '.join(words)
print(all_actions[idx][0])

# sentence = 'add  garram masala seeds and a bay leaf to the oil.'
# print(sentence)
# words = sentence.split()
# print(words)



In [None]:
# Take the steps of the first video and build a copy with surrounding
# whitespace removed from each step, then print the cleaned steps one per line.
steps = all_actions[0]
steps1 = []
for step in steps:
    steps1.append(step.strip())

for s in steps1:
    print(s)


In [None]:
# Run the project parser over the (unstripped) `steps`; it returns two values,
# unpacked here as `entities` and `indices`.
# NOTE(review): presumably max_step_length limits how many words of each step
# are considered — confirm against parser.parse.
entities, indices = parse(steps, max_step_length=20)
# Add one extra group holding only the NULL token.
# NOTE(review): NULL is assigned in a later cell ('[unused1]'); that cell must
# have been run first or this line raises NameError.
entities.append([NULL])

In [None]:
# First the lengths of the three sequences, then their contents, in the
# order steps / entities / indices.
for seq in (steps, entities, indices):
    print(len(seq))

for seq in (steps, entities, indices):
    print(seq)

In [None]:
# Length of each entry of `entities` (this includes the appended NULL group
# if the cell that appends it has been run).
ent_len = list(map(len, entities))
print(ent_len)

In [None]:
# Hand-written sample steps with '[unused3]' markers already inserted.
# The 'Add ... oil' step appears twice on purpose, matching the original list.
steps1 = [
    'Grill [unused3] the tomatoes in [unused3] a pan.',
    'Add [unused3] oil into [unused3] the pan.',
    'Add [unused3] oil into [unused3] the pan.',
    'Cook [unused3] the bacon.',
    'Spread [unused3] some mayonnaise onto [unused3] the bread.',
    'Place [unused3] a piece of [unused3] lettuce onto [unused3] it.',
    'Place [unused3] the tomatoes over [unused3] it.',
    'Sprinkle [unused3] some salt and pepper onto [unused3] it.',
    'Place [unused3] the bacon at [unused3] the top.',
    'Place the [unused3] piece of bread at the top.',
]


In [None]:
# Special tokens used by the cells in this notebook:
#   NULL   - appended as an extra entity group after parsing
#   PAD    - passed to the tokenizer as its pad_token
#   ENTITY - the marker string inserted into step sentences
NULL, PAD, ENTITY = '[unused1]', '[unused2]', '[unused3]'
ACTION = '[SEP]'

# The tokenizer cell pads every step to MAX_STEP_LENGTH + 2 tokens.
MAX_STEP_LENGTH = 30

In [None]:
from transformers import LxmertModel, LxmertTokenizer

# Tokens registered as additional special tokens.
special_tokens = (NULL, PAD, ENTITY)

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased", pad_token=PAD)
tokenizer.add_special_tokens({"additional_special_tokens": list(special_tokens)})
# The result of this encode call is not stored or displayed.
tokenizer.encode(list(special_tokens), add_special_tokens=True)

# Tokenize every step to a fixed length and return PyTorch tensors; only the
# input ids are requested.
# NOTE(review): the "+ 2" presumably leaves room for the two special tokens
# the tokenizer adds around each sentence — confirm.
encode_options = dict(
    add_special_tokens=True,
    padding="max_length",
    max_length=MAX_STEP_LENGTH + 2,
    return_token_type_ids=False,
    return_attention_mask=False,
    return_tensors="pt",
)
tokens_steps1 = tokenizer(steps1, **encode_options)


In [None]:
# Show the padded token-id tensor produced by the tokenizer cell
# (presumably one row per entry of steps1 — confirm).
print(tokens_steps1['input_ids'])

In [None]:
# Collapse all step token ids into one 1-D sequence, drop every id equal to
# 101, decode the remainder to text and append the NULL token.
# NOTE(review): 101 is presumably the [CLS] id of this vocabulary — confirm.
flat_ids = tokens_steps1['input_ids'].flatten()
kept_ids = flat_ids[flat_ids != 101]
steps1_flat = tokenizer.decode(kept_ids) + ' ' + NULL
print(steps1_flat)

In [None]:
# Scratch check: a two-string list literal has type list.
asdf, qwer = 'hello', 'world'
print(type([asdf, qwer]))


In [None]:
# Print whatever steps1_flat currently holds: the decoded string if the decode
# cell ran last, or the filtered id tensor if the flatten-only cell ran last.
print(steps1_flat)

In [None]:
# Flatten the token ids to 1-D and print them, then print them again with
# every id equal to 101 removed. steps1_flat ends up holding the filtered ids.
# NOTE(review): 101 is presumably the [CLS] id of this vocabulary — confirm.
flat_ids = tokens_steps1['input_ids'].flatten()
print(flat_ids)
steps1_flat = flat_ids[flat_ids != 101]
print(steps1_flat)


In [None]:
# Load the 'database' section of the train/val annotation file into `annotations`
# (hard-coded absolute dataset path).
annotations = read_json(os.path.join('/h/mkhan/ece496-capstone/datasets', 'annotations', 'ycii_annotations_trainval.json'))['database']

In [None]:
# Pick the 'annotations' list of one sample video.
actions = annotations['fn9anlEL4FI']['annotations']

In [None]:
# Dump the sample video's annotation list selected in the previous cell.
print(actions)

In [None]:
# Preview the first three videos whose 'subset' is 'validation': print the
# video id, its duration, each annotated segment, and finally the end time of
# the last segment.
i = 0
for item in annotations:
    if annotations[item]['subset']=='validation':
        print(item)
        print(annotations[item]['duration'])
        segments = annotations[item]['annotations']
        # Reset for every video: without this, a video with no segments would
        # print the previous video's end time (or raise NameError on the
        # first video).
        end = None
        for segment in segments:
            print(segment)
            end = segment['segment'][1]

        # End time of the last segment, or None when the video has none.
        print(end)

        # Stop after three validation videos.
        i += 1
        if i==3:
            break