In [None]:
# Sample video is referred to on YouCookII dataset homepage (http://youcook2.eecs.umich.edu). Transcript taken from there and modified.
#
# Title: Super Quick BLT Sandwich | Breakfast Special | My Recipe Book By Tarika Singh.
# Link: https://youtu.be/4eWzsx1vAi8

In [None]:
# List ./graph, the directory the per-step frame PNGs are written into.
%ls ./graph

In [None]:
# Run the whole pipeline on the sample video and its WebVTT transcript.
# NOTE: `inference` (and everything it calls) is defined in cells further down,
# so those cells have to be executed before this one on a fresh kernel.
video = inference('./video.mp4', './transcript.vtt')

In [None]:
# Show the frame grid and print every step (predicate, entities, boxes).
display_video(video)

# Serialise the steps; this writes 'graph.json' in the working directory.
video.generate_json('graph')

# Render the graph from that file (output goes to 'visualizer-output/graph').
file_to_viz('graph.json')

In [None]:
# Pretty-print the generated graph.json for inspection.
!cat graph.json | python -m json.tool

In [None]:
# Bundle the ./graph directory (the saved frames) into graph.zip.
!zip -r ./graph.zip ./graph

In [None]:
def inference(video_path, transcript_path, frames_dir='graph'):
  """Run the full pipeline on one video and return the annotated Video.

  Stages, in order: load and align the video with its transcript, run
  reference resolution over the steps, save one frame per step, then attach
  bounding boxes to every step entity.

  Args:
    video_path: path of the video file.
    transcript_path: path of the transcript file.
    frames_dir: directory (relative to the working directory) the per-step
      frames are saved into. Defaults to 'graph', the previously fixed value.

  Returns:
    The Video whose steps carry PRED / DOBJ / PP and bounding boxes.
  """
  video = Video(video_path, transcript_path)
  video.align()

  # Fill in PRED / DOBJ / PP (with references) on every step.
  RR(video.steps).run()

  # Save the frames. This also sets step.path, which vg_inference reads,
  # so it has to happen before the grounding stage.
  video.generate_frames(frames_dir)

  # Set bounding boxes for the video entities.
  vg_inference(video)

  return video

In [None]:
import matplotlib.pyplot as plt

from torchvision.utils import make_grid  
  
def display_video(video):
  """Plot all aligned frames as one grid image and print each step."""
  # Tile every aligned frame into a single image tensor.
  frame_grid = make_grid(video.vframes_aligned)

  plt.figure(figsize=(15, 15))
  # Move the first axis last before plotting (presumably C,H,W -> H,W,C).
  channels_last = frame_grid.permute(1, 2, 0)
  plt.imshow(channels_last)

  # Textual summary, one block per step.
  for action in video.steps:
    print(action)

In [None]:
import webvtt
import json

from torchvision.io import read_video

def get_frame(start, end, vframes):
  """Return the frame at the temporal midpoint of [start, end].

  Args:
    start, end: both datetimes (time-of-day, as produced by strptime) or
      both timedeltas measured from the beginning of the video.
    vframes: indexable sequence of frames, indexed by whole seconds
      (assumes one frame per second, e.g. after Video.downsample).

  Returns:
    vframes[midpoint in whole seconds].
  """
  # The previous version added `datetime.today()` to `start`, which raises for
  # datetime inputs, and never halved the interval.
  middle = start + (end - start) / 2

  if hasattr(middle, 'total_seconds'):
    # timedelta inputs
    seconds = int(middle.total_seconds())
  else:
    # datetime inputs
    seconds = middle.hour * 3600 + middle.minute * 60 + middle.second

  return vframes[seconds]

class Video:
  """A video together with its transcript, split into Steps.

  Attributes:
    vframes: video frames; permuted to [T, C, H, W] in __init__ and reduced
      to roughly one frame per second by downsample().
    aframes, info: audio frames and metadata as returned by read_video.
    fps: frame rate rounded to the nearest integer (at least 1).
    captions: parsed transcript.
    steps: list of Step, one per caption (set by align()).
    vframes_aligned: representative frame of every step (set by align()).
  """

  def __init__(self, video_path, transcript_path):
    """Load the video and the transcript.

    Raises:
      Exception: if the video metadata does not contain a frame rate.
    """
    # Get all of information from the video.
    self.vframes, self.aframes, self.info = read_video(video_path, pts_unit='sec')

    # Get the FPS. Note that sometimes the file may not contain metadata.
    # Check *before* converting: int(None) would raise a TypeError and hide
    # the intended error message.
    fps = self.info.get('video_fps')

    if not fps:
      raise Exception('Video {} does not contain required metadata.'.format(video_path))

    # Round instead of truncating so that e.g. 29.97 fps strides by 30 and
    # the "index == second" assumption used by Step drifts as little as
    # possible. The stride must be at least 1.
    self.fps = max(1, int(round(fps)))

    # Change the axes from [T, H, W, C] -> [T, C, H, W].
    self.vframes = self.vframes.permute(0, 3, 1, 2)

    # Parse through the transcript.
    self.captions = webvtt.read(transcript_path)

    # We haven't aligned the frames yet.
    self.vframes_aligned = None

  def downsample(self):
    """Keep every `fps`-th frame, i.e. about one frame per second.

    Replaces self.vframes, so calling it again strides the already reduced
    frames a second time.
    """
    self.vframes = self.vframes[::self.fps]

  def align(self):
    """Downsample the frames and build one Step per caption."""
    self.downsample()
    self.steps = [Step(idx, caption, self.vframes) for idx, caption in enumerate(self.captions)]
    self.vframes_aligned = [step.vframe for step in self.steps]

  def generate_frames(self, path):
    """Save the representative frame of every step under directory `path`."""
    for step in self.steps:
      step.generate_frames(path)

  def generate_json(self, file):
    """Write the JSON description of all steps to '<file>.json'."""
    info = [step.generate_json() for step in self.steps]

    # Use a context manager so the file is flushed and closed before any
    # later cell reads it back.
    with open('{}.json'.format(file), 'w') as handle:
      json.dump(info, handle)

In [None]:
from datetime import datetime
from torchvision.utils import save_image

import os

class Step:
  """One transcript caption aligned with the frames shown while it is spoken.

  Attributes:
    idx: position of the step in the transcript.
    text: caption text.
    start, end: caption timestamps parsed with `time_format`.
    vframes: the frames that fall inside the caption interval.
    vframe: representative (middle) frame of `vframes`.
    PRED, DOBJ, PP: predicate and entity lists, filled in later by the model.
    path: absolute path of the saved frame, set by generate_frames().
  """

  time_format = '%H:%M:%S.%f'
  print_format = '%H:%M:%S'
  offset = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

  def __init__(self, idx, caption, vframes):
    """Build the step from a caption (needs .text, .start, .end) and the
    one-frame-per-second frame sequence of the whole video."""
    self.idx = idx
    self.text = caption.text
    self.start = datetime.strptime(caption.start, Step.time_format)
    self.end = datetime.strptime(caption.end, Step.time_format)

    self.set_frames(vframes)
    self.set_frame()

    # Set by the model.
    self.DOBJ = []
    self.PRED = None
    self.PP = []

    self.path = None

  def generate_frames(self, path):
    """Save the representative frame as '<path>/<idx>.png' and remember the
    absolute file path in self.path."""
    self.path = os.path.abspath('./{}/{}.png'.format(path, self.idx))
    # Make sure the target directory exists before writing into it.
    os.makedirs(os.path.dirname(self.path), exist_ok=True)
    save_image(self.vframe.float()/255, self.path)

  @staticmethod
  def get_seconds(time):
    """Whole seconds since midnight of a datetime (fractions are dropped)."""
    return time.hour * 3600 + time.minute * 60 + time.second

  def get_interval(self):
    """Return the inclusive (start, end) frame indices of the caption.

    Indices are the caption seconds shifted by one. They are clamped so that
    a caption starting within the first second does not produce -1, which
    would otherwise slice from the end of the video and yield no frames.
    """
    start_index = max(self.get_seconds(self.start) - 1, 0)
    end_index = max(self.get_seconds(self.end) - 1, start_index)
    return start_index, end_index

  def set_frames(self, vframes):
    """Store the slice of `vframes` covered by the caption interval."""
    start_index, end_index = self.get_interval()
    self.vframes = vframes[start_index:(end_index + 1)]

  def get_index(self):
    """Index of the middle frame within self.vframes."""
    index = int(len(self.vframes) / 2)
    return index

  def set_frame(self):
    """Pick the middle frame of the interval as the representative frame.

    Raises:
      IndexError: if the caption interval contains no frame, e.g. because
        the caption lies beyond the end of the video.
    """
    if len(self.vframes) == 0:
      raise IndexError('Step {} ({} -> {}) does not overlap any video frame.'.format(
          self.idx, self.start.strftime(Step.print_format), self.end.strftime(Step.print_format)))
    self.vframe = self.vframes[self.get_index()]

  def __str__(self):
    s = 'Action ID {}: {} ({} -> {})\n'.format(self.idx, self.text, self.start.strftime(Step.print_format), self.end.strftime(Step.print_format))
    s += 'Predicate: {}\n'.format(self.PRED)

    for DOBJ in self.DOBJ:
      s += 'DOBJ: {} ({}), BB: {}\n'.format(DOBJ.text, DOBJ.reference, DOBJ.bb)

    for PP in self.PP:
      s += 'PP: {} ({}), BB: {}\n'.format(PP.text, PP.reference, PP.bb)

    return s

  def generate_json(self):
    """Return the step as a JSON-serialisable dict.

    'entities', 'bboxes', 'ea' and 'eb' are parallel lists over DOBJ + PP:
    entity text, bounding box, referenced action id, and the index of the
    entity's bounding box (-1 when the entity has no box).
    """
    attr = dict()
    attr['annot'] = self.text
    attr['img'] = self.path
    attr['pred'] = self.PRED

    attr['entities'] = []
    attr['bboxes'] = []
    attr['ea'] = []
    attr['eb'] = []

    for idx, entity in enumerate(self.DOBJ + self.PP):
      attr['entities'].append(entity.text)
      attr['bboxes'].append(entity.bb)
      attr['ea'].append(entity.reference)
      attr['eb'].append(-1 if not entity.bb else idx)

    return attr

In [None]:
class Object:
  """An entity mentioned in a step (used for both DOBJ and PP entries)."""

  def __init__(self, step, text, bb=None, reference=-1):
    # Owning step and the entity's surface text.
    self.step, self.text = step, text
    # Referenced action id (-1 by default) and bounding box (None until set).
    self.reference, self.bb = reference, bb

## Reference Resolution

In [None]:
# Note that this is a temporary solution where we use the existing RR module to
# perform reference resolution. We simply copy the references, PRED, DOBJ and PP
# from the output. In the future, a separate class for the Parser should be made
# which only outputs the DOBJ, PP and PRED. This should be integrated into the
# design above (easy). The RR will be done by the BERT model and will no longer
# need the token-based approach from Neuralcoref.

from importlib import reload
from ref_res_model import ReferenceResolver

class RR:
  """Runs reference resolution over a list of steps and copies the result
  (predicate, direct objects, prepositional phrases) onto each step."""

  # One resolver instance shared by every RR object.
  rr = ReferenceResolver()

  def __init__(self, steps):
    self.steps = steps

  def run(self):
    """Resolve all step texts at once and annotate the steps in place."""
    texts = [step.text for step in self.steps]
    resolved_steps = RR.rr.parse_and_resolve_all_refs(texts)

    def as_objects(owner, parsed_entities):
      # Wrap every parsed entity, keeping its text and referenced action id.
      return [Object(owner, str(parsed.ent_text), reference = parsed.act_id_ref) for parsed in parsed_entities]

    for step, resolved_step in zip(self.steps, resolved_steps):
      step.PRED = resolved_step.pred
      step.DOBJ = as_objects(step, resolved_step.dobj_list)
      step.PP = as_objects(step, resolved_step.pp_list)

## Visual Grounding

### Installations


### Model


In [None]:
from transformers import BertTokenizer, BertModel

# Module-level tokenizer shared by MMFDemo.predict and MMFDemo.visual_ground.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
import yaml
import cv2
import torch
import requests
import numpy as np
import gc
import torch.nn.functional as F
import pandas as pd


import torchvision.models as models
import torchvision.transforms as transforms

from PIL import Image
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout
from io import BytesIO
from argparse import Namespace


from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict


from mmf.datasets.processors.processors import VocabProcessor, VQAAnswerProcessor
from mmf.models.pythia import Pythia
from mmf.common.registry import registry
from mmf.common.sample import Sample, SampleList
from mmf.utils.env import setup_imports
from mmf.utils.configuration import Configuration

setup_imports()

class MMFDemo:
  """Visual grounding helper built from a pretrained VisualBERT and a
  Detectron feature extractor.

  Both models are moved to CUDA on construction. The class relies on the
  module-level `tokenizer` and on `check_subword_in_word`.
  """

  TARGET_IMAGE_SIZE = [448, 448]
  CHANNEL_MEAN = [0.485, 0.456, 0.406]
  CHANNEL_STD = [0.229, 0.224, 0.225]

  def __init__(self):
    self._init_processors()
    self.visual_bert = registry.get_model_class(
        "visual_bert"
    ).from_pretrained(
        "visual_bert.pretrained.coco"
    )

    # Add this option so that it only output hidden states
    self.visual_bert.model.output_hidden_states = True

    self.visual_bert.model.to("cuda")
    self.visual_bert.model.eval()

    # Add this option so that losses are not pushed into output
    self.visual_bert.training_head_type = "finetuning"

    self.detection_model = self._build_detection_model()

  def _init_processors(self):
    """Build the configuration and register the text processor."""
    args = Namespace()
    args.opts = [
        "config=projects/pythia/configs/vqa2/defaults.yaml",
        "datasets=vqa2",
        "model=visual_bert",
        "evaluation.predict=True"
    ]
    args.config_override = None

    configuration = Configuration(args=args)

    config = self.config = configuration.config
    vqa2_config = config.dataset_config.vqa2
    text_processor_config = vqa2_config.processors.text_processor

    text_processor_config.params.vocab.vocab_file = "../model_data/vocabulary_100k.txt"

    # Add preprocessor as that will needed when we are getting questions from user
    self.text_processor = VocabProcessor(text_processor_config.params)

    registry.register("coco_text_processor", self.text_processor)

  def _multi_gpu_state_to_single(self, state_dict):
    """Strip the leading 'module.' from every key of `state_dict`.

    Raises:
      TypeError: if any key does not start with 'module.'.
    """
    new_sd = {}
    for k, v in state_dict.items():
      if not k.startswith('module.'):
        raise TypeError("Not a multiple GPU state of dict")
      k1 = k[7:]
      new_sd[k1] = v
    return new_sd

  def predict(self, url, text):
    """Run VisualBERT on one image (path or http URL) and one text.

    Returns:
      The model output for the single-sample batch; visual_ground() reads
      its 'sequence_output' entry.
    """
    with torch.no_grad():
      detectron_features = self.get_detectron_features(url)

      sample = Sample()

      processed_text = self.text_processor({"text": text})
      sample.text_len = len(processed_text["tokens"])

      encoded_input = tokenizer(text, return_tensors='pt')
      sample.input_ids = encoded_input.input_ids
      sample.input_mask = encoded_input.attention_mask
      sample.segment_ids = encoded_input.token_type_ids

      sample.image_feature_0 = detectron_features
      sample.image_info_0 = Sample({
          "max_features": torch.tensor(100, dtype=torch.long)
      })

      sample_list = SampleList([sample])
      sample_list = sample_list.to("cuda")

      output = self.visual_bert(sample_list)

    # Release cached GPU memory between calls.
    gc.collect()
    torch.cuda.empty_cache()

    return output

  def _build_detection_model(self):
    """Load the Detectron model and its weights, on CUDA, in eval mode."""
    cfg.merge_from_file('../model_data/detectron_model.yaml')
    cfg.freeze()

    model = build_detection_model(cfg)
    checkpoint = torch.load('../model_data/detectron_model.pth',
                            map_location=torch.device("cpu"))

    load_state_dict(model, checkpoint.pop("model"))

    model.to("cuda")
    model.eval()
    return model

  def get_actual_image(self, image_path):
    """Return something Image.open accepts: a raw HTTP stream for URLs,
    otherwise the path unchanged."""
    if image_path.startswith('http'):
      path = requests.get(image_path, stream=True).raw
    else:
      path = image_path

    return path

  def _image_transform(self, image_path):
    """Load an image and prepare it for the detection model.

    Returns:
      (img, im_scale): the channel-reversed, mean-subtracted, resized image
      as a [C, H, W] tensor, and the resize factor that was applied.
    """
    path = self.get_actual_image(image_path)

    # Force three channels: grayscale or RGBA inputs would otherwise break
    # the channel reversal / mean subtraction below.
    with Image.open(path) as img:
      im = np.array(img.convert('RGB')).astype(np.float32)
    im = im[:, :, ::-1]
    im -= np.array([102.9801, 115.9465, 122.7717])
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(800) / float(im_size_min)
    # Prevent the biggest axis from being more than max_size
    if np.round(im_scale * im_size_max) > 1333:
      im_scale = float(1333) / float(im_size_max)
    im = cv2.resize(
        im,
        None,
        None,
        fx=im_scale,
        fy=im_scale,
        interpolation=cv2.INTER_LINEAR
    )
    img = torch.from_numpy(im).permute(2, 0, 1)
    return img, im_scale

  def _top_box_indices(self, output, im_scales, max_boxes=100, nms_thresh=0.5):
    """Per image, return the indices of the `max_boxes` best proposals.

    A proposal's confidence is its highest softmax score over the classes
    (class 0 excluded) for which it survives NMS.
    """
    proposals = output[0]["proposals"]
    n_boxes_per_image = [len(_) for _ in proposals]
    score_list = output[0]["scores"].split(n_boxes_per_image)
    score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]

    keep_list = []
    for i, scores in enumerate(score_list):
      dets = proposals[i].bbox / im_scales[i]
      max_conf = torch.zeros((scores.shape[0])).to(scores.device)

      for cls_ind in range(1, scores.shape[1]):
        cls_scores = scores[:, cls_ind]
        keep = nms(dets, cls_scores, nms_thresh)
        max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep],
                                     cls_scores[keep],
                                     max_conf[keep])

      keep_list.append(torch.argsort(max_conf, descending=True)[:max_boxes])
    return keep_list

  def _process_feature_extraction(self, output,
                                  im_scales,
                                  feat_name='fc6',
                                  conf_thresh=0.2):
    """Return, per image, the `feat_name` features of the selected boxes.

    `conf_thresh` is accepted for compatibility but not used.
    """
    n_boxes_per_image = [len(_) for _ in output[0]["proposals"]]
    feats = output[0][feat_name].split(n_boxes_per_image)
    keep_list = self._top_box_indices(output, im_scales)
    return [feats[i][keep_boxes] for i, keep_boxes in enumerate(keep_list)]

  def masked_unk_softmax(self, x, dim, mask_idx):
    """Softmax over `dim` with column `mask_idx` zeroed, renormalised so
    that every row sums to one again."""
    x1 = F.softmax(x, dim=dim)
    x1[:, mask_idx] = 0
    x1_sum = torch.sum(x1, dim=1, keepdim=True)
    y = x1 / x1_sum
    return y

  def _run_detection(self, image_path):
    """Run the detection model on one image.

    Returns:
      (output, im, im_scale): raw model output, the transformed image
      tensor, and its resize factor.
    """
    im, im_scale = self._image_transform(image_path)
    current_img_list = to_image_list([im], size_divisible=32)
    current_img_list = current_img_list.to('cuda')
    with torch.no_grad():
      output = self.detection_model(current_img_list)
    return output, im, im_scale

  def get_detectron_features(self, image_path):
    """Features of the selected boxes of one image."""
    output, _, im_scale = self._run_detection(image_path)
    feat_list = self._process_feature_extraction(output, [im_scale],
                                                 'fc6', 0.2)
    return feat_list[0]

  def get_detectron_features_and_out(self, image_path):
    """Like get_detectron_features, but also returns the proposals and the
    transformed image tensor."""
    # This method used to be defined twice with identical bodies; the later
    # copy silently replaced the earlier one.
    output, im, im_scale = self._run_detection(image_path)
    feat_list = self._process_feature_extraction(output, [im_scale],
                                                 'fc6', 0.2)
    return feat_list[0], output[0]["proposals"], im

  def _process_bbox_extraction(self, output,
                               im_scales,
                               conf_thresh=0.2):
    """Return, per image, the coordinates of the selected boxes.

    The coordinates are those of the resized image (they are not divided by
    the resize factor). `conf_thresh` is accepted but not used.
    """
    keep_list = self._top_box_indices(output, im_scales)
    return [output[0]["proposals"][i].bbox[keep_boxes]
            for i, keep_boxes in enumerate(keep_list)]

  def detectron_get_bbox(self, image_path):
    """Coordinates of the selected boxes of one image, in the same order as
    the features from get_detectron_features."""
    output, _, im_scale = self._run_detection(image_path)
    bbox_list = self._process_bbox_extraction(output, [im_scale], 0.2)
    return bbox_list[0]

  def visual_ground(self, output, entity_list, bbox_list, full_sentence, step_sentence):
    """Pick one bounding box per entity.

    Args:
      output: model output of predict(full_sentence); its 'sequence_output'
        holds the text token embeddings followed by the box embeddings.
      entity_list: entity strings of the current step.
      bbox_list: box coordinates, one row per box embedding.
      full_sentence: the text that was given to predict().
      step_sentence: text of the current step; assumed to be the tail of
        `full_sentence`.

    Returns:
      List of box coordinate arrays, in the order of `entity_list`.
    """
    encoded_input = tokenizer(full_sentence, return_tensors='pt')
    step_encoded_input = tokenizer(step_sentence, return_tensors='pt')

    all_tokens = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0])
    step_tokens = tokenizer.convert_ids_to_tokens(step_encoded_input["input_ids"][0])

    # Find start of step_token list in all_tokens (the +1 skips the step's
    # own leading special token, which the full sentence does not repeat).
    start_step_ind = len(all_tokens) - len(step_tokens) + 1
    step_tokens = all_tokens[start_step_ind:]

    n_text = len(encoded_input["input_ids"][0])
    sequence = output['sequence_output'][0].cpu().detach().numpy()
    text_embed_list = sequence[:n_text]
    bb_embed_list = sequence[n_text:]

    # Embeddings of the tokens that belong to the current step.
    step_embeddings = text_embed_list[start_step_ind:start_step_ind + len(step_tokens)]

    bbox_for_entity = []

    # Form a meta-embedding for each entity by averaging the embeddings of
    # its sub-word tokens (an entity may span several tokens).
    for entity in entity_list:
      # The tokenizer in use is uncased, so also try the lower-cased entity;
      # otherwise a capitalised entity never matches any token.
      lowered = entity.lower()
      related_embeddings = [
          step_embeddings[i]
          for i, token in enumerate(step_tokens)
          if check_subword_in_word(token, entity) or check_subword_in_word(token, lowered)
      ]

      if related_embeddings:
        entity_embedding = np.mean(np.array(related_embeddings), axis=0)
      else:
        # No token matched: fall back to the mean embedding of the whole
        # step instead of failing on the mean of an empty list.
        entity_embedding = np.mean(step_embeddings, axis=0)

      # Score every box against the entity and keep the best one.
      scores = bb_embed_list @ entity_embedding
      bbox_ind = int(np.argmax(scores))

      bbox_for_entity.append(bbox_list[bbox_ind].cpu().detach().numpy())

    return bbox_for_entity




In [None]:
#pass subword (from BERT tokenizer) string, and word string
#true if subword came from word
def check_subword_in_word(subword, string):
    """Tell whether a tokenizer sub-word could have come from `string`.

    Args:
        subword: one token; a leading or trailing "##" marks a word piece.
        string: an entity, possibly consisting of several words.

    Returns:
        True if `subword` is one of the words, or if it is a "##" piece whose
        text (without the marker) occurs inside one of the words.
    """
    # entity may be string of multiple words
    words = string.split()

    # "##xyz": continuation piece, compare without the leading marker.
    continues_word = subword[:2] == "##" and any(subword[2:] in word for word in words)
    # "xyz##": compare without the trailing marker. The previous version
    # looked for the literal "##" inside the word, which never matches.
    starts_word = subword[-2:] == "##" and any(subword[:-2] in word for word in words)

    return continues_word or starts_word or (subword in words)

In [None]:
# Module-level instance that vg_inference uses; constructing it loads both
# models and moves them to CUDA.
demo = MMFDemo()

In [None]:
 def get_im_resize_factor(img_path, min_size=800, max_size=1333):
     """Return the factor by which an image would be resized for detection.

     The shorter side is scaled to `min_size`, unless that would push the
     longer side beyond `max_size`, in which case the longer side is scaled
     to `max_size` instead. The defaults are the values used by
     MMFDemo._image_transform.

     Args:
         img_path: path of the image file.
         min_size: target length of the shorter side.
         max_size: upper bound for the longer side.

     Returns:
         The scale factor as a float.
     """
     # Only the dimensions are needed, so read them from the header instead of
     # decoding the image into an array, and close the file afterwards.
     with Image.open(img_path) as img:
         width, height = img.size

     im_size_min = min(width, height)
     im_size_max = max(width, height)
     im_scale = float(min_size) / float(im_size_min)

     # Prevent the biggest axis from being more than max_size
     if np.round(im_scale * im_size_max) > max_size:
         im_scale = float(max_size) / float(im_size_max)

     return im_scale

In [None]:
#takes in video, and sets bb values for entities
def vg_inference(video):
    """Attach a bounding box to every DOBJ / PP entity of every step.

    Uses the module-level `demo`. Each step is grounded against the text of
    all steps up to and including itself. Boxes are scaled back by the resize
    factor of the first step's frame and stored on the entity as a dict with
    the keys 'left', 'top', 'bot' and 'right'.
    """
    steps = video.steps
    step_text = [step.text for step in steps]

    # assuming all of same size
    im_scale = get_im_resize_factor(steps[0].path)

    for step_num, step in enumerate(steps):
        # Context for this step: every caption so far, joined by spaces.
        full_sentence = " ".join(step_text[:step_num+1])

        # Direct objects first, then prepositional phrases.
        entities = list(step.DOBJ) + list(step.PP)
        entity_list = [entity.text for entity in entities]

        img_path = step.path
        output = demo.predict(img_path, full_sentence)
        bbox_list = demo.detectron_get_bbox(img_path)

        # list of bbox coordinates for scaled image
        entity_bb = demo.visual_ground(output = output,
                                       entity_list = entity_list,
                                       bbox_list = bbox_list,
                                       full_sentence = full_sentence,
                                       step_sentence = step.text)

        # Undo the detector's resize and snap to whole pixels.
        rescaled = [np.round(box/im_scale).astype(int) for box in entity_bb]

        # re-order and put in dict for each entity
        for count, entity in enumerate(entities):
            box = rescaled[count]
            entity.bb = {'left': int(box[0]), 'top': int(box[3]), 'bot': int(box[1]), 'right': int(box[2])}

    return

In [None]:
from graphviz import Graph, Digraph, nohtml
import imagesize
import json

def read_json(path='output.json'):
    """Read and parse the JSON file at `path`.

    Newlines are replaced by spaces before parsing.

    Returns:
        The parsed JSON value.

    Raises:
        AssertionError: with the message 'Invalid JSON' if the content cannot
            be parsed.
    """
    # The context manager closes the file even if reading fails.
    with open(path) as file:
        line = file.read().replace('\n', ' ')

    try:
        return json.loads(line)
    except ValueError as err:
        # Catch parse errors only (a bare `except` also swallowed
        # KeyboardInterrupt) and raise explicitly: `assert False` disappears
        # under `python -O`.
        raise AssertionError('Invalid JSON') from err

def file_to_viz(path='output.json'):
    """Load the JSON file at `path` and render it as a graph."""
    parsed_json = read_json(path)
    json_to_viz(parsed_json)

def insert_newlines(string, length=32, lower=True):
    """Break `string` into chunks of `length` characters joined by a literal
    backslash-n (the two-character escape needed with nohtml labels), and
    optionally lower-case the result."""
    chunks = [string[start:start+length] for start in range(0, len(string), length)]
    wrapped = '\\n'.join(chunks)    # use '\n' instead with html labels
    return wrapped.lower() if lower else wrapped

def get_scaled_center(img_center_x, img_center_y, img_width, img_height, img_scale, bb_left, bb_bot, bb_right, bb_top):
    """Return the pinned position string "x,y!" of a bounding box centre.

    The box centre (in image pixels) is scaled by `img_scale` and offset from
    the lower-left corner of the image, which is derived from the image's
    centre and its drawn width and height.
    """
    # Box centre in image pixels.
    box_mid_x = bb_left + (bb_right - bb_left) / 2
    box_mid_y = bb_bot + (bb_top - bb_bot) / 2

    # Same centre in graph coordinates.
    graph_x = img_center_x - img_width/2 + img_scale * box_mid_x
    graph_y = img_center_y - img_height/2 + img_scale * box_mid_y

    return "{},{}!".format(str(graph_x), str(graph_y))

# Required

def json_to_viz(parsed_json):
    """Render the parsed step list as a graph and open the result.

    Every step becomes a row holding its frame image on the left and a record
    node (predicate plus entities) on the right. Edges connect entities to
    the step they refer to ('ea') and to their bounding box ('eb'). The graph
    is rendered to 'visualizer-output/graph'.

    Args:
        parsed_json: list of step dicts with the keys 'pred', 'entities',
            'bboxes', 'ea', 'eb' and 'img'. A bounding box may be None for an
            entity without a box; such boxes are skipped.
    """
    # graph dimensions (adjust these values)
    g_left = 0                  # left boundary of page
    g_horizontal_gap = 1
    g_vertical_gap = 1

    g_txt_width = 5
    g_txt_height = 0.5
    g_txt_border = 0.125        # padding around each side

    g_img_width = 4
    g_img_height = 3

    g_inner_txt_boundary_color = 'gray'     # 'invis' would be needed with html labels
    g_outer_txt_boundary_color = 'dimgray'
    g_txt_font_color = 'midnightblue'

    g_bb_color = 'limegreen'

    g_edge_to_bb_color = 'limegreen'
    g_edge_to_step_color = 'darkorange'

    # resulting attributes
    g_item_height = g_vertical_gap + max(g_txt_height+2*g_txt_border, g_img_height)

    g_txt_center_x = g_left + g_img_width + g_horizontal_gap + g_txt_border + g_txt_width/2
    g_img_center_x = g_left + g_img_width/2

    g_outer_txt_width = g_txt_width + 2*g_txt_border
    g_outer_txt_height = g_txt_height + 2*g_txt_border

    # create graph
    g = Digraph('text_core', engine='neato')

    # set graph and node attributes
    g.graph_attr['splines'] = 'curved'

    g.node_attr['shape']='rect'
    g.node_attr['fixedsize']='true'
    g.node_attr['fontsize'] = '12'
    g.node_attr['fontcolor'] = str(g_txt_font_color)
    g.node_attr['imagescale'] = 'true'

    step_count = len(parsed_json)

    # draw nodes for images and texts
    for i in range(step_count):
        step = parsed_json[i]

        # Record label: field f0 is the predicate, f1..fn are the entities.
        # A step without a predicate gets an empty first field.
        text = "<f0>" + insert_newlines(step['pred'] or '')
        for j, entity in enumerate(step['entities']):
            text += "|<f" + str(j+1) + ">"
            text += insert_newlines(entity)

        txt_pos = str(g_txt_center_x)+","+str(i*g_item_height)+"!"
        img_pos = str(g_img_center_x)+","+str(i*g_item_height)+"!"

        # draw text node (declared once; nohtml keeps the label from being
        # interpreted as an HTML-like label)
        g.node("text_"+str(i),nohtml(text),shape='Mrecord',pos=txt_pos,width=str(g_txt_width),height=str(g_txt_height),color=str(g_inner_txt_boundary_color))

        # draw the slightly larger shell around the text node
        g.node("text_shell_"+str(i),'',pos=txt_pos,width=str(g_outer_txt_width),height=str(g_outer_txt_height),color=str(g_outer_txt_boundary_color))

        # draw image node
        g.node("img_"+str(i),pos=img_pos,width=str(g_img_width),height=str(g_img_height),label='',image=step['img'],color='invis')

        # draw text-image pairing edge (needed for connected graph to manage overlapping)
        g.edge("text_"+str(i), "img_"+str(i), color='invis')

    # draw edges from entity to action step
    for i in range(step_count):
        for j, action_id_ref in enumerate(parsed_json[i]['ea']):
            if (action_id_ref != -1):
                g.edge("text_"+str(i)+":f"+str(j+1)+":s","text_shell_"+str(action_id_ref)+":n",color=str(g_edge_to_step_color))

    # draw bounding boxes
    for i in range(step_count):
        width, height = imagesize.get(parsed_json[i]['img'])
        w_ratio = g_img_width/width
        h_ratio = g_img_height/height
        min_ratio = min(w_ratio, h_ratio)

        # drawn size of the image inside its node (aspect ratio preserved)
        is_width_bounded = (w_ratio < h_ratio)      # bounded due to width
        w=h_ratio*width if not is_width_bounded else g_img_width
        h=w_ratio*height if is_width_bounded else g_img_height

        for j, bb in enumerate(parsed_json[i]['bboxes']):
            if not bb:
                # Entity without a box (its 'eb' entry is -1): nothing to draw.
                continue
            g.node("bb_"+str(i)+"_"+str(j),label='',pos=get_scaled_center(g_img_center_x,i*g_item_height,w,h,min_ratio,bb['left'],bb['bot'],bb['right'],bb['top']),width=str((bb['right']-bb['left'])*min_ratio),height=str((bb['top']-bb['bot'])*min_ratio),color=str(g_bb_color))

    # draw edges from entity to bounding boxes
    for i in range(step_count):
        # first half of the steps leaves from the south port, the rest from north
        edge_orientation = ":s" if (i < step_count/2) else ":n"
        for j, bb_id_ref in enumerate(parsed_json[i]['eb']):
            if (bb_id_ref != -1):
                g.edge("text_"+str(i)+":f"+str(j+1)+edge_orientation,"bb_"+str(i)+"_"+str(bb_id_ref),color=str(g_edge_to_bb_color))

    g.render('visualizer-output/graph', view=True)

BACKUP

In [None]:
# Backup: the body of vg_inference() run inline on the global `video`.
# Unlike the function, this leaves its loop variables (entity_bb, entity_list,
# step_num, ...) in the notebook namespace; after the loop they hold the
# values of the last step.
step_text = [step.text for step in video.steps]

#assuming all of same size
im_scale = get_im_resize_factor(video.steps[0].path)

for step_num in range(len(video.steps)):
    full_sentence = " ".join(step_text[:step_num+1])
    sentence = video.steps[step_num].text

    #construct entity list for step
    entity_list = []

    #add dobjs
    entity_list.extend([video.steps[step_num].DOBJ[i].text for i in range(len(video.steps[step_num].DOBJ))])
    #add pps
    entity_list.extend([video.steps[step_num].PP[i].text for i in range(len(video.steps[step_num].PP))])


    img_path = video.steps[step_num].path
    output = demo.predict(img_path, full_sentence)

    bbox_list = demo.detectron_get_bbox(img_path)

    #list of bbox coordinates for scaled image
    entity_bb = demo.visual_ground(output = output, 
                              entity_list = entity_list, 
                              bbox_list = bbox_list, 
                              full_sentence = full_sentence, 
                              step_sentence = sentence)
    
    for bb_ind in range(len(entity_bb)):
        entity_bb[bb_ind] = np.round(entity_bb[bb_ind]/im_scale).astype(int)


    #set bbox coordinates for all entities
    count = 0
    #re-order and put in dict for step
    for dobj in video.steps[step_num].DOBJ:
        dobj.bb = {'left': int(entity_bb[count][0]), 'top': int(entity_bb[count][3]), 'bot': int(entity_bb[count][1]), 'right': int(entity_bb[count][2])}
        count += 1

    for pp in video.steps[step_num].PP:
        pp.bb = {'left': int(entity_bb[count][0]), 'top': int(entity_bb[count][3]), 'bot': int(entity_bb[count][1]), 'right': int(entity_bb[count][2])}
        count+=1

In [None]:
# Rescaled boxes of the last step processed by the backup loop.
entity_bb

In [None]:
    # Rebuild the entity list (DOBJ texts followed by PP texts) of the last step.
    step_num = -1
    #construct entity list for step
    entity_list = []

    #add dobjs
    entity_list.extend([video.steps[step_num].DOBJ[i].text for i in range(len(video.steps[step_num].DOBJ))])
    #add pps
    entity_list.extend([video.steps[step_num].PP[i].text for i in range(len(video.steps[step_num].PP))])

In [None]:
# Entity texts of the last step, direct objects first.
entity_list

In [None]:
# The same list from its third entry onward.
entity_list[2:]