In [1]:
import itertools
import torch
import einops
import torch.nn.functional as F

from transformers import LxmertModel, LxmertTokenizer
from transformers import LxmertForPreTraining
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from loss import loss_RA_MIL

import numpy as np
import torch
import matplotlib.pyplot as plt

from dataset import YouCookII
from dataset import YouCookIICollate
from torch.utils.data import DataLoader
from loss import loss_RA_MIL
from transformers import get_linear_schedule_with_warmup
from model import Model


PyTorch version 1.6.0 available.


In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [3]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
LXMERT_CHECKPOINT = "unc-nlp/lxmert-base-uncased"
lxmert_tokenizer = LxmertTokenizer.from_pretrained(LXMERT_CHECKPOINT)

# Module.to() returns the module itself, so loading and the device move can be chained.
lxmert = LxmertModel.from_pretrained(LXMERT_CHECKPOINT).to(device)

# Switch to evaluation mode; eval() returns the module, whose repr is this cell's output.
lxmert.eval()

LxmertModel(
  (embeddings): LxmertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): LxmertEncoder(
    (visn_fc): LxmertVisualFeatureEncoder(
      (visn_fc): Linear(in_features=2048, out_features=768, bias=True)
      (visn_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (box_fc): Linear(in_features=4, out_features=768, bias=True)
      (box_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layer): ModuleList(
      (0): LxmertLayer(
        (attention): LxmertSelfAttentionLayer(
          (self): LxmertAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear

In [4]:
from dataset import depickle_data

# Sizes used to flatten the pre-extracted detections below.
NUM_FRAMES_PER_STEP = 5
DETECTION_EMBEDDING_SIZE = 2048
BOUNDING_BOX_SIZE = 4

# Dataset roots.
# NOTE(review): absolute, user-specific paths; consider making them configurable.
YCII = "/h/sagar/ece496-capstone/datasets/ycii"
FI = "/h/sagar/ece496-capstone/datasets/fi"

# Select one video directory: <dataset root>/<num_actions>/<index zero-padded to 5 digits>.
num_actions = 10
index = 2
root = "{}/{}/{}".format(FI, num_actions, str(index).zfill(5))

pickle_root = "{}/pickles".format(root)
frames_root = "{}/frames".format(root)

# Load every pickled artefact for this video, in the same order as before.
frame_paths, entities, actions, candidates, steps, entity_count = (
    depickle_data(pickle_root, pickle_name)
    for pickle_name in (
        'frame_paths',
        'entities',
        'actions_list',
        'candidates',
        'steps',
        'entity_count',
    )
)
actions.append("[NULL]")

# Transpose `candidates` once: column 0 is stacked into boxes, column 1 into features.
candidate_columns = list(zip(*candidates))
bboxes = torch.stack(candidate_columns[0]).squeeze(1).reshape(-1, BOUNDING_BOX_SIZE)
features = torch.stack(candidate_columns[1]).squeeze(1).reshape(-1, DETECTION_EMBEDDING_SIZE)

In [5]:
steps

'crush and chop [unused2] the garlic. [unused3] Add oil, garlic, and salt to [unused2] a bowl. [unused3] mix [unused2] the tomoatos with [unused2] the oil mixture. [unused3] Chop [unused2] the basal. [unused3] Spread [unused2] the oil mixture onto [unused2] the dough. [unused3] place provolone cheese and mozzerella cheese onto [unused2] the dough. [unused3] Add [unused2] the basal to [unused2] the pizza. [unused3] place [unused2] the tomatos on [unused2] the pizza. [unused3] sprinkle [unused2] cheese onto [unused2] the pizza. [unused3] Bake [unused2] the pizza in [unused2] an oven. [unused3] [unused3]'

In [6]:
from model import *

In [7]:


# Give every per-video input a leading batch dimension of size 1.
# Caution: this cell is not idempotent -- re-running it wraps/unsqueezes
# the already-batched values a second time.
steps, entity_count, entities = [steps], [entity_count], [entities]

# Indexing with None inserts a new axis at position 0 (same as unsqueeze(0)).
bboxes = bboxes[None]
features = features[None]


In [8]:
steps

['crush and chop [unused2] the garlic. [unused3] Add oil, garlic, and salt to [unused2] a bowl. [unused3] mix [unused2] the tomoatos with [unused2] the oil mixture. [unused3] Chop [unused2] the basal. [unused3] Spread [unused2] the oil mixture onto [unused2] the dough. [unused3] place provolone cheese and mozzerella cheese onto [unused2] the dough. [unused3] Add [unused2] the basal to [unused2] the pizza. [unused3] place [unused2] the tomatos on [unused2] the pizza. [unused3] sprinkle [unused2] cheese onto [unused2] the pizza. [unused3] Bake [unused2] the pizza in [unused2] an oven. [unused3] [unused3]']

In [9]:
# Not referenced by any other visible cell; kept in case later cells rely on it.
CANDIDATES = 20 * 5

# Move the visual inputs to the same device as the model.
features = features.to(device)
boxes = bboxes.to(device)

# Remove the [unused2] markers from the step text before tokenizing.
steps = remove_unused2(steps)

inputs = lxmert_tokenizer(
    steps,
    padding="longest",
    truncation=False,
    return_token_type_ids=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
)

# Move all tokenizer tensors at once. Assigning `inputs.input_ids = ...` only
# creates instance attributes that shadow the underlying data dict, leaving
# `inputs["input_ids"]` (and `inputs.data`) on the CPU.
inputs = inputs.to(device)

# Single forward pass through LXMERT; attentions are requested alongside the
# dict-style output.
output = lxmert(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    visual_feats=features,
    visual_pos=boxes,
    token_type_ids=inputs.token_type_ids,
    return_dict=True,
    output_attentions=True
)

In [10]:
# The product of these three values is the output width of the grounding head
# built below (10 * 5 * 20 = 1000).
# NUM_FRAMES_PER_STEP repeats the value already set in the data-loading cell.
NUM_STEPS = 10
NUM_FRAMES_PER_STEP = 5
NUM_CANDIDATES_PER_FRAME = 20


In [39]:
## Linear layer

class LxmertVGHead(nn.Module):
    """Small MLP head that turns hidden states into ``num_labels`` logits.

    Layout: Linear(hidden -> 2 * hidden) -> GELU -> LayerNorm(eps=1e-12)
    -> Linear(2 * hidden -> num_labels).

    Args:
        config: object exposing ``hidden_size`` (the notebook passes ``lxmert.config``).
        num_labels: width of the final projection.
    """

    def __init__(self, config, num_labels):
        super().__init__()
        width = config.hidden_size
        expanded = width * 2
        # Attribute name and layer order are kept so that state_dict keys
        # remain "logit_fc.<index>.*".
        stack = [
            nn.Linear(width, expanded),
            nn.GELU(),
            nn.LayerNorm(expanded, eps=1e-12),
            nn.Linear(expanded, num_labels),
        ]
        self.logit_fc = nn.Sequential(*stack)

    def forward(self, hidden_states):
        """Run the head over the last dimension of ``hidden_states`` and return the logits."""
        logits = self.logit_fc(hidden_states)
        return logits

In [38]:
# Output width of the head: NUM_STEPS * NUM_FRAMES_PER_STEP * NUM_CANDIDATES_PER_FRAME.
num_vg_labels = NUM_STEPS * NUM_FRAMES_PER_STEP * NUM_CANDIDATES_PER_FRAME
lin_layer = LxmertVGHead(lxmert.config, num_vg_labels)

In [40]:
# Move the head's parameters to `device`; the returned module's repr is the cell output.
lin_layer.to(device)

LxmertVGHead_2(
  (logit_fc): Sequential(
    (0): Linear(in_features=768, out_features=1536, bias=True)
    (1): GELU()
    (2): LayerNorm((1536,), eps=1e-12, elementwise_affine=True)
    (3): Linear(in_features=1536, out_features=1000, bias=True)
  )
)

In [17]:
# Instantiate the project Model on `device`.
# NOTE(review): MAX_DETECTIONS=20 equals NUM_CANDIDATES_PER_FRAME defined above --
# presumably the same quantity; confirm and reuse the constant.
model = Model(device, MAX_DETECTIONS=20)

In [18]:
# NOTE(review): get_ent_inds is not imported by name in this notebook; presumably it
# comes from `from model import *` -- confirm.
# The result is indexed per batch item in the next cell (entity_idx[0]).
entity_idx = get_ent_inds(model, entities, steps)

In [19]:
# Stand-in for the entity index: keep only the last element of each entry in entity_idx[0].
entity_end_ind = [span[-1] for span in entity_idx[0]]

In [20]:
entity_end_ind

[5, 17, 24, 28, 33, 39, 42, 58, 63, 66, 72, 75, 81, 84, 90, 93]

In [23]:
len(entity_end_ind)

16

In [47]:
# Select the language-output vectors at the entity positions (second axis),
# then score them with the linear head.
entity_hidden_states = output["language_output"][:, entity_end_ind, :]
vg_ent_scores = lin_layer(entity_hidden_states)

In [49]:
# Highest-scoring label index for each selected entity position (argmax over the last axis).
# Uses `vg_ent_scores` from the previous cell; `vg_ent_scores_2` is never defined in this
# notebook and would raise NameError on a fresh kernel.
vg_ent_scores.argmax(dim=2)

tensor([[837, 743, 489, 697, 901, 374, 489, 489, 451, 840, 606, 575, 268, 743,
         220, 152]], device='cuda:0')