In [1]:
import numpy as np
import torch

from torch import nn

In [2]:
from video import Video

video_path = "/home/sagarpatel/Desktop/ece496-capstone/train/sample/video.mp4"
transcript_path = "/home/sagarpatel/Desktop/ece496-capstone/train/sample/transcript.vtt"
transcript2_path = "/home/sagarpatel/Desktop/ece496-capstone/train/sample/transcript2.vtt"

v = Video(video_path, transcript_path)
v.align()
v.generate_frames("sample", swap=True)

v2 = Video(video_path, transcript2_path)
v2.align()
v2.generate_frames("sample", swap=True)

In [3]:
import glob
import itertools

from loss import loss_RA_MIL
from detector import Detector
from parser import parse

detector = Detector()

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/sagarpatel/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [4]:
images = sorted(glob.glob("/home/sagarpatel/Desktop/ece496-capstone/train/sample/*.png"))
candidates = [detector.inference(image, max_detections=5) for image in images]

In [5]:
NULL = '[unused1]'
PAD = '[unused2]'
ENTITY = '[unused3]'
ACTION = '[SEP]'

MAX_STEP_LENGTH = 30

In [6]:
steps1 = [step.text.strip() for step in v.steps]
steps2 = [step.text.strip() for step in v2.steps]

In [7]:
from transformers import LxmertModel, LxmertTokenizer

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased", pad_token=PAD)
tokenizer.add_special_tokens({"additional_special_tokens": [NULL, PAD, ENTITY]})
tokenizer.encode([NULL, PAD, ENTITY], add_special_tokens=True)

tokens_steps1 = tokenizer(
                    steps1,
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    add_special_tokens=True,
                    padding="max_length",
                    max_length=MAX_STEP_LENGTH + 2,
                    return_tensors="pt"
                )

tokens_steps2 = tokenizer(
                    steps2,
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    add_special_tokens=True,
                    padding="max_length",
                    max_length=MAX_STEP_LENGTH + 2,
                    return_tensors="pt"
                )

In [8]:
steps1_flat = tokens_steps1['input_ids'].flatten()
steps1_flat = steps1_flat[steps1_flat != 101]
steps1_flat = tokenizer.decode(steps1_flat) + ' ' + NULL

In [9]:
steps2_flat = tokens_steps2['input_ids'].flatten()
steps2_flat = steps2_flat[steps2_flat != 101]
steps2_flat = tokenizer.decode(steps2_flat) + ' ' + NULL

In [10]:
# Find all of the bounding boxes for the detections and their features.
boxes = torch.tensor([candidate[0].numpy() for candidate in candidates]).squeeze(1)
features = torch.tensor([candidate[1].numpy() for candidate in candidates]).squeeze(1)

boxes = boxes.flatten(start_dim=0, end_dim=1)
features = features.flatten(start_dim=0, end_dim=1)

boxes = torch.stack((boxes, boxes))
features = torch.stack((features, features))

In [12]:
model = Model()

In [13]:
steps = [steps1_flat, steps2_flat]

ENTITIES_COUNT = [
    [2, 2, 1, 2, 3, 2, 2, 2, 1, 1],
    [1, 1, 1, 1, 3, 2, 1, 2, 1, 1]
]

In [21]:
# https://github.com/google-research/bert/issues/635
# https://colab.research.google.com/drive/18TyuMfZYlgQ_nXo-tr8LCnzUaoX0KS-h?usp=sharing#scrollTo=W4cZIVrg82ua.

import itertools
import torch
import einops

from transformers import LxmertModel, LxmertTokenizer
from torch import nn
from torch.nn.utils.rnn import pad_sequence

class Model(nn.Module):
    NULL = '[unused1]'
    PAD = '[unused2]'
    ENTITY = '[unused3]'
    ACTION = '[SEP]'
    
    DETECTION_EMBEDDING_SIZE = 2048
    OUTPUT_EMBEDDING_SIZE = 768
        
    def __init__(self):
        super(Model, self).__init__()
        
        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.lxmert_tokenizer.add_special_tokens({"additional_special_tokens": [NULL, PAD, ENTITY]})
        self.lxmert_tokenizer.encode([NULL, PAD, ENTITY])
        
        self.NULL_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(NULL)
        self.ENTITY_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(ENTITY)
        self.ACTION_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(ACTION)
        
        self.lxmert = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
        
    def forward(self, MAX_INSTRUCTION_LENGTH, BATCH_SIZE, NUM_ACTIONS, CANDIDATES, steps, features, boxes, entity_count):
        '''
            MAX_INSTRUCTION_LENGTH
                : maximum number of words in a combined string of steps for a video,
                  this instruction string must contain the NULL step.
                  
                  ex. | E ... E . [SEP] [PAD] [PAD] | .... [SEP] [PAD] | NULL | 
                  
            steps
                : batched video steps of size (BATCH_SIZE, NUM_ACTIONS), list of lists,
                  each instructional video must have the same number of steps = NUM_ACTIONS.
                
                  ex. [['step 1.1', 'step 1.2'], ['step 2.1', 'step 2.2']]
                
            features
                : batched detection features of size (BATCH_SIZE, CANDIDATES * NUM_ACTIONS, 2048)
                
            boxes
                : batched bounding boxes of detection features of size (BATCH_SIZE, CANDIDATES * NUM_ACTIONS, 4)
                
            entity_count:
                : number of entities per action, size (BATCH_SIZE, NUM_ACTIONS), list of lists
        '''

        inputs = self.lxmert_tokenizer(
            steps,
            padding="max_length",
            max_length= MAX_INSTRUCTION_LENGTH + 2, # [CLS] and [SEP] token
            truncation=False,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        output = self.lxmert(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            visual_feats=features,
            visual_pos=boxes,
            token_type_ids=inputs.token_type_ids,
            return_dict=True,
            output_attentions=True
        )
        
        token_ids = inputs.input_ids
        
        entity_idx = ((token_ids == self.ENTITY_TOKEN) | (token_ids == self.NULL_TOKEN))
        action_idx = (token_ids == self.ACTION_TOKEN)
        
        entity_embeddings = output['language_output'][entity_idx]
        action_embeddings = output['language_output'][action_idx]
        vision_embeddings = output['vision_output']
        
        split_sizes = torch.tensor(entity_count).flatten().tolist()
        entities = entity_embeddings.split(split_sizes)
        
        E = pad_sequence(entities, batch_first=True)
        max_entities = E.shape[1]
        E = E.reshape(-1, NUM_ACTIONS, E.shape[1], E.shape[2])

        A = action_embeddings.reshape(BATCH_SIZE, NUM_ACTIONS, -1)
        V = vision_embeddings.reshape(BATCH_SIZE, NUM_ACTIONS, CANDIDATES, -1)
        
        # Calculate loss_E.
        loss_E = E
        
        # Calculate VG (VG_scores_index) and loss_V.
        VG_scores = torch.einsum('bacs, baes -> baec', V, E)
        VG_scores_max, VG_scores_index = VG_scores.max(dim=-1)

        V_flat = V.reshape(-1, self.OUTPUT_EMBEDDING_SIZE)

        VG_scores_index_flat = VG_scores_index.flatten()
        offsets = torch.arange(0, BATCH_SIZE * NUM_ACTIONS * CANDIDATES, CANDIDATES)
        offsets = offsets.repeat_interleave(max_entities)

        VG_scores_index_flat = VG_scores_index_flat + offsets

        loss_V = V_flat[VG_scores_index_flat, :].reshape(BATCH_SIZE, NUM_ACTIONS, max_entities, -1).shape
        
        # Calculate RR (RR_scores_index).
        RR_scores = torch.einsum('baes, bcs -> baec', E, A)

        edge_mask = torch.ones(NUM_ACTIONS, NUM_ACTIONS).tril(diagonal=-1)
        edge_mask[-1, :] = 0
        edge_mask[:, -1] = 1
        edge_mask = einops.repeat(edge_mask, 'x y -> b x c y', b=BATCH_SIZE, c=max_entities)

        RR_scores_max, RR_scores_index = (RR_scores * edge_mask).max(dim=-1)
        
        # Calculate loss_R.
        loss_R = np.ones((BATCH_SIZE, NUM_ACTIONS, NUM_ACTIONS))

        dim_1 = RR_scores_index.reshape(BATCH_SIZE, -1)
        dim_2 = torch.arange(NUM_ACTIONS).repeat_interleave(max_entities)
        dim_2 = einops.repeat(dim_2, 'd -> b d', b=BATCH_SIZE)

        loss_R[:, dim_1, dim_2] = 0
        
        return loss_E, loss_V, loss_R

In [22]:
def train(model, epochs=25, lr=0.001, batch_size=10, y=0.5):
    '''
    Training loop for the model.
    '''
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # Ouput losses.
    train_loss = np.zeros(epochs)
    
    # Output accuracies.
    train_accuracy = np.zeros(epochs)
    
    m_RR = None
    m_VG = None
    
    model.train()
    
    for epoch in range(epochs):
        # Zero out any gradients.
        optimizer.zero_grad()

        # Run inference (forward pass).
        loss_E, loss_V, loss_R = model(265, 2, 10, 5, steps, features, boxes, ENTITIES_COUNT)

        # Loss from alignment.
        loss = loss_RA_MIL(y, loss_R, loss_E, loss_V)
        print(loss)

        # Backpropagation (backward pass).
        loss.backward()

        # Update parameters.
        optimizer.step()

        m_RR = RR
        m_VG = VG
            
        # TODO: save loss and accuracy at each epoch, plot (and checkpoint).
        
    return m_RR, m_VG

In [23]:
model = Model()
RR, VG = train(model, epochs=50, lr=0.001)

TypeError: expected Tensor as element 1 in argument 1, but got torch.Size