In [1]:
import os

import numpy as np
import torch

from torch import nn

In [2]:
from video import Video

# Directory that holds the sample video and its transcript. Defined once so the
# location only has to be changed in a single place.
SAMPLE_DIR = "/home/sagarpatel/Desktop/ece496-capstone/train/sample"

video_path = SAMPLE_DIR + "/video.mp4"
transcript_path = SAMPLE_DIR + "/transcript.vtt"

# Load the video together with its transcript, run the alignment, then generate
# the frames. NOTE(review): the exact semantics of `align` and of the
# `"sample"` / `swap=True` arguments live in video.py — confirm there.
v = Video(video_path, transcript_path)
v.align()
v.generate_frames("sample", swap=True)

In [3]:
import glob
import itertools

from model import Model
from loss import loss_RA_MIL
from detector import Detector
from parser import parse

# Instantiate the object detector once. The log output below shows it loading the
# pretrained `unc-nlp/frcnn-vg-finetuned` GeneralizedRCNN checkpoint from cache.
detector = Detector()

# Optional sanity check: uncomment to visualize the top detections on one frame.
#URL = "/home/sagarpatel/Desktop/ece496-capstone/train/sample/9.png"
#detector.inference(URL, max_detections=3, visualize=True)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/sagarpatel/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [4]:
def frame_sort_key(path):
    """Sort key that orders frame files by the numeric value of their file name.

    Files whose name (without extension) is a decimal number sort by that number;
    all other files sort after them, by name.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdecimal():
        return (0, int(stem), stem)
    return (1, 0, stem)


# A plain `sorted()` compares strings, which would place "10.png" before "2.png"
# and silently misalign frames with their steps. Sort by frame number instead.
images = sorted(
    glob.glob("/home/sagarpatel/Desktop/ece496-capstone/train/sample/*.png"),
    key=frame_sort_key,
)

# Run the detector on every frame, keeping at most 5 detections per frame.
candidates = [detector.inference(image, max_detections=5) for image in images]

In [5]:
# Maximum step length handed to `parse`; the same value is used below to compute
# the ordinal position of the dummy NULL step, so keep both in sync via this name.
MAX_STEP_LENGTH = 10

# Placeholder text used for the dummy step/entity that stands for NULL.
NULL_TOKEN = "unused0"

# Get all of the steps in the video (whitespace trimmed, periods removed).
steps = [step.text.strip().replace('.', '') for step in v.steps]

# Find all of the bounding boxes for the detections and their features.
# The per-frame arrays are stacked into one ndarray first: building a tensor
# directly from a Python list of ndarrays is very slow.
boxes = torch.from_numpy(
    np.stack([candidate[0].numpy() for candidate in candidates])
).squeeze(1)
features = torch.from_numpy(
    np.stack([candidate[1].numpy() for candidate in candidates])
).squeeze(1)

# Find all of the entities in each of the sentences and their ordinal positions.
entities, indices = parse(steps, max_step_length=MAX_STEP_LENGTH)

# We need to add a dummy step that can be referred to as NULL.
# The index must be computed before the dummy step is appended to `steps`.
null_index = len(steps) * MAX_STEP_LENGTH
entities.append([NULL_TOKEN])
indices.append([null_index])
steps.append(NULL_TOKEN)

In [6]:
def train(model, data, epochs=25, lr=0.001, batch_size=10, y=0.5):
    '''
    Training loop for the model.

    Args:
        model: module called as `model(steps, boxes, features, entities, indices)`
            and returning a 10-tuple; elements 1 and 2 are kept as RR and VG,
            elements 4, 5 and 6 are the V, E and R loss terms.
        data: iterable of `(boxes, features, entities, indices)` batches. It is
            iterated once per epoch.
        epochs: number of passes over `data`.
        lr: learning rate for the Adam optimizer.
        batch_size: currently unused; kept so existing callers keep working.
        y: first argument forwarded to `loss_RA_MIL`.

    Returns:
        `(RR, VG)` from the last forward pass, detached from the autograd graph,
        or `(None, None)` if no batch was processed.

    Note:
        `steps` is not a parameter; it is read from the enclosing (notebook)
        scope, so it must be defined before this function is called.
    '''

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    m_RR = None
    m_VG = None

    for epoch in range(epochs):
        for boxes, features, entities, indices in data:
            # Zero out any gradients.
            optimizer.zero_grad()

            # Run inference (forward pass).
            _, RR, VG, _, loss_V, loss_E, loss_R, _, _, _ = model(steps, boxes, features, entities, indices)

            # Loss from alignment.
            loss = loss_RA_MIL(y, loss_R, loss_E, loss_V)
            print(loss)

            # Backpropagation (backward pass).
            loss.backward()

            # Update parameters.
            optimizer.step()

            # Keep only the values: holding the raw outputs would keep the whole
            # autograd graph of the last step alive after training finishes.
            m_RR = RR.detach() if torch.is_tensor(RR) else RR
            m_VG = VG.detach() if torch.is_tensor(VG) else VG

        # TODO: save loss and accuracy at each epoch, plot (and checkpoint).
    return m_RR, m_VG

In [9]:
# Build the model with the same maximum step length (10) that was given to `parse`.
model = Model(max_step_length=10)
# A single batch holding the detections and parsed entities of the sample video.
data = [(boxes, features, entities, indices)]

# Train for 10 epochs; lr=0.1 overrides the function's default of 0.001.
# NOTE(review): the losses printed below swing widely from epoch to epoch
# (e.g. 52.9 -> 428.3), so a smaller learning rate may be worth trying — unverified.
RR, VG = train(model, data, epochs=10, lr=0.1)

tensor(52.8683, grad_fn=<SumBackward0>)
tensor(428.3448, grad_fn=<SumBackward0>)
tensor(79.1530, grad_fn=<SumBackward0>)
tensor(149.3543, grad_fn=<SumBackward0>)
tensor(165.3580, grad_fn=<SumBackward0>)
tensor(22.3414, grad_fn=<SumBackward0>)
tensor(23.1942, grad_fn=<SumBackward0>)
tensor(61.6232, grad_fn=<SumBackward0>)
tensor(22.9629, grad_fn=<SumBackward0>)
tensor(7.7691, grad_fn=<SumBackward0>)


In [12]:
# Inspect the parsed entities: one list per step, ending with the dummy NULL entry.
entities

[['the tomatoes', 'a pan'],
 ['oil', 'the pan'],
 ['the bacon'],
 ['some mayonnaise', 'the bread'],
 ['a piece', 'lettuce', 'it'],
 ['the tomatoes', 'it'],
 ['some salt', 'it'],
 ['the bacon', 'the top'],
 ['the piece', 'bread', 'the top'],
 ['unused0']]

In [13]:
# Inspect the cleaned step sentences; the last element is the dummy NULL step.
steps

['Grill the tomatoes in a pan',
 'Add oil into the pan',
 'Cook the bacon',
 'Spread some mayonnaise onto the bread',
 'Place a piece of lettuce onto it',
 'Place the tomatoes over it',
 'Sprinkle some salt and pepper onto it',
 'Place the bacon at the top',
 'Place the piece of bread at the top',
 'unused0']

In [21]:
# Collapse the per-step entity lists into one flat list, preserving step order.
entities_flat = list(itertools.chain.from_iterable(entities))

# For each row of RR take the column with the highest value; that column index
# is used below to look up the referenced step.
RR_edges = RR.argmax(dim=1)

# Print, for every entity, the step it is resolved to.
edge_template = '{} --> {}'
for name, target in zip(entities_flat, RR_edges):
    print(edge_template.format(name, steps[target]))

the tomatoes --> unused0
a pan --> unused0
oil --> unused0
the pan --> unused0
the bacon --> Add oil into the pan
some mayonnaise --> unused0
the bread --> unused0
a piece --> Cook the bacon
lettuce --> Cook the bacon
it --> Cook the bacon
the tomatoes --> Place a piece of lettuce onto it
it --> Place a piece of lettuce onto it
some salt --> unused0
it --> Cook the bacon
the bacon --> Sprinkle some salt and pepper onto it
the top --> Cook the bacon
the piece --> Sprinkle some salt and pepper onto it
bread --> unused0
the top --> Sprinkle some salt and pepper onto it
unused0 --> unused0


In [22]:
# Inspect VG as returned by the last training step.
# NOTE(review): presumably the index of the selected bounding box per
# (step, entity slot) — confirm against model.py.
VG

tensor([[3, 3, 0],
        [3, 2, 0],
        [3, 0, 0],
        [2, 2, 0],
        [1, 3, 0],
        [1, 4, 0],
        [0, 1, 0],
        [1, 1, 0],
        [4, 0, 0],
        [2, 0, 0]])

In [24]:
# Sanity-check the stacked detections: one entry per frame, 5 detections per
# frame (max_detections=5). NOTE(review): the last axis presumably holds the
# 4 box coordinates — confirm against detector.py.
boxes.shape

torch.Size([10, 5, 4])

$$P\left(d_{i j}=(l, k) \mid E, A, B, R\right)=\operatorname{sigmoid}\left(\psi\left(b_{l k}\right)^{T} \phi_{e}^{R}\left(e_{i j}\right)\right)$$

$$\phi_{e}^{R}\left(e_{i j}\right)=\operatorname{wordEmbd}\left(e_{i j}\right)+\phi_{a}^{R}\left(a_{o}\right)$$

$$
\begin{aligned}
\max _{D_{l}} P\left(D_{l} \mid \bar{G}_{l}, B_{l}\right) &>\max _{D_{l}} P\left(D_{l} \mid \bar{G}_{l}, B_{m}\right) \\
\max _{D_{l}} P\left(D_{l} \mid \bar{G}_{l}, B_{l}\right) &>\max _{D_{n}} P\left(D_{n} \mid \bar{G}_{n}, B_{l}\right)
\end{aligned}
$$

$$S_{l m}^{R}=\sum_{j} \max _{k}\left\langle\phi_{e}^{R}\left(e_{m j}\right), \psi_{b}\left(b_{l k}\right)\right\rangle$$

$$
\begin{aligned}
\mathcal{L}_{\text{RA-MIL}}=\sum_{l}[& \sum_{m} \gamma_{l m} \cdot \max \left(0, S_{l m}^{R}-S_{l l}^{R}+\Delta\right) \\
&\left.+\sum_{m} \gamma_{m l} \cdot \max \left(0, S_{m l}^{R}-S_{l l}^{R}+\Delta\right)\right]
\end{aligned}
$$

Loss function terminology:
    
        l, m: step indices
        j: entity index within a step
        k: bounding-box index within a step
        R: reference resolution edges

        a_m: action step m
        e_mj: j'th entity in step m
        b_lk: k'th bounding box in step l

        a(R, m, j): action referred to by the entity e_mj

        ψ(l, k):
            visual embedding of bounding box
            -----------------------
            ψ = VisualBERT_embedding(b_lk)

        φA(m):
            action embedding
            -----------------------
            φA = avg_j(VisualBERT_embedding(e_mj))

        φE(R, m, j):
            reference-aware entity embedding
            -----------------------
            φE = word_embedding(e_mj) + φA(a(R, m, j))

        γ(l, m):
            reference-based penalty
            -----------------------
            γ = 1     : if none of the entities in step m (a_m) have a reference to step l (a_l)
            0 < γ < 1 : if at least one entity in step m (a_m) has a reference to step l (a_l)

        score(R, m, j, l, k):
            alignment score between entity (e_mj) and bounding box (b_lk)
            -----------------------
            score = φE(R, m, j) · ψ(l, k)

        S(R, l, m):
            alignment score between steps (a_l and a_m)
            -----------------------
            S = sum_j(max_k(φE(R, m, j) · ψ(l, k)))

        Loss = sum_l
               (
                    sum_m [   γ(l, m) * max(0, S(R, l, m) - S(R, l, l) + Δ)   ]
                  + sum_m [   γ(m, l) * max(0, S(R, m, l) - S(R, l, l) + Δ)   ]
               )

        Δ: margin of the hinge terms