In [None]:
from dataset import YouCookII
from dataset import YouCookIICollate
from torch.utils.data import DataLoader
from loss import loss_RA_MIL
from transformers import get_linear_schedule_with_warmup
from model import Model
from model_FC import ModelFC

import numpy as np
import torch
import matplotlib.pyplot as plt

import itertools
import torch
import einops
import torch.nn.functional as F

from transformers import LxmertModel, LxmertTokenizer
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from loss import loss_RA_MIL

from model import *

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
ACTION = '[unused3]'

lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
lxmert_tokenizer.add_special_tokens({"additional_special_tokens": [ACTION]})
lxmert_tokenizer.encode([ACTION])

lxmert = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
lxmert.to(device)

ACTION_TOKEN = lxmert_tokenizer.convert_tokens_to_ids(ACTION)

In [None]:
num_actions = 8
MAX_DETECTIONS=20
batch_size = 1

DETECTION_EMBEDDING_SIZE = 2048
OUTPUT_EMBEDDING_SIZE = 768
NUM_FRAMES_PER_STEP=5
MAX_DETECTIONS=20
CANDIDATES = NUM_FRAMES_PER_STEP * MAX_DETECTIONS

dataset = YouCookII(num_actions, "/h/sagar/ece496-capstone/datasets/ycii")
collate = YouCookIICollate(MAX_DETECTIONS=MAX_DETECTIONS)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)

In [None]:
model = Model(device, MAX_DETECTIONS=20)

In [None]:
_, boxes, features, steps_list, entity_list, entity_count_list, _, _ = next(iter(dataloader))

In [None]:
steps_list = remove_unused2(steps_list)

In [None]:
entities_count = entity_count_list[0]
entities = entity_list[0]
steps = steps_list[0]

In [None]:
steps = [step.strip() for step in steps.split(ACTION)[:-2]]

In [None]:
boxes = boxes.squeeze(0)
boxes = boxes.reshape(num_actions, CANDIDATES, -1)
boxes = boxes.to(device)

features = features.squeeze(0)
features = features.reshape(num_actions, CANDIDATES, -1)
features = features.to(device)

In [None]:
boxes.shape

In [None]:
from random import choice
from random import seed

seed(0)

steps_pairs = []
bboxes_pairs = torch.zeros(num_actions, 2, CANDIDATES, 4)
features_pairs = torch.zeros(num_actions, 2, CANDIDATES, DETECTION_EMBEDDING_SIZE)
entity_list_pairs = []
num_steps = len(steps)
pairs = []

for idx, step in enumerate(steps):
    idx_1 = idx
    idx_2 = choice([idx_2 for idx_2 in range(num_steps) if idx_2 != idx_1])
    
    pairs.append((idx_1, idx_2))
    
    steps_pairs.append(steps[idx_1] + " " + ACTION + " " + steps[idx_2] + " " + ACTION + " " + ACTION)
    
    bboxes_pairs[idx_1][0] = boxes[idx_1]
    bboxes_pairs[idx_1][1] = boxes[idx_2]
    
    features_pairs[idx_1][0] = features[idx_1]
    features_pairs[idx_1][1] = features[idx_2]
    
    entity_list_pairs.append([entities[idx_1], entities[idx_2]])
    
bboxes_pairs = bboxes_pairs.reshape(num_actions, 2 * CANDIDATES, -1)
bboxes_pairs = bboxes_pairs.to(device)

features_pairs = features_pairs.reshape(num_actions, 2 * CANDIDATES, DETECTION_EMBEDDING_SIZE)
features_pairs = features_pairs.to(device)

entity_count = [len(action) for action in sum(entity_list_pairs, [])]

In [None]:
boxes.shape

In [None]:
pairs

In [None]:
steps_pairs

In [None]:
entity_list_pairs

In [None]:
print(bboxes_pairs.shape)
print(features_pairs.shape)

In [None]:
inputs = lxmert_tokenizer(
            steps_pairs,
            padding="longest",
            truncation=False,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

inputs.input_ids = inputs.input_ids.to(device)
inputs.attention_mask = inputs.attention_mask.to(device)
inputs.token_type_ids = inputs.token_type_ids.to(device)

output = lxmert(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            visual_feats=features_pairs,
            visual_pos=bboxes_pairs,
            token_type_ids=inputs.token_type_ids,
            return_dict=True,
            output_attentions=True
        )

In [None]:
entity_idx = get_ent_inds(model, entity_list_pairs, steps_pairs)
entity_embeddings = get_entity_embeddings(output['language_output'], entity_idx).split(entity_count)

action_idx = (inputs.input_ids == ACTION_TOKEN)
A = output['language_output'][action_idx]

V = output['vision_output']

In [None]:
E = []

for i in range(num_actions):
    idx_1 = 2 * i
    idx_2 = idx_1 + 1
    E.append([entity_embeddings[idx_1], entity_embeddings[idx_2]])

In [None]:
VG = []

for i in range(num_actions):    
    pair_1 = []
    pair_2 = []
    
    for ent_1 in E[i][0]:
        alignment_scores = (ent_1 * V[i][0:100]).sum(dim=-1)
        pair_1.append(int(alignment_scores.argmax()))
                
    for ent_2 in E[i][1]:
        alignment_scores = (ent_2 * V[i][100:]).sum(dim=-1)
        pair_2.append(int(alignment_scores.argmax()))
        
    VG.append([pair_1, pair_2])

In [None]:
VG_V = []

for i, pair in enumerate(VG):
    VG_V.append([])
    for j, action in enumerate(pair):
        VG_V[i].append([])
        for k, entity in enumerate(action):
            if j == 0:
                VG_V[i][j].append(V[i][0:100][VG[i][j][k]])
            else:
                VG_V[i][j].append(V[i][100:][VG[i][j][k]])

In [None]:
VG

In [None]:
def get_loss(E, VG_V):
    loss = 0

    for entity, box in zip(E, VG_V):
        loss = loss + loss_pair(entity, box)
        
    print(int(loss))
        
    return loss

In [None]:
def S_lm(l, m, E, VG_V):
    #print("Computing S_{}{}".format(l, m))
    
    entities = E[m]
    scores = 0
    
    if (len(VG_V[l]) == 0):
        return 0
    
    boxes = torch.stack(VG_V[l])
    
    for j, ent in enumerate(entities):
        scores = scores + (ent * boxes).sum(dim=-1).max()
        
    return scores

In [None]:
def loss_pair(E, VG_V):
    delta = torch.full((1, 1), 1000.0).to(device)
    
    loss = 0

    assert(len(VG_V) == len(E))

    num_actions = len(E)
    
    zero = torch.zeros((1)).to(device)

    for l in range(num_actions):
        for m in range(num_actions):
            before_delta = S_lm(l, m, E, VG_V) - S_lm(l, l, E, VG_V)
            loss = loss + torch.max(zero, (before_delta + delta))[0]

        for m in range(num_actions):
            before_delta = S_lm(m, l, E, VG_V) - S_lm(l, l, E, VG_V)
            loss = loss + torch.max(zero, (before_delta + delta))[0]
            
    #print("Loss: {}".format(loss))
            
    return loss

# Training

In [7]:
from dataset import YouCookII
from dataset import YouCookIICollate
from torch.utils.data import DataLoader
from loss import loss_RA_MIL
from transformers import get_linear_schedule_with_warmup
from model import Model
from model_FC import ModelFC

import numpy as np
import torch
import matplotlib.pyplot as plt

def train(model, num_actions, batch_size, epochs=25, lr=0.001, y=0.5, MAX_DETECTIONS=20):
    dataset = YouCookII(num_actions, "/h/sagar/ece496-capstone/datasets/ycii")
    train_size = int(len(dataset) * (2/3))
    valid_size = int(len(dataset) - train_size)
    
    train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])
    
    collate = YouCookIICollate(MAX_DETECTIONS=MAX_DETECTIONS)
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, int(0.2*epochs), epochs)

    train_loss = np.zeros(epochs)
    valid_loss = np.zeros(epochs)
    
    model.train()
    
    for epoch in range(epochs):
        epoch_loss = 0.0
        num_batches = 0
        for data in train_dataloader:
            _, bboxes_tensor, features_tensor, steps_list, entity_list, entity_count_list, _, _ = data
            batch_size = len(data[0])
            
            # Zero out any gradients.
            optimizer.zero_grad()
            
            # Run inference (forward pass).
            E, VG_V = model(batch_size, num_actions + 1, steps_list, features_tensor, bboxes_tensor, entity_count_list, entity_list)            
            
            # Loss from alignment.
            loss_ = get_loss(E, VG_V)

            # Backpropagation (backward pass).
            loss_.backward()

            # Update parameters.
            optimizer.step()
            
            epoch_loss += loss_
            num_batches += 1
        
        # learning rate schedule
        # update after each epoch
        scheduler.step()
        epoch_loss = epoch_loss / num_batches
        
        # Save loss and accuracy at each epoch, plot (and checkpoint).
        train_loss[epoch] = epoch_loss
        valid_loss[epoch] = get_validation_loss(model, num_actions, y, valid_dataloader)
        
        # after epoch completes
        print("Epoch {} - Train Loss: {}, Validation Loss: {}".format(epoch + 1, train_loss[epoch], valid_loss[epoch]))
    
    plt.plot(train_loss, label='train loss')
    plt.plot(valid_loss, label='valid loss')
    plt.legend()
    plt.show()
        
    return train_loss, valid_loss

def get_validation_loss(model, num_actions, y, valid_dataloader):
    epoch_loss = 0.0
    num_batches = 0
        
    with torch.no_grad():
        for data in valid_dataloader:
            _, bboxes_tensor, features_tensor, steps_list, entity_list, entity_count_list, _, _ = data
            batch_size = len(data[0])

            # Run inference (forward pass).
            E, VG_V = model(batch_size, num_actions + 1, steps_list, features_tensor, bboxes_tensor, entity_count_list, entity_list)

            # Loss from alignment.
            loss_ = get_loss(E, VG_V)
            
            epoch_loss += loss_
            num_batches += 1
            
    epoch_loss = epoch_loss / num_batches
    
    return epoch_loss

In [17]:
import itertools
import torch
import einops
import torch.nn.functional as F

from transformers import LxmertModel, LxmertTokenizer
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from loss import loss_RA_MIL

from model import *

from random import choice
from random import seed

class ModelTesting(nn.Module):
    ACTION = '[unused3]'

    DETECTION_EMBEDDING_SIZE = 2048
    OUTPUT_EMBEDDING_SIZE = 768

    def __init__(self, device, NUM_FRAMES_PER_STEP=5, MAX_DETECTIONS=20):
        super(ModelTesting, self).__init__()

        self.device = device

        self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
        self.MAX_DETECTIONS = MAX_DETECTIONS
        self.CANDIDATES = self.NUM_FRAMES_PER_STEP * self.MAX_DETECTIONS

        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.lxmert_tokenizer.add_special_tokens({"additional_special_tokens": [self.ACTION]})
        self.lxmert_tokenizer.encode([self.ACTION])

        self.ACTION_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(self.ACTION)

        self.lxmert = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.lxmert.to(device)

    def forward(self, BATCH_SIZE, NUM_ACTIONS, steps_list, features, boxes, entity_count_list, entity_list):
        assert(BATCH_SIZE == 1)
        
        NUM_ACTIONS = NUM_ACTIONS - 1
        
        steps_list = remove_unused2(steps_list)
        
        entities_count = entity_count_list[0]
        entities = entity_list[0]
        steps = steps_list[0]
        steps = [step.strip() for step in steps.split(self.ACTION)[:-2]]
        
        boxes = boxes.squeeze(0)
        boxes = boxes.to(self.device)
        boxes = boxes.reshape(NUM_ACTIONS, self.CANDIDATES, -1)
        
        features = features.squeeze(0)
        features = features.to(self.device)
        features = features.reshape(NUM_ACTIONS, self.CANDIDATES, -1)

        steps_pairs = []
        
        bboxes_pairs = torch.zeros(NUM_ACTIONS, 2, self.CANDIDATES, 4)
        bboxes_pairs = bboxes_pairs.to(self.device)
        
        features_pairs = torch.zeros(NUM_ACTIONS, 2, self.CANDIDATES, self.DETECTION_EMBEDDING_SIZE)
        features_pairs = features_pairs.to(self.device)
        
        entity_list_pairs = []
        num_steps = len(steps)
        pairs = []
        
        for idx, step in enumerate(steps):
            idx_1 = idx
            idx_2 = choice([idx_2 for idx_2 in range(num_steps) if idx_2 != idx_1])

            pairs.append((idx_1, idx_2))

            steps_pairs.append(steps[idx_1] + " " + self.ACTION + " " + steps[idx_2] + " " + self.ACTION + " " + self.ACTION)

            bboxes_pairs[idx_1][0] = boxes[idx_1]
            bboxes_pairs[idx_1][1] = boxes[idx_2]

            features_pairs[idx_1][0] = features[idx_1]
            features_pairs[idx_1][1] = features[idx_2]

            entity_list_pairs.append([entities[idx_1], entities[idx_2]])
    
        bboxes_pairs = bboxes_pairs.reshape(NUM_ACTIONS, 2 * self.CANDIDATES, -1)
        bboxes_pairs = bboxes_pairs.to(self.device)

        features_pairs = features_pairs.reshape(NUM_ACTIONS, 2 * self.CANDIDATES, self.DETECTION_EMBEDDING_SIZE)
        features_pairs = features_pairs.to(self.device)

        entity_count = [len(action) for action in sum(entity_list_pairs, [])]
        
        inputs = self.lxmert_tokenizer(
            steps_pairs,
            padding="longest",
            truncation=False,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        inputs.input_ids = inputs.input_ids.to(self.device)
        inputs.attention_mask = inputs.attention_mask.to(self.device)
        inputs.token_type_ids = inputs.token_type_ids.to(self.device)

        output = self.lxmert(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    visual_feats=features_pairs,
                    visual_pos=bboxes_pairs,
                    token_type_ids=inputs.token_type_ids,
                    return_dict=True,
                    output_attentions=True
                )
        
        entity_idx = get_ent_inds(self, entity_list_pairs, steps_pairs)
        entity_embeddings = get_entity_embeddings(output['language_output'], entity_idx).split(entity_count)

        action_idx = (inputs.input_ids == self.ACTION_TOKEN)
        A = output['language_output'][action_idx]

        V = output['vision_output']
        
        E = []
        for i in range(NUM_ACTIONS):
            idx_1 = 2 * i
            idx_2 = idx_1 + 1
            E.append([entity_embeddings[idx_1], entity_embeddings[idx_2]])
        
        VG = []
        
        for i in range(NUM_ACTIONS):    
            pair_1 = []
            pair_2 = []

            for ent_1 in E[i][0]:
                alignment_scores = (ent_1 * V[i][0:100]).sum(dim=-1)
                pair_1.append(alignment_scores.argmax())

            for ent_2 in E[i][1]:
                alignment_scores = (ent_2 * V[i][100:]).sum(dim=-1)
                pair_2.append(alignment_scores.argmax())

            VG.append([pair_1, pair_2])
            
        VG_V = []

        for i, pair in enumerate(VG):
            VG_V.append([])
            for j, action in enumerate(pair):
                VG_V[i].append([])
                for k, entity in enumerate(action):
                    if j == 0:
                        VG_V[i][j].append(V[i][0:100][VG[i][j][k]])
                    else:
                        VG_V[i][j].append(V[i][100:][VG[i][j][k]])
                        
        VG_ind = torch.zeros((1, NUM_ACTIONS, max(entity_count))).to(self.device)

        for i, pair in enumerate(VG):
            for k, entity in enumerate(pair[0]):
                VG_ind[0][i][k] = VG[i][0][k]
                        
        #return None, None, None, NUM_ACTIONS, None, None, VG_ind, None
        return E, VG_V, output, inputs, E, V

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_test = ModelTesting(device)

In [14]:
dataset = YouCookII(8, "/h/sagar/ece496-capstone/datasets/ycii")
collate = YouCookIICollate(MAX_DETECTIONS=20)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate)

In [6]:
correct = 0
incorrect = 0

for data in dataloader:
    _, boxes, features, steps_list, entity_list, entity_count_list, _, _ = data
    E, VG_V, outputs, inputs, E, V = model_test(1, 8 + 1, steps_list, features, boxes, entity_count_list, entity_list)
    
    for i in range(8):    
        for ent_1 in E[i][0]:
            aligned = (ent_1 * V[i][0:100]).sum(dim=-1)
            aligned = aligned.argmax()

            unaligned = (ent_1 * V[i][100:]).sum(dim=-1)
            unaligned = unaligned.argmax()

            #print("Aligned: {}, Unaligned: {}".format(aligned, unaligned))

            if (aligned > unaligned):
                correct+=1
            else:
                incorrect+=1

        for ent_2 in E[i][1]:
            aligned = (ent_2 * V[i][100:]).sum(dim=-1)
            aligned = aligned.argmax()

            unaligned = (ent_2 * V[i][0:100]).sum(dim=-1)
            unaligned = unaligned.argmax()

            #print("Aligned: {}, Unaligned: {}".format(aligned, unaligned))

            if (aligned > unaligned):
                correct+=1
            else:
                incorrect+=1
            
print("Accuracy: {}".format(correct / (correct + incorrect)))

Accuracy: 0.4860661049902787


In [15]:
_, boxes, features, steps_list, entity_list, entity_count_list, _, _ = next(iter(dataloader))

In [20]:
E, VG_V, outputs, inputs, E, V = model_test(1, 8 + 1, steps_list, features, boxes, entity_count_list, entity_list)

In [28]:
V[0]

tensor([[-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        ...,
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352]],
       device='cuda:0', grad_fn=<SelectBackward>)

In [32]:
V[7]

tensor([[-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        ...,
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352],
        [-1.2660, -1.2161, -1.2227,  ...,  1.0998, -1.2849, -1.2352]],
       device='cuda:0', grad_fn=<SelectBackward>)

In [None]:
tokens = model_test.lxmert_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

In [None]:
tokens_1 = tokens
tokens_2 = ["C{}".format(i + 1) for i in range(200)]

In [None]:
action = 6

attention_1 = [attn[action].unsqueeze(0) for attn in outputs.language_attentions]
attention_2 = [attn[action].unsqueeze(0) for attn in outputs.vision_attentions]

cross = [attn[action].unsqueeze(0) for attn in outputs.cross_encoder_attentions]

In [None]:
cross

In [None]:
torch.stack(cross).min()

In [None]:
torch.stack(cross).max()

In [None]:
from bertviz import model_view
from bertviz import head_view

head_view(
    encoder_attention=attention_2,
    decoder_attention=attention_1,
    cross_attention=cross,
    encoder_tokens=tokens_2,
    decoder_tokens=tokens_1,
    layer=0,
    heads=[2]
)

In [None]:
train_loss, valid_loss = train(model_test, 8, 1, epochs=100, lr=1e-2)

In [None]:
from eval_fi import eval_all_dataset
eval_all_dataset(model_test)

In [None]:
torch.save(model_test.state_dict(), "/h/sagar/ece496-capstone/weights/weights-nv-5")

In [19]:
model_test.load_state_dict(torch.load("/h/sagar/ece496-capstone/weights/weights-nv-5"))

<All keys matched successfully>