In [1]:
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import clip
import json 
import time
import math

import random
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from tokenizers import Tokenizer
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "5"


In [4]:
x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2)
print(x)
print(torch.roll(x, -1))

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])
tensor([[2, 3],
        [4, 5],
        [6, 7],
        [8, 1]])


In [9]:
x = [1,2,3,4,5]
print(10 in x)

False


In [3]:
train_dir = "/data/dlcv/hw3/hw3_data/p2_data/images/train"
valid_dir = "/data/dlcv/hw3/hw3_data/p2_data/images/val"
train_json_dir = "/data/dlcv/hw3/hw3_data/p2_data/train.json"
val_json_dir = "/data/dlcv/hw3/hw3_data/p2_data/val.json"
ckpt_dir = "./p2"
tokenizer = Tokenizer.from_file("caption_tokenizer.json")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if device == "cuda":
torch.cuda.set_device(5)
print('Device used:', device)

Device used: cuda


In [4]:
class p2dataset(Dataset):
    def __init__(self, inputPath, json_dir, transform=None):
        self.inputPath = inputPath
        self.transform = transform
        self.inputName = sorted(os.listdir(inputPath))
        with open(json_dir, 'r') as j:
            caption_json = json.loads(j.read())
        self.caption_json = caption_json
        self.filecap = {}
        for dic in self.caption_json["images"]:
            self.filecap[dic['id']] = {"filename" : dic["file_name"], "captions": []}
        for dic in self.caption_json["annotations"]:
            self.filecap[dic['image_id']]['captions'].append(dic['caption'])
        self.files = [dic for dic in self.filecap.values()]
        
    def __getitem__(self, index):
        img = Image.open(os.path.join(self.inputPath, self.files[index]['filename']))
        if self.transform:
            img = self.transform(img)
        
        cap = self.files[index]['captions'][random.randint(0, len(self.files[index]['captions'])-1)]
        tokenized_caption = tokenizer.enable_padding(length=100)
        tokenized_caption = tokenizer.encode(cap)
        # [# of str, str len]
        return img, torch.Tensor(tokenized_caption.ids).long(), self.files[index]['filename'].replace(".jpg", "")

    def __len__(self):
        return len(self.inputName)


In [5]:
_, preprocess = clip.load("ViT-B/32", device)

In [6]:
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f59874690d0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [5]:
train_dataset = p2dataset(train_dir, train_json_dir, transform=preprocess)
valid_dataset = p2dataset(valid_dir, val_json_dir, transform=preprocess)
print('# images in train:', len(train_dataset))
print('# images in valid:', len(valid_dataset))

train_loader = DataLoader(train_dataset, batch_size=72, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=0)

dataiter = iter(valid_loader)
images, ids, filename = dataiter.next()
# print(images[0])
# print(ids)
# print(filename)
# def imshow(img):
#     npimg = img.numpy()
#     plt.imshow(np.transpose(npimg, (1, 2, 0)))
# # show images
# imshow(torchvision.utils.make_grid(images))


# images in train: 10604
# images in valid: 1789


### Model

In [6]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        # print("query = [batch size, query len, hid dim]:", query.shape)
        # print("key = [batch size, key len, hid dim]:", key.shape)
        # print("value = [batch size, value len, hid dim]", value.shape)
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        # print("Q = [batch size, query len, hid dim]:", Q.shape)
        # print("K = [batch size, key len, hid dim]:", K.shape)
        # print("V = [batch size, value len, hid dim]", V.shape)
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                        
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        # print("Q = [batch size, n heads, query len, head dim]:", Q.shape)
        # print("K = [batch size, n heads, key len, head dim]:", K.shape)
        # print("V = [batch size, n heads, value len, head dim]", V.shape)
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        # print("energy = [batch size, n heads, query len, key len]: ", energy.shape)
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
        # print("attention = [batch size, n heads, query len, key len]: ", attention.shape)      
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)    
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)            
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)                    
        #x = [batch size, query len, hid dim]
        
        return x, attention

class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask=None):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        # print("Start self attention.....")
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)    
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        # print("Start encoder attention.....")
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)            
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention
    
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        self._reset_parameters()

        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask=None):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # print("pos = [batch size, trg len]:", pos.shape)       
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        # print("trg = [batch size, trg len, hid dim]: ", trg.shape)
        # print("attention = [batch size, n heads, trg len, src len]: ", attention.shape)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        # print("output = [batch size, trg len, output dim]:", output.shape)
        #output = [batch size, trg len, output dim]
            
        return output, attention
    
    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        
class Transformer(nn.Module):
    def __init__(self,  
                 decoder,  
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.decoder = decoder
        self.trg_pad_idx = trg_pad_idx
        self.device = device
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        # print("trg_pad_mask = [batch size, 1, 1, trg len]:", trg_pad_mask.shape)
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).bool().to(device)
        # print("trg_sub_mask = [trg len, trg len]: ",trg_sub_mask.shape)
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        # print("trg_mask = [batch size, 1, trg len, trg len]:", trg_mask.shape)

        return trg_mask

    def forward(self, enc_src, trg):
                
        trg_mask = self.make_trg_mask(trg)
        # print("trg_mask = [batch size, 1, trg len, trg len]:", trg_mask.shape)
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        # enc_src = self.encoder.encode_image(src).unsqueeze(1).float()
        # print("enc_src = [batch size, src len, hid dim]: ", enc_src.shape)
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask)
        
        # print("output = [batch size, trg len, output dim]:", output.shape)
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train(model, enc, train_loader, optimizer, criterion, clip):
    
    model.train()
    enc = enc.to(device)

    epoch_loss = 0
    
    for img, trg, _ in train_loader:
                
        img, trg = img.to(device), trg.to(device)
        optimizer.zero_grad()
        for param in enc.parameters():
            param.requires_grad = False             
        # print("img: ", img.shape)
        # print("trg: ", trg[:,:-1].shape)
        # output, _ = model(img, trg[:,:-1])
        enc_src = enc.encode_image(img).unsqueeze(1).float()
        output, attention = model(enc_src, trg[:,:-1])
        # print("output = [batch size, trg len, output dim]:", output.shape)                
        # output = [batch size, trg len - 1, output dim]
        # trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        # print("output", output.shape)
        trg = trg[:,1:].contiguous().view(-1)
        # print("trg", trg.shape)     
                        
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        # loss = get_loss(output, trg, 18022, 0.1, 0)
        # print(f'| Loss: {loss:.3f}')

        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(train_loader.dataset)

def evaluate(model, enc, valid_loader, criterion):
    
    model.eval()
    enc = enc.to(device)
    epoch_loss = 0
    
    with torch.no_grad():
    
        for img, trg, _ in valid_loader:
            img, trg = img.to(device), trg.to(device)
            for param in enc.parameters():
                param.requires_grad = False  
            enc_src = enc.encode_image(img).unsqueeze(1).float()
            output, _ = model(enc_src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(valid_loader.dataset)

In [7]:
OUTPUT_DIM = 18022
HID_DIM = 512
ENC_LAYERS = 3
DEC_LAYERS = 7
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
TRG_PAD_IDX = 0
LEARNING_RATE = 2e-5
PAD = 0
BOS = 2
EOS = 3

enc, _ = clip.load("ViT-B/32", device)

enc = enc.to(device).float()

for param in enc.parameters():
    param.requires_grad = False

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)
model = Transformer(dec, TRG_PAD_IDX, device).to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')
model = model.apply(initialize_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX, label_smoothing=0.1)


The model has 36,931,174 trainable parameters


In [11]:
N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, enc, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, enc, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(ckpt_dir, 'ViT-B_32declayer7.pt'))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 2m 26s
	Train Loss: 0.118 | Train PPL:   1.125
	 Val. Loss: 7.426 |  Val. PPL: 1679.471
Epoch: 02 | Time: 2m 11s
	Train Loss: 0.096 | Train PPL:   1.101
	 Val. Loss: 6.241 |  Val. PPL: 513.432
Epoch: 03 | Time: 2m 24s
	Train Loss: 0.084 | Train PPL:   1.087
	 Val. Loss: 5.718 |  Val. PPL: 304.253
Epoch: 04 | Time: 2m 30s
	Train Loss: 0.079 | Train PPL:   1.082
	 Val. Loss: 5.539 |  Val. PPL: 254.303
Epoch: 05 | Time: 2m 52s
	Train Loss: 0.077 | Train PPL:   1.080
	 Val. Loss: 5.398 |  Val. PPL: 220.856
Epoch: 06 | Time: 2m 50s
	Train Loss: 0.075 | Train PPL:   1.078
	 Val. Loss: 5.251 |  Val. PPL: 190.821
Epoch: 07 | Time: 2m 32s
	Train Loss: 0.073 | Train PPL:   1.076
	 Val. Loss: 5.183 |  Val. PPL: 178.188
Epoch: 08 | Time: 2m 51s
	Train Loss: 0.072 | Train PPL:   1.074
	 Val. Loss: 5.018 |  Val. PPL: 151.180
Epoch: 09 | Time: 2m 57s
	Train Loss: 0.070 | Train PPL:   1.073
	 Val. Loss: 4.966 |  Val. PPL: 143.400
Epoch: 10 | Time: 2m 57s
	Train Loss: 0.069 | Train PP

KeyboardInterrupt: 

In [8]:
def test(testmodel, valid_loader):
    predict = {}
    testmodel.eval()
        
    with torch.no_grad():
        
        for i, (img, trg, filename) in enumerate(valid_loader):
            img, trg = img.to(device), trg.to(device)
            out, _ = testmodel(img, trg)
            text = out.argmax(2, keepdim=False)
            text = text.cpu().numpy().squeeze().tolist()
            text = tokenizer.decode(text)
            # text = tokenizer.decode(text[:text.index(3)+1])
            
            predict[filename[0]] = text
        
    return predict


def generate_square_subsequent_mask(sz: int, device='cpu') -> torch.Tensor:
    return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)

def inference(testmodel, valid_loader):
    pred = {}
    BOS = 2
    EOS = 3
    max_len = 77
    testmodel.eval()
    
    for i, (img, _, filename) in enumerate(valid_loader):
        img = img.to(device)
        batch_size = img.shape[0]
        
        trg_indexes = torch.zeros((batch_size, max_len), dtype=torch.int32)
        trg_indexes[:, 0] = BOS
        # print(trg_indexes)
        trg_indexes = trg_indexes.to(device)
        
        with torch.no_grad():
            # enc_src = testmodel.encoder.encode_image(img).unsqueeze(1).float()
                   
            for index in range(1, max_len):
                
                out, attention = testmodel(img, trg_indexes)
                pred_token = out.argmax(2)
                trg_indexes[:,index] = pred_token[:,index]
                # print(pred_token[:,index])
                if pred_token[:,index] == EOS:
                    break
                # print(trg_indexes)
            trg_indexes = trg_indexes.cpu().numpy().squeeze().tolist()
        caption = tokenizer.decode(trg_indexes)
        # print(caption)
        pred[filename[0]] = caption
    return pred    

def inference2(testmodel, valid_loader):
    dict = {}
    testmodel.eval()
    max_len = 77
    with torch.no_grad():
        for img, _, filename in valid_loader:
            img = img.to(device)
            
            trg_indexes = [BOS]
            # with torch.no_grad():
            #     enc_src = testmodel.encoder.encode_image(img).unsqueeze(1).float()
                
            for i in range(max_len):
                trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
                # trg_mask = generate_square_subsequent_mask(trg_tensor.shape[1], device)
                # trg_mask = testmodel.make_trg_mask(trg_tensor)
                # out, attention = testmodel.decoder(trg_tensor, enc_src, trg_mask)
                out, attention = testmodel(img, trg_tensor)
                
                # print(out.argmax(2).shape)
                pred_token = out.argmax(2)[:,i].item()
                # print(out.argmax(2))
                trg_indexes.append(pred_token)
                # print(trg_indexes)
                if(pred_token == EOS):
                    break
            caption = tokenizer.decode(trg_indexes)
            # print(caption)
            dict[filename[0]] = caption
    return dict

def inference3(testmodel, valid_loader):
    pred = {}
    BOS = 2
    EOS = 3
    max_len = 77
    testmodel.eval()
    
    for i, (img, _, filename) in enumerate(valid_loader):
        img = img.to(device)
        batch_size = img.shape[0]
        
        trg_indexes = torch.zeros((batch_size, max_len), dtype=torch.int32)
        trg_indexes[:, 0] = BOS
        # print(trg_indexes)
        trg_indexes = trg_indexes.to(device)
        
        with torch.no_grad():
            enc_src = testmodel.encoder.encode_image(img).unsqueeze(1).float()
                   
            for index in range(1, max_len):
                
                out, attention = testmodel.decoder(img, trg_indexes)
                pred_token = out.argmax(2)
                trg_indexes[:,index] = pred_token[:,index]
                # print(pred_token[:,index])
                if pred_token[:,index] == EOS:
                    break
                # print(trg_indexes)
            trg_indexes = trg_indexes.cpu().numpy().squeeze().tolist()
        caption = tokenizer.decode(trg_indexes)
        # print(caption)
        pred[filename[0]] = caption
    return pred            

In [9]:
# load model
enc, _ = clip.load("ViT-B/32", device)

enc = enc.to(device).float()

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)
testmodel = Transformer(dec, TRG_PAD_IDX, device)
print(f'The model has {count_parameters(testmodel):,} trainable parameters')
model = model.apply(initialize_weights)
optimizer = torch.optim.Ainitialize_weightsdam(testmodel.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
testmodel.load_state_dict(torch.load(os.path.join(ckpt_dir, 'ViT-B_32declayer7.pt')))

The model has 36,931,174 trainable parameters


<All keys matched successfully>

In [10]:
pred = inference(testmodel, valid_loader)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:5! (when checking argument for argument index in method wrapper__index_select)

In [None]:
pred

In [50]:
with open("./p2/pred.json", "w") as f:
    json.dump(pred, f, indent=4)

In [52]:
dict2 = inference2(testmodel, valid_loader)

In [53]:
dict2

{'000000165547': 'a kitchen with a sink and a sink .',
 '000000518586': 'a group of elephants are standing in the grass .',
 '000000249720': 'a person riding a skateboard down a street .',
 '000000122934': 'a man is riding a skateboard on the street .',
 '000000393258': 'a woman in a white shirt is sitting on a table .',
 '000000266041': 'a man in a black shirt and a black shirt is holding a cell phone .',
 '000000053015': 'a group of people sitting on a table with a table .',
 '000000039540': 'a woman is sitting on a table with a table .',
 '000000320039': 'a man in a white shirt and a white shirt is standing in a table .',
 '000000014941': 'a man is sitting on a bench with a red shirt .',
 '000000304355': 'a cat is sitting on a bed with a laptop .',
 '000000096514': 'a group of people sitting on a bench .',
 '000000578703': 'a group of people standing on a beach .',
 '000000350966': 'a train is parked on the side of a road .',
 '000000201220': 'a woman standing next to a small small 

In [54]:
with open("./p2/dict2.json", "w") as f:
    json.dump(dict2, f, indent=4)

In [41]:
dict

{'000000165547': 'a bathroom with a toilet and a sink',
 '000000518586': 'a man is standing on a bench next to a tree .',
 '000000249720': 'a man in a white shirt and white shirt is standing in front of a tree .',
 '000000122934': 'a man is standing on a bench in the air .',
 '000000393258': 'a man in a blue shirt and white shirt is sitting on a bench .',
 '000000266041': 'a man in a white shirt and white shirt is sitting on a bed .',
 '000000053015': 'a woman in a white shirt is sitting on a table .',
 '000000039540': 'a large kitchen with a table and a table .',
 '000000320039': 'a woman in a white shirt and white shirt is sitting on a table .',
 '000000014941': 'a woman is sitting on a table with a dog .',
 '000000304355': 'a cat sitting on a bed with a white toilet .',
 '000000096514': 'a man is sitting on a bench in the middle of a room .',
 '000000578703': 'a man is riding a skateboard down a street .',
 '000000350966': 'a train is parked on a street with a train .',
 '0000002012

In [36]:
predict = test(testmodel, valid_loader)

In [33]:
print(len(predict))

1789


In [54]:
with open("./p2/dict.json", "w") as f:
    json.dump(dict, f, indent=4)

with open("./p2/dict2.json", "w") as f:
    json.dump(dict2, f, indent=4)

In [129]:
for i in range(30):
    test = out.max(2, keepdim=True)[1][i]
    # print(out.max(2, keepdim=True)[1])
    test = test.cpu().numpy().squeeze().tolist()
    # print(test)
    test = test[:test.index(3)+1]
    print(tokenizer.decode(test))

table situated in corner of room with a vase for a center piece
a black and white photo of a train driving .
a woman is wearing a pink helmet and riding her bike through the city .
a group of people driving a horse drawn carriage .
a bunch of travel bags sit on a carpet floor
a person with a tattoo holding a basketball
a baby grabs for a bite of pizza that a man is eating .
a full veggie pizza near a couple of plate of fries , are all ready to be eaten .
a close up of a child eating food
a small child is holding a stuffed bear
a white toilet sitting in a bathroom next to a tub .
a bird is standing on a shallow body of water .
four ducks are in a grassy island of a parking lot with their heads down .
a subway train pulling into the train station .
a couple of people sitting inside of a car .
three zebras and two other animals grazing .
a group of men play a game of tennis together on a grass tennis court .
two men standing in a store aisle with one holding a baseball bat .
a man in a na