In [1]:
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader
import numpy as np
import clip
import json 
import time
import math

import random
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from tokenizers import Tokenizer
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2, 5"


In [3]:
n = 100
label = np.arange(n)
print(label)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [2]:
train_dir = "/data/dlcv/hw3/hw3_data/p2_data/images/train"
valid_dir = "/data/dlcv/hw3/hw3_data/p2_data/images/val"
train_json_dir = "/data/dlcv/hw3/hw3_data/p2_data/train.json"
val_json_dir = "/data/dlcv/hw3/hw3_data/p2_data/val.json"
ckpt_dir = "./p2"
tokenizer = Tokenizer.from_file("caption_tokenizer.json")
# device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(3)

print('Device used:', device)

Device used: cuda


In [3]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
with open(train_json_dir, 'r') as j:
    caption_json = json.loads(j.read())
# print(caption_json["images"][i].values() for i in range(10))
print(type(caption_json["images"][0]))
# print(type([0]*80))

<class 'dict'>


In [4]:
class p2dataset(Dataset):
    def __init__(self, inputPath, json_dir, transform=None):
        self.inputPath = inputPath
        self.transform = transform
        self.inputName = sorted(os.listdir(inputPath))
        with open(json_dir, 'r') as j:
            caption_json = json.loads(j.read())
        self.caption_json = caption_json
        self.filecap = {}
        for dic in self.caption_json["images"]:
            self.filecap[dic['id']] = {"filename" : dic["file_name"], "captions": []}
        for dic in self.caption_json["annotations"]:
            self.filecap[dic['image_id']]['captions'].append(dic['caption'])
        self.files = [dic for dic in self.filecap.values()]
        
    def __getitem__(self, index):
        img = Image.open(os.path.join(self.inputPath, self.files[index]['filename']))
        if self.transform:
            img = self.transform(img)
        
        cap = self.files[index]['captions'][random.randint(0, len(self.files[index]['captions'])-1)]
        tokenized_caption = tokenizer.enable_padding(length=70)
        tokenized_caption = tokenizer.encode(cap)
        cut_eos = tokenized_caption.ids
        cut_eos[cut_eos.index(3)] = 0
        
        # [# of str, str len]
        return img, torch.Tensor(tokenized_caption.ids).long(), torch.Tensor(cut_eos).long()

    def __len__(self):
        return len(self.inputName)


In [5]:
_, preprocess = clip.load("ViT-L/14@336px", device)

In [6]:
train_dataset = p2dataset(train_dir, train_json_dir, transform=preprocess)
test_dataset = p2dataset(valid_dir, val_json_dir, transform=preprocess)
print('# images in train:', len(train_dataset))
print('# images in valid:', len(test_dataset))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=1)

dataiter = iter(test_loader)
images, ids, cutos = dataiter.next()
print(images.shape)
print(ids.shape)
print(cutos.shape)

# print(filename)
# def imshow(img):
#     npimg = img.numpy()
#     plt.imshow(np.transpose(npimg, (1, 2, 0)))
# # show images
# imshow(torchvision.utils.make_grid(images))


# images in train: 10604
# images in valid: 1789


KeyboardInterrupt: 

### Model

In [9]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.clip_encoder, __ = clip.load('ViT-L/14@336px', device)
        self.clip_encoder = self.clip_encoder.float()
        #override vit's forward
        vit = self.clip_encoder.visual
        bound_method = vit_forward.__get__(vit, vit.__class__)
        setattr(vit, 'forward', bound_method)
        #Freeze model parameters
        for param in self.clip_encoder.parameters():
            param.requires_grad = False
    
    def forward(self, x):
        return self.clip_encoder.encode_image(x)

def vit_forward(self, x):
    x = self.conv1(x)  # shape = [*, width, grid, grid]
    x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
    x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
    x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
    x = x + self.positional_embedding.to(x.dtype)
    x = self.ln_pre(x)

    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x)
    x = x.permute(1, 0, 2)  # LND -> NLD

    x = self.ln_post(x[:,1:,:])
    if self.proj is not None:
            x = x @ self.proj
            
    return x   


class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        # print("query = [batch size, query len, hid dim]:", query.shape)
        # print("key = [batch size, key len, hid dim]:", key.shape)
        # print("value = [batch size, value len, hid dim]", value.shape)
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        # print("Q = [batch size, query len, hid dim]:", Q.shape)
        # print("K = [batch size, key len, hid dim]:", K.shape)
        # print("V = [batch size, value len, hid dim]", V.shape)
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                        
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        # print("Q = [batch size, n heads, query len, head dim]:", Q.shape)
        # print("K = [batch size, n heads, key len, head dim]:", K.shape)
        # print("V = [batch size, n heads, value len, head dim]", V.shape)
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        # print("energy = [batch size, n heads, query len, key len]: ", energy.shape)
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
        # print("attention = [batch size, n heads, query len, key len]: ", attention.shape)      
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)    
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)            
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        # print("x = [batch size, n heads, query len, key len]: ", x.shape)                    
        #x = [batch size, query len, hid dim]
        
        return x, attention

class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask=None):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        # print("Start self attention.....")
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)    
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        # print("Start encoder attention.....")
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)            
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention
    
    
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        self._reset_parameters()
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask=None):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # print("pos = [batch size, trg len]:", pos.shape)       
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        # print("trg = [batch size, trg len, hid dim]:", trg.shape)
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        # print("trg = [batch size, trg len, hid dim]: ", trg.shape)
        # print("attention = [batch size, n heads, trg len, src len]: ", attention.shape)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        # print("output = [batch size, trg len, output dim]:", output.shape)
        #output = [batch size, trg len, output dim]
            
        return output, attention
    
    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
class Transformer(nn.Module):
    def __init__(self,  
                 decoder,  
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.decoder = decoder
        self.trg_pad_idx = trg_pad_idx
        self.device = device
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        # print("trg_pad_mask = [batch size, 1, 1, trg len]:", trg_pad_mask.shape)
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).bool().to(device)
        # print("trg_sub_mask = [trg len, trg len]: ",trg_sub_mask.shape)
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        # print("trg_mask = [batch size, 1, trg len, trg len]:", trg_mask.shape)

        return trg_mask

    def forward(self, img, enc_src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
        src_mask = torch.ones((img.shape[0],1,1,1)).bool().to(device)
        # print("src_mask = [batch size, 1, 1, src len]:", src_mask.shape)
        
        trg_mask = self.make_trg_mask(trg)
        # print("trg_mask = [batch size, 1, trg len, trg len]:", trg_mask.shape)
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        # enc_src = self.encoder(img)#.unsqueeze(1).float()
        # print("enc_src = [batch size, src len, hid dim]: ", enc_src.shape)
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        # print("output = [batch size, trg len, output dim]:", output.shape)
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_loss(pred, ans, vocab_size=18022, label_smoothing=0.01, pad=0):
    # took this "normalizing" from tensor2tensor. We subtract it for
    # readability. This makes no difference on learning.
    confidence = 1.0 - label_smoothing
    low_confidence = (1.0 - confidence) / float(vocab_size - 1)
    normalizing = -(
        confidence * math.log(confidence) + float(vocab_size - 1) *
        low_confidence * math.log(low_confidence + 1e-20))

    one_hot = torch.zeros_like(pred).scatter_(1, ans.unsqueeze(1), 1)
    one_hot = one_hot * confidence + (1 - one_hot) * low_confidence
    log_pred = F.log_softmax(pred, dim=1)
    #print("=========test2=========", log_pred)

    xent = -(one_hot * log_pred).sum(dim=1)
    xent = xent.masked_select(ans != pad)
    loss = (xent - normalizing).mean()
    return loss

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train(model, enc, train_loader, optimizer, criterion, clip):
    
    model.train()
    enc = enc.to(device)

    epoch_loss = 0
    
    for i ,(img, trg, trg_cut) in enumerate(train_loader):
        
        img, trg, trg_cut = img.to(device), trg.to(device), trg_cut.to(device)
        with torch.no_grad():
            enc_src = enc(img)
        # print("img: ", img.shape)
        # print("trg: ", trg[:,:-1].shape)
        # output, _ = model(img, trg[:,:-1])
        output, attention = model(img, enc_src, trg_cut)
        # print("output = [batch size, trg len, output dim]:", output.shape)                
        # output = [batch size, trg len - 1, output dim]
        # trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        # print("output ", output.shape)
        # trg = trg.contiguous().view(-1)
        ans = torch.roll(trg, -1)
        ans[:, -1] = 0
        ans = ans.view(-1)
        # print("ans ", ans.shape) 
                        
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
        loss = criterion(output, ans)
        # loss = get_loss(output, ans)
        epoch_loss += loss.item()   
        # print(f'| train Loss: {loss:.3f}')
        optimizer.zero_grad()               
        loss.backward()
        
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
             
    return epoch_loss / (i+1)

def evaluate(model, enc, test_loader, criterion):
    
    model.eval()
    enc = enc.to(device)
    epoch_loss = 0
        
    for i, (img, trg, trg_cut) in enumerate(test_loader):
        img, trg, trg_cut = img.to(device), trg.to(device), trg_cut.to(device)
        # output, _ = model(img, trg)
        
        with torch.no_grad():
            enc_src = enc(img)
            # print("img: ", img.shape)
            # print("trg: ", trg[:,:-1].shape)
            # output, _ = model(img, trg[:,:-1])
            output, attention = model(img, enc_src, trg_cut)
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            # trg = trg.contiguous().view(-1)
            ans = torch.roll(trg, -1)
            ans[:, -1] = 0
            ans = ans.view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, ans)
            # loss = get_loss(output, ans)
            # print(f'| val Loss: {loss:.3f}')            
            epoch_loss += loss.item()
        
    return epoch_loss / (i+1)

In [10]:
OUTPUT_DIM = 18022
HID_DIM = 768
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
TRG_PAD_IDX = 0
LEARNING_RATE = 1e-4
PAD = 0
BOS = 2
EOS = 3

# enc, _ = clip.load('ViT-L/14@336px')

enc = Encoder()

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)
model = Transformer(dec, TRG_PAD_IDX, device).to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')
# model = model.apply(initialize_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[12,15], gamma=0.9)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX, label_smoothing=0.1)

The model has 44,327,782 trainable parameters


In [33]:
N_EPOCHS = 30
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, enc, train_loader, optimizer, criterion, CLIP)
    scheduler.step()
    valid_loss = evaluate(model, enc, test_loader, criterion)

    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(ckpt_dir, 'ViTLlayer5_.pt'))
        print(f"|save model for {epoch+1}")
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

|save model for 1
Epoch: 01 | Time: 6m 26s
	Train Loss: 4.194 | Train PPL:  66.310
	 Val. Loss: 4.131 |  Val. PPL:  62.216
|save model for 2
Epoch: 02 | Time: 6m 27s
	Train Loss: 4.058 | Train PPL:  57.872
	 Val. Loss: 4.092 |  Val. PPL:  59.874
|save model for 3
Epoch: 03 | Time: 6m 26s
	Train Loss: 3.950 | Train PPL:  51.911
	 Val. Loss: 3.974 |  Val. PPL:  53.170
|save model for 4
Epoch: 04 | Time: 6m 30s
	Train Loss: 3.866 | Train PPL:  47.767
	 Val. Loss: 3.937 |  Val. PPL:  51.255
|save model for 5
Epoch: 05 | Time: 6m 27s
	Train Loss: 3.789 | Train PPL:  44.208
	 Val. Loss: 3.885 |  Val. PPL:  48.650
|save model for 6
Epoch: 06 | Time: 6m 27s
	Train Loss: 3.729 | Train PPL:  41.641
	 Val. Loss: 3.864 |  Val. PPL:  47.667
|save model for 7
Epoch: 07 | Time: 6m 27s
	Train Loss: 3.665 | Train PPL:  39.048
	 Val. Loss: 3.795 |  Val. PPL:  44.488
Epoch: 08 | Time: 6m 27s
	Train Loss: 3.626 | Train PPL:  37.578
	 Val. Loss: 3.804 |  Val. PPL:  44.867
|save model for 9
Epoch: 09 | Time

In [11]:
class valdataset(Dataset):
    def __init__(self, inputPath,  transform=None):
        self.inputPath = inputPath
        self.transform = transform
        self.inputName = sorted(os.listdir(inputPath))        
        
    def __getitem__(self, index):
        img = Image.open(os.path.join(self.inputPath, self.inputName[index]))
        if self.transform:
            img = self.transform(img)
        
        return img, self.inputName[index].replace(".jpg", "")

    def __len__(self):
        return len(self.inputName)

def imshow(img):
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
# show images

In [12]:
inference_dataset = valdataset(valid_dir, transform=preprocess)
print('# images in valid:', len(inference_dataset))

inference_loader = DataLoader(inference_dataset, batch_size=1, shuffle=False, num_workers=0)

dataiter = iter(inference_loader)
images,  filename = dataiter.next()
print(filename[0])
# imshow(torchvision.utils.make_grid(images))

# images in valid: 1789
000000000368


In [13]:
def inference(enc, testmodel, inference_loader):
    pred = {}
    BOS = 2
    EOS = 3
    max_len = 60
    testmodel.eval()
    
    for i, (img, filename) in enumerate(inference_loader):
        img = img.to(device)
        batch_size = img.shape[0]
        
        trg_indexes = torch.zeros((batch_size, max_len), dtype=torch.int32)
        trg_indexes[:, 0] = BOS
        # print(trg_indexes)
        trg_indexes = trg_indexes.to(device)
        
        with torch.no_grad():
            enc_src = enc(img).float().to(device)
                               
            for index in range(1, max_len):
                # trg_mask = testmodel.make_trg_mask(trg_indexes)
                # out, attention = testmodel.decoder(trg_indexes, enc_src, trg_mask)
                out, attention = testmodel(img, enc_src, trg_indexes)
                
                pred_token = out.argmax(2)[:,-1].item()
                # print("pred: ",pred_token)
                trg_indexes[:,index] = pred_token
                # print(pred_token[:,index])
                if pred_token == EOS:
                    break
                # print("trg: ", trg_indexes)
            trg_indexes = trg_indexes.cpu().numpy().squeeze().tolist()
        caption = tokenizer.decode(trg_indexes)
        print(caption)
        pred[filename[0]] = caption
    return pred    

def inference2(enc, testmodel, inference_loader):
    dict = {}
    testmodel.eval()
    max_len = 70
    with torch.no_grad():
        for img, filename in inference_loader:
            img = img.to(device)
            trg_indexes = [BOS]
            hasEOS = False
            with torch.no_grad():
                enc_src = enc(img).float().to(device)
                
                for i in range(max_len):
                    trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
                    out, attention = testmodel(img, enc_src, trg_tensor)
                    
                    pred_token = out.argmax(2)[:,-1].item()
                    trg_indexes.append(pred_token)
                    if(pred_token == EOS):
                        hasEOS = True
                        break
            if hasEOS:
                caption = tokenizer.decode(trg_indexes[:trg_indexes.index(EOS)])
            else:
                caption = tokenizer.decode(trg_indexes[:max_len])
            # print(caption)
            dict[filename[0]] = caption
    return dict


In [15]:
# load model
enc = Encoder()

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)
testmodel = Transformer(dec, TRG_PAD_IDX, device).to(device)

print(f'The model has {count_parameters(testmodel):,} trainable parameters')
# model = model.apply(initialize_weights)
optimizer = torch.optim.Adam(testmodel.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX, label_smoothing=0.05)
testmodel.load_state_dict(torch.load(os.path.join(ckpt_dir, 'ViTLlayer3_27.pt')))

The model has 44,327,782 trainable parameters


<All keys matched successfully>

In [16]:
testlayer3 = inference2(enc, testmodel, inference_loader)
with open("./p2/testlayer3.json", "w") as f:
    json.dump(testlayer3, f, indent=4)

In [17]:
# learing rate 1e-4, multischeduler, layer3
!python3 p2_evaluate.py --pred_file "./p2/testlayer3.json" --images_root "/data/dlcv/hw3/hw3_data/p2_data/images/val"  --annotation_file "/data/dlcv/hw3/hw3_data/p2_data/val.json"

PTBTokenizer tokenized 110968 tokens at 739948.03 tokens per second.
PTBTokenizer tokenized 22239 tokens at 253832.75 tokens per second.
CIDEr: 0.8681001566774383 | CLIPScore: 0.7104499010882476


In [12]:
test = inference2(enc, testmodel, inference_loader)
with open("./p2/test.json", "w") as f:
    json.dump(test, f, indent=4)

In [43]:
# learning rate 3e-5, layer6
!python3 p2_evaluate.py --pred_file "./p2/dict2.json" --images_root "/data/dlcv/hw3/hw3_data/p2_data/images/val"  --annotation_file "/data/dlcv/hw3/hw3_data/p2_data/val.json"

PTBTokenizer tokenized 110968 tokens at 731053.32 tokens per second.
PTBTokenizer tokenized 24192 tokens at 269959.78 tokens per second.
CIDEr: 0.6663138535774273 | CLIPScore: 0.6692470765900119


In [32]:
dict2

{'000000000368': 'two young children playing soccer on a field .',
 '000000000620': 'a pizza sitting on a table with a fork .',
 '000000001548': 'a man in a white shirt is standing on a surfboard .',
 '000000001999': 'a cat sitting on top of a bed .',
 '000000002982': 'a train traveling down a track .',
 '000000003461': 'a man in a snow covered slope and a snow covered slope .',
 '000000003771': 'a brown cow sitting on a field with a brown cow in the background .',
 '000000003999': 'a cat laying on a couch in a chair .',
 '000000004956': 'a elephant standing in front of a fence .',
 '000000005418': 'two giraffes are standing in the grass near a fence .',
 '000000005434': 'a dog is sitting on a wooden bench .',
 '000000005757': 'a red bus is parked on the side of a bus .',
 '000000005811': 'a large red double decker bus is parked on a city street .',
 '000000006393': 'a woman with a red hair and a white shirt is sitting on a table .',
 '000000006789': 'a train traveling down a train tra

In [41]:
dict

{'000000165547': 'a bathroom with a toilet and a sink',
 '000000518586': 'a man is standing on a bench next to a tree .',
 '000000249720': 'a man in a white shirt and white shirt is standing in front of a tree .',
 '000000122934': 'a man is standing on a bench in the air .',
 '000000393258': 'a man in a blue shirt and white shirt is sitting on a bench .',
 '000000266041': 'a man in a white shirt and white shirt is sitting on a bed .',
 '000000053015': 'a woman in a white shirt is sitting on a table .',
 '000000039540': 'a large kitchen with a table and a table .',
 '000000320039': 'a woman in a white shirt and white shirt is sitting on a table .',
 '000000014941': 'a woman is sitting on a table with a dog .',
 '000000304355': 'a cat sitting on a bed with a white toilet .',
 '000000096514': 'a man is sitting on a bench in the middle of a room .',
 '000000578703': 'a man is riding a skateboard down a street .',
 '000000350966': 'a train is parked on a street with a train .',
 '0000002012

In [36]:
predict = test(testmodel, valid_loader)

In [33]:
print(len(predict))

1789


In [54]:
with open("./p2/dict.json", "w") as f:
    json.dump(dict, f, indent=4)

with open("./p2/dict2.json", "w") as f:
    json.dump(dict2, f, indent=4)

In [129]:
for i in range(30):
    test = out.max(2, keepdim=True)[1][i]
    # print(out.max(2, keepdim=True)[1])
    test = test.cpu().numpy().squeeze().tolist()
    # print(test)
    test = test[:test.index(3)+1]
    print(tokenizer.decode(test))

table situated in corner of room with a vase for a center piece
a black and white photo of a train driving .
a woman is wearing a pink helmet and riding her bike through the city .
a group of people driving a horse drawn carriage .
a bunch of travel bags sit on a carpet floor
a person with a tattoo holding a basketball
a baby grabs for a bite of pizza that a man is eating .
a full veggie pizza near a couple of plate of fries , are all ready to be eaten .
a close up of a child eating food
a small child is holding a stuffed bear
a white toilet sitting in a bathroom next to a tub .
a bird is standing on a shallow body of water .
four ducks are in a grassy island of a parking lot with their heads down .
a subway train pulling into the train station .
a couple of people sitting inside of a car .
three zebras and two other animals grazing .
a group of men play a game of tennis together on a grass tennis court .
two men standing in a store aisle with one holding a baseball bat .
a man in a na