In [1]:
import torch
import numpy as np
from torchvision import transforms, datasets, models, transforms
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib  inline
import random
from tqdm import tqdm
import os
import pandas as pd
from PIL import Image
import cv2 as cv
from google.colab.patches import cv2_imshow # for image display in google colab


from google.colab import drive
drive.mount('/content/drive')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

!pip install torchinfo

# setting seed
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

!pip install -U nltk
import nltk
nltk.download('wordnet')

!pip install torchinfo

Mounted at /content/drive
Collecting torchinfo
  Downloading https://files.pythonhosted.org/packages/00/ac/a33b67c628df213260fb0c39590dc68291382d1f66bfb159e4af27f25ca7/torchinfo-0.1.1-py3-none-any.whl
Installing collected packages: torchinfo
Successfully installed torchinfo-0.1.1
Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/5e/37/9532ddd4b1bbb619333d5708aaad9bf1742f051a664c3c6fa6632a105fd8/nltk-3.6.2-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.6MB/s 
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.




Normalizing caption text and arranging train, val and test captions

In [2]:
image_folder_path = "/content/drive/MyDrive/VR final project/Flickr8K/Flicker8k_Images/"
image_text_folder_path = "/content/drive/MyDrive/VR final project/Flickr8K/Flickr8k_text/"

import re
import string
def normalize_text(text):
  text = text.lower()

  # Remove punctuations
  exclude = set(string.punctuation)
  text = "".join(ch for ch in text if ch not in exclude)

  
  regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
  text = re.sub(regex, " ", text)

  # Remove extra white space
  text = " ".join(text.split())
  return text


def sep_image_name(x):
  y = str(x)
  return y[:-2]

def create_label_set(filename_lst, file_to_token_dict):
  label_lst = []
  for file in filename_lst:
    lst = file_to_token_dict[file]
    transformed_lst = []
    for i, sentence in enumerate(lst):
      transformed_lst.append("<sos> " + normalize_text(sentence) + " <eos>")
    label_lst.append(transformed_lst)
  return label_lst 

def create_table(image_folder_path, image_text_folder_path, token_type='Lemmatised'):
  # image and raw token caption
  image_rawToken_caption = pd.read_csv(os.path.join(image_text_folder_path, 'Flickr8k.token.txt'), sep="\t", header = None, names = ["image", "caption"])
  # image and lemmatized token caption
  image_lemmatize_caption = pd.read_csv(os.path.join(image_text_folder_path, 'Flickr8k.lemma.token.txt'), sep="\t", header = None, names = ["image", "caption"])

  train_image_list = pd.read_csv(os.path.join(image_text_folder_path, 'Flickr_8k.trainImages.txt'), sep='\t', header=None, names=['image'])
  val_image_list = pd.read_csv(os.path.join(image_text_folder_path, 'Flickr_8k.valImages.txt'), sep='\t', header=None, names=['image'])
  test_image_list = pd.read_csv(os.path.join(image_text_folder_path, 'Flickr_8k.testImages.txt'), sep='\t', header=None, names=['image'])

  image_rawToken_caption['imageName'] = image_rawToken_caption['image'].apply(sep_image_name)
  image_with_rawTokens = image_rawToken_caption.groupby('imageName')['caption'].agg(list)
  image_filename_to_rawtaglst = image_with_rawTokens.to_dict()

  image_lemmatize_caption['imageName'] = image_lemmatize_caption['image'].apply(sep_image_name)
  image_with_lemmTokens = image_lemmatize_caption.groupby('imageName')['caption'].agg(list)
  image_filename_to_lemmtaglst = image_with_lemmTokens.to_dict()

  train_item_lst = train_image_list['image'].to_list()
  val_item_lst = val_image_list['image'].to_list()
  test_item_lst = test_image_list['image'].to_list()

  if (token_type == 'Lemmatised'):
    train_tags_lst = create_label_set(train_item_lst, image_filename_to_lemmtaglst)
    val_tags_lst = create_label_set(val_item_lst, image_filename_to_lemmtaglst)
    test_tags_lst = create_label_set(test_item_lst, image_filename_to_lemmtaglst)
  else :
    train_tags_lst = create_label_set(train_item_lst, image_filename_to_rawtaglst)
    val_tags_lst = create_label_set(val_item_lst, image_filename_to_rawtaglst)
    test_tags_lst = create_label_set(test_item_lst, image_filename_to_rawtaglst)

  return train_tags_lst, val_tags_lst, test_tags_lst


lst_labels = create_table(image_folder_path, image_text_folder_path, token_type='non-Lemmatised')
#lst_labels[0] -> train labels
# lst_labels[1] -> val labels
# lst_labels[2] -> test labels

Loading the train, validation and test tensor that was preprocessed and saved in drive

In [3]:
!cp -r  /content/drive/MyDrive/VR" "final" "project/TrainLoader.pth /content/
!cp -r /content/drive/MyDrive/VR" "final" "project/TestLoader.pth /content/
!cp -r /content/drive/MyDrive/VR" "final" "project/ValLoader.pth  /content/

In [5]:
TrainLoader = torch.load('TrainLoader.pth')

In [6]:
ValLoader = torch.load('ValLoader.pth')

In [7]:
TestLoader = torch.load('TestLoader.pth')

Creating Vocabulary from the given captions

In [8]:
# preparing data for LSTM

import itertools

class Vocab():
  def __init__(self):
    self.word_to_index = {}
    self.index_to_word = {}
    self.n_words = 0
    self.word_set = {}

  def buildVocab(self, label_lst):
    label_lst = list(itertools.chain.from_iterable(label_lst[0])) # building vocab from train data
    big_str = " ".join(label_lst)
    big_str = big_str.lower() # lower case
    self.word_set = set(big_str.split(" "))
    self.word_set.add("<UNK>")
    self.word_set.add("<PAD>")
    self.word_to_index = {word:index for index, word in enumerate(self.word_set)}
    self.index_to_word = {index:word for index, word in enumerate(self.word_set)}
    self.n_words = len(self.word_set)

  def transformLabelToToken(self, label_lst):
    transformed_lst = []
    for lst in label_lst:
      new_lst = []
      for lst_of_sent in lst:
        new_lst_of_sent = []
        for sentence in lst_of_sent:
          normalize_text(sentence)
          transformed_sent = list(map(lambda x: self.word_to_index[x] if x in self.word_to_index else self.word_to_index["<UNK>"], sentence.lower().split(" ")))
          new_lst_of_sent.append(transformed_sent)
        new_lst.append(new_lst_of_sent)
      transformed_lst.append(new_lst)
    return transformed_lst  

  def padTokens(self, token_lst, MAX_LEN):
    for i in range(len(token_lst)):
      for j in range(len(token_lst[i])):
        for k in range(len(token_lst[i][j])):
          if (len(token_lst[i][j][k]) < MAX_LEN):
            for x in range(MAX_LEN - len(token_lst[i][j][k])):
              token_lst[i][j][k].append(self.word_to_index["<PAD>"])

    return token_lst 


In [23]:
vggNet = models.vgg16(pretrained=True)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




In [9]:
resnet = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


HBox(children=(FloatProgress(value=0.0, max=46827520.0), HTML(value='')))




Model architecture code

In [10]:
class EncoderCNN(nn.Module):
  def __init__(self, pretrained_model):
    """
    Using pretrained CNN architecture - try using VGGnet as given in the paper: 
    Fine tuning on the upper layers also possible - try if there is time
    """
    super(EncoderCNN, self).__init__()

    model_layers = list(pretrained_model.children())[:-2] 
    self.pretrained_model = nn.Sequential(*model_layers)

    # fine-tuning is also possible 
    for param in self.pretrained_model.parameters():
      param.requires_grad = False    

  def forward(self, img_input):
    output = self.pretrained_model(img_input)  # (batch_size, img_depth, img_height, img_width)
    output = output.permute(0, 2, 3, 1).contiguous()  #(batch_size, img_height, img_width, img_depth)
    return output


class DecoderRNN(nn.Module):
  def __init__(self, embed_size, vocab_size, encoder_size, hidden_size, dropout_prob):  
    """
    embed_size : embedding size (output of embedding layer)
    vocab_size: size of vocabulary
    encoder_size: size of encoder output feature i.e image_depth 
    decoder_size: size of decoder hidden feature i.e hidden size of lstm
    """
    super(DecoderRNN, self).__init__()
    self.embed_size = embed_size
    self.vocab_size = vocab_size
    self.encoder_size = encoder_size
    self.hidden_size = hidden_size  
    
    self.embedding = nn.Embedding(vocab_size, embed_size)
    
    # lstm_cell. For multilayers, define multiple such 
    self.lstm_cell = nn.LSTMCell(embed_size, hidden_size, bias=True)

    self.dropout = nn.Dropout(p = dropout_prob)
    self.linear = nn.Linear(hidden_size, vocab_size)  # final layer to give logits on vocabulary

    self.init_h = nn.Linear(encoder_size, hidden_size) # initialization of initial hidden state vector
    self.init_c = nn.Linear(encoder_size, hidden_size) # initialization of initial cell state vector


  def init_hidden(self, encoder_out):
    mean_encoder_out = encoder_out.mean(dim=1) # (batch_size, num_pixels, encoder_size) -> (batch_size, encoder_size)
    init_hidden_state = self.init_h(mean_encoder_out)
    init_cell_state = self.init_c(mean_encoder_out)
    return init_hidden_state, init_cell_state

  def forward(self, encoder_out, captions, MAX_LEN):
    """
    Uses teacher forcing technique
    """
    # encoder_out shape: (batch_size, image_height, image_width, image_depth)
    batch_size = encoder_out.size(0)
    encoder_size = encoder_out.size(-1)
    encoder_out = encoder_out.view(batch_size, -1, encoder_size) # now the shape: (batch_size, num_pixels, encoder_size)
    
    
    embeddings = self.embedding(captions[:, :-1])      # embeddings shape: (batch_size, seq_len->MAX_LEN-1, embed_size)
    hidden_state, cell_state = self.init_hidden(encoder_out)   # hidden_state shape: (batch_size, hidden_size)

    out_logits = torch.zeros(batch_size, MAX_LEN-1, self.vocab_size).to(device)   # (batch_size, seq_len->MAX_LEN-1, vocab_size)

    for t in range(MAX_LEN-1):
      hidden_state, cell_state = self.lstm_cell(embeddings[:, t, :], (hidden_state, cell_state))
      output= self.linear(self.dropout(hidden_state))
      out_logits[:, t, :] = output

    return out_logits


  def mix_technique(self, encoder_out, captions, MAX_LEN):
    """
    Uses teacher_forcing for the first 10 time steps and then onwards non-teacher forcing
    """

    batch_size = encoder_out.size(0)
    encoder_size = encoder_out.size(-1)
    encoder_out = encoder_out.view(batch_size, -1, encoder_size) # now the shape: (batch_size, num_pixels, encoder_size)
    

    embeddings = self.embedding(captions[:, :, -1]) 
    hidden_state, cell_state = self.init_hidden(encoder_out)

    out_logits = torch.zeros(batch_size, MAX_LEN-1, self.vocab_size).to(device)

    for t in range(MAX_LEN-1):
      if (t<10): # change t for controlling number of steps in teacher-forcing
        input = embeddings[:, t, :]
      else:
        input = prev_embedding
      hidden_state, cell_state = self.lstm_cell(input, (hidden_state, cell_state))
      output = self.linear(self.dropout(hidden_state))
      out_logits[:, t, :] = output
      output = nn.LogSoftmax(dim=1)(output)
      output = output.argmax(1)  # output shape: (batch_size)
      prev_embedding = self.embedding(output.unsqueeze(1))  # no teacher forcing
    
    return out_logits
    

  def generateCaptions(self, encoder_out, VocabObj, MAX_LEN):
    """
    Doesn't use teacher forcing technique -> used while decoding and testing
    Used for batch_size = 1
    """
    batch_size = encoder_out.size(0)
    encoder_size = encoder_out.size(-1)
    encoder_out = encoder_out.view(batch_size, -1, encoder_size)

    # embedding for '<sos>'
    sos_tensor = VocabObj.word_to_index["<sos>"]
    sos_tensor = torch.tensor([[sos_tensor] for l in range(batch_size)], dtype=int)  # sos_tensor shape: (batch_size, 1)
    sos_tensor = sos_tensor.to(device)
    embedding = self.embedding(sos_tensor) # initial input embedding

    hidden_state, cell_state = self.init_hidden(encoder_out)

    out_logits = torch.zeros(batch_size, MAX_LEN-1, self.vocab_size).to(device)
    
    generated_captions = ["<sos>"]

    with torch.no_grad():
      for t in range(MAX_LEN-1):
        hidden_state, cell_state = self.lstm_cell(embedding.squeeze(1), (hidden_state, cell_state))
        output= self.linear(hidden_state)  # output shape: (batch_size, vocab_size)
        out_logits[:, t, :] = output
        output = nn.LogSoftmax(dim=1)(output)
        output = output.argmax(1)  # output shape: (batch_size)
        embedding = self.embedding(output.unsqueeze(1))  # no teacher forcing

        generated_captions.append(VocabObj.index_to_word[output.item()])
        if (VocabObj.index_to_word[output.item()] == "<eos>"):
          break

    #print(f"output: {output}") 
    return generated_captions


class ImageCaptionModel(nn.Module):
  def __init__(self, pretrained_model, embed_size, vocab_size, encoder_size, hidden_size, dropout_prob):
    super(ImageCaptionModel, self).__init__()
    self.encoder = EncoderCNN(pretrained_model)
    self.decoder = DecoderRNN(embed_size, vocab_size, encoder_size, hidden_size, dropout_prob)
  
  def forward(self, img_input, captions, MAX_LEN):  # remove arg MAX_LEN while summary
    return self.decoder(self.encoder(img_input), captions, MAX_LEN) # put MAX_LEN = 163 while summary

  def mix_technique(self, img_input, captions, MAX_LEN):
    return self.decoder(self.encoder(img_input), captions, MAX_LEN)

  def generateCaptions(self, img_input, VocabObj, MAX_LEN):
    return self.decoder.generateCaptions(self.encoder(img_input), VocabObj, MAX_LEN)




Visualizing model summary and memory details

In [None]:
# EMBED_SIZE = 256 
# ENCODER_SIZE = 512  
# HIDDEN_SIZE = 512  
# VOCAB_SIZE = 7631
# LSTM_DROPOUT_PROB = 0.5 
# NUM_EPOCHS = 20 
# LEARNING_RATE = 5e-4  
# model = ImageCaptionModel(resnet, EMBED_SIZE, VOCAB_SIZE, ENCODER_SIZE, HIDDEN_SIZE, LSTM_DROPOUT_PROB)
# model = model.to(device)

# from torchinfo import summary

# summary(model, [(32, 3, 256, 256), (32, 163)], dtypes=[torch.float, torch.int])

Training function

In [11]:
def train(model, TrainLoader, TrainCaptionLoader, ValLoader, ValCaptionLoader, num_epochs, learning_rate, VocabObj, MAX_LEN, teacher_forcing=True):
  model = model.to(device)
  criterion = nn.CrossEntropyLoss(ignore_index = VocabObj.word_to_index["<PAD>"])
  optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

  train_loss_lst = []
  val_loss_lst = []

  for epoch in range(num_epochs):
    model.train()
    caption_loader = iter(TrainCaptionLoader)
    train_loss = 0
    for idx, img in tqdm(enumerate(TrainLoader), total=len(TrainLoader), position=0, leave=True):
      img = img.to(device)
      caption = next(caption_loader)
      caption = caption.to(device)

      batch_size = img.size(0)
      if (teacher_forcing):
        output_logits = model(img, caption, MAX_LEN)
      else:
        output_logits = model.mix_technique(img, caption, MAX_LEN)

      optimizer.zero_grad()
      loss = criterion(output_logits.permute(0, 2, 1).contiguous(), caption[:, 1:])
      loss.backward()
      optimizer.step()

    train_loss_lst.append(loss/batch_size)

    # validation loop, if training is taking too much time replace epoch==num_epochs-1 otherwise epoch <= num_epochs-1
    if (epoch == num_epochs-1): 
      model.eval()
      with torch.no_grad():
        val_caption_loader = iter(ValCaptionLoader)
        val_loss = float('inf')
        for idx, val_img in tqdm(enumerate(ValLoader), total=len(ValLoader), position=0, leave=True):
          val_img = val_img.to(device)
          batch_size = val_img.size(0)
          val_caption = next(val_caption_loader)
          for y in range(5):  
            val_cap = val_caption[y].to(device)
            val_cap = val_cap.unsqueeze(0)

            if (teacher_forcing):
              val_out_logits = model(val_img, val_cap, MAX_LEN)
            else:
              val_out_logits = model.mix_technique(val_img, val_cap, MAX_LEN)

            tmp_val_loss = criterion(val_out_logits.permute(0, 2, 1).contiguous(), val_cap[:, 1:])

            val_loss = min(val_loss, tmp_val_loss)

        val_loss_lst.append(val_loss/batch_size) 
    else:
      val_loss_lst.append(torch.zeros(1))

    print(f"Epoch: {epoch+1}/{num_epochs}, train loss: {train_loss_lst[epoch].item(): .4f}, validation loss: {val_loss_lst[epoch].item(): .4f}")

  return train_loss_lst, val_loss_lst


def showPlot(train_loss_lst, val_loss_lst):
  plt.figure(figsize=(8, 8))
  plt.plot(train_loss_lst)
  plt.plot(val_loss_lst)
  plt.title('loss per epoch')
  plt.xlabel('epoch')
  plt.ylabel('loss')
  plt.legend(['Training loss', 'Validation loss'])
  plt.savefig('Plot_of_loss_per_epoch.jpg')

Evaluation function

In [12]:
# outputing captions -> Greedy method
def outputCaptions(model, Loader, VocabObj, MAX_LEN):
  # Loader can be ValLoader or TestLoader (batch_size is assumed to be 1 in this case)
  model.eval()
  output_captions = []
  with torch.no_grad():
    for idx, img in enumerate(Loader):
      img = img.to(device)
      caption_out = model.generateCaptions(img, VocabObj, MAX_LEN)
      output_captions.append(" ".join(caption_out))
  return output_captions


# BLEU eval metric
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.meteor_score import meteor_score
def BLEU(reference_doc, sentence_list):
  # reference_doc 3-dm list
  # sentence_list is list of lists of sentence tokens 2-dm list
  bleuScore1 = corpus_bleu(reference_doc, sentence_list, weights=(1, 0, 0, 0))
  bleuScore2 = corpus_bleu(reference_doc, sentence_list, weights=(0.5, 0.5, 0, 0))
  bleuScore3 = corpus_bleu(reference_doc, sentence_list, weights=(1/3, 1/3, 1/3, 0))
  bleuScore4 = corpus_bleu(reference_doc, sentence_list, weights=(0.25, 0.25, 0.25, 0.25))
  return bleuScore1, bleuScore2, bleuScore3, bleuScore4


# Not a correct implementation of METEOR score; True one is more complex
def METEOR(reference_doc, sentence_list):
  score = 0
  for idx, _ in enumerate(sentence_list):
    score += meteor_score(reference_doc[idx], sentence_list[idx])
  return score/len(sentence_list) # averaging METEOR score

def Eval_score(label_lst, predicted_list):
  # label_lst -> list of lists of 5 sentences esp label_lst[2] #test
  sentence_list = [sent.split(" ") for sent in predicted_list]
  reference_doc = [[k.split(" ") for k in l] for l in label_lst] 
  bleuScore1, bleuScore2, bleuScore3, bleuScore4 = BLEU(reference_doc, sentence_list)
  meteorScore = METEOR(label_lst, predicted_list)
  print(f"BLEU-1 score: {bleuScore1: .5f}, BLEU-2 score: {bleuScore2: .5f}, BLEU-3 score: {bleuScore3: .5f}, BLEU-4 score: {bleuScore4: .5f}\nAverage METEOR score: {meteorScore: .5f}\n")



# evaluation
def evalResults(validation_captions, test_captions, label_lst):
  val_ref = label_lst[1]
  test_ref = label_lst[2]

  print(f"Validation metrics:\n")
  Eval_score(val_ref, validation_captions)

  print(f"\n Test metrics:\n")
  Eval_score(test_ref, test_captions)

In [13]:
# script

BATCH_SIZE = 32 
MAX_LEN = 163 
VocabObj = Vocab()
VocabObj.buildVocab(lst_labels)
token_lst = VocabObj.transformLabelToToken(lst_labels)
token_lst = VocabObj.padTokens(token_lst, MAX_LEN)

def prepareImageLoader(Loader, batch_size):
  lst = []
  for idx, img in enumerate(Loader):
    lst.append(img.squeeze(0)) 

  return DataLoader(dataset=lst, batch_size = batch_size, pin_memory=True)

In [14]:
TrainLoader = prepareImageLoader(TrainLoader, BATCH_SIZE)


In [37]:


def take_first_caption(token_lst):
  lst_tokens = []
  for lst in token_lst:
    lst_tokens.append(torch.tensor(lst[0]))
  return lst_tokens

def take_all_captions(token_lst):
  lst_tokens = []
  for lst in token_lst:
    for x in lst:
      lst_tokens.append(torch.tensor(x))
  return lst_tokens

TrainCaptionLoader = DataLoader(dataset = take_first_caption(token_lst[0]), batch_size = BATCH_SIZE)
ValCaptionLoader = DataLoader(dataset = take_all_captions(token_lst[1]), batch_size = 5)




EMBED_SIZE = 256 
ENCODER_SIZE = 512  
HIDDEN_SIZE = 512   
VOCAB_SIZE = VocabObj.n_words
LSTM_DROPOUT_PROB = 0.5 
NUM_EPOCHS = 20 
LEARNING_RATE = 5e-4  

model = ImageCaptionModel(vggNet, EMBED_SIZE, VOCAB_SIZE, ENCODER_SIZE, HIDDEN_SIZE, LSTM_DROPOUT_PROB)
model = model.to(device)


In [38]:
train_loss_lst, val_loss_lst = train(model, TrainLoader, TrainCaptionLoader, ValLoader, ValCaptionLoader, NUM_EPOCHS, LEARNING_RATE, VocabObj, MAX_LEN, teacher_forcing=False)

100%|██████████| 188/188 [02:00<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 1/20, train loss:  0.3212, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 2/20, train loss:  0.2873, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 3/20, train loss:  0.2623, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 4/20, train loss:  0.2322, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 5/20, train loss:  0.2150, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 6/20, train loss:  0.1999, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 7/20, train loss:  0.1796, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 8/20, train loss:  0.1563, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 9/20, train loss:  0.1437, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 10/20, train loss:  0.1303, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 11/20, train loss:  0.1208, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 12/20, train loss:  0.1096, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 13/20, train loss:  0.1000, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 14/20, train loss:  0.0849, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 15/20, train loss:  0.0870, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 16/20, train loss:  0.0768, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 17/20, train loss:  0.0723, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 18/20, train loss:  0.0606, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
  0%|          | 0/188 [00:00<?, ?it/s]

Epoch: 19/20, train loss:  0.0547, validation loss:  0.0000


100%|██████████| 188/188 [01:59<00:00,  1.57it/s]
100%|██████████| 1000/1000 [03:03<00:00,  5.46it/s]


Epoch: 20/20, train loss:  0.0542, validation loss:  0.6222


In [None]:
showPlot(train_loss_lst, val_loss_lst) # omitted since we validating only at the end of training

In [39]:
validation_captions = outputCaptions(model, ValLoader, VocabObj, MAX_LEN)
test_captions = outputCaptions(model, TestLoader, VocabObj, MAX_LEN)

In [40]:
validation_captions

['<sos> boy in black shirt is sitting on bench with his arms crossed <eos>',
 '<sos> boy in red shirt is standing on rock next to building <eos>',
 '<sos> boy holds ball while another dog looks at it <eos>',
 '<sos> black dog is jumping over log in air <eos>',
 '<sos> man in red shorts is climbing rock face <eos>',
 '<sos> child in bathtub with water coming out of water <eos>',
 '<sos> group of people are walking in front of bridge in city <eos>',
 '<sos> boy and girl are playing in field <eos>',
 '<sos> boy in red shirt is standing on top of cliff in front of building <eos>',
 '<sos> man in red jacket and hat sits on bench <eos>',
 '<sos> man in black wetsuit is surfing <eos>',
 '<sos> baseball player hitting ball <eos>',
 '<sos> black dog in water <eos>',
 '<sos> boy in red shirt is jumping into pool <eos>',
 '<sos> man is standing on top of cliff taking picture of people <eos>',
 '<sos> boy in swimsuit is jumping into pool <eos>',
 '<sos> man in blue shirt is smiling with his arms c

In [41]:
evalResults(validation_captions, test_captions, lst_labels)

Validation metrics:

BLEU-1 score:  0.52499, BLEU-2 score:  0.30646, BLEU-3 score:  0.19464, BLEU-4 score:  0.12138
Average METEOR score:  0.37981


 Test metrics:

BLEU-1 score:  0.53386, BLEU-2 score:  0.31465, BLEU-3 score:  0.19630, BLEU-4 score:  0.12096
Average METEOR score:  0.39307



Loading Subjective image tensor data from the drive

In [34]:
subject_img_folder = "/content/drive/MyDrive/VR final project/subjective_img_loader.pth"

subjective_img_loader = torch.load(subject_img_folder)

Testing the model on subjective images

In [35]:
subjective_captions = outputCaptions(model, subjective_img_loader, VocabObj, MAX_LEN)

In [36]:
subjective_captions

['<sos> boy in black shirt is doing skateboard trick <eos>',
 '<sos> group of people ride bikes on street <eos>',
 '<sos> crowd of people are standing in front of building <eos>',
 '<sos> group of people are walking over large white and white dog <eos>',
 '<sos> dog jumps into air to catch tennis ball <eos>']