# Pip Installs

In [None]:
!pip install efficientnet_pytorch
!pip install spacy
from efficientnet_pytorch import EfficientNet

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16446 sha256=5a8c9a8ae9d2f44511b197204fd4f2289b71fdc8694add5f9d369b0839d8dfea
  Stored in directory: /root/.cache/pip/wheels/0e/cc/b2/49e74588263573ff778da58cc99b9c6349b496636a7e165be6
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [None]:
!pip install bcolz
import bcolz

Collecting bcolz
  Downloading bcolz-1.2.1.tar.gz (1.5 MB)
[?25l[K     |▎                               | 10 kB 25.0 MB/s eta 0:00:01[K     |▌                               | 20 kB 27.5 MB/s eta 0:00:01[K     |▊                               | 30 kB 18.2 MB/s eta 0:00:01[K     |█                               | 40 kB 16.8 MB/s eta 0:00:01[K     |█▏                              | 51 kB 7.1 MB/s eta 0:00:01[K     |█▍                              | 61 kB 8.3 MB/s eta 0:00:01[K     |█▋                              | 71 kB 9.2 MB/s eta 0:00:01[K     |█▉                              | 81 kB 10.1 MB/s eta 0:00:01[K     |██                              | 92 kB 11.1 MB/s eta 0:00:01[K     |██▎                             | 102 kB 9.8 MB/s eta 0:00:01[K     |██▌                             | 112 kB 9.8 MB/s eta 0:00:01[K     |██▊                             | 122 kB 9.8 MB/s eta 0:00:01[K     |███                             | 133 kB 9.8 MB/s eta 0:00:01[K     |███▏ 

In [None]:
!pip install einops

Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


# Imports

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
import pickle
from torchvision.io import read_image
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import shutil
import os
import copy
import torchvision.transforms as T
from torchsummary import summary
from torchvision.models import efficientnet_b4,efficientnet_b0
import random
import pandas as pd
import json
from efficientnet_pytorch import EfficientNet
import spacy
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from spacy.attrs import ORTH
import cv2 as cv 
from google.colab.patches import cv2_imshow # for image display
from einops import rearrange
import math


In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# referenced by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Flickr Dataset

In [None]:
!unzip '/content/drive/MyDrive/GroupProjectNLP/flickr8k.zip'

## Preprocess the images, captions and create a data generator

In [None]:
#Data loader for training: will return only one sentence per image
class FlickrDataset(Dataset):
    """Captioning dataset that yields one (image, caption) pair per annotation row.

    The annotation CSV must provide an ``image`` column (file name relative to
    ``root``) and a ``caption`` column (raw caption text). A ``Vocabulary`` is
    built from every caption in the file when the dataset is constructed.
    """

    def __init__(self, root, ann_file, img_transform=None, txt_transform=None):
        """
        Args:
            root: directory that contains the image files.
            ann_file: path of the annotation CSV.
            img_transform: optional callable applied to the loaded PIL image.
            txt_transform: optional callable applied to the raw caption string
                (before the <sos>/<eos> markers are added).
        """
        df = pd.read_csv(ann_file)
        self.root = root
        self.img_transform = img_transform
        self.txt_transform = txt_transform
        self.img_ids = df['image']
        self.captions = df['caption']
        self.vocab = Vocabulary()
        self.vocab.build_vocab(self.captions.tolist())

    def __len__(self):
        """Number of (image, caption) rows in the annotation file."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return ``(image, caption_indices)`` for row ``idx``.

        The caption is wrapped in ``<sos>`` / ``<eos>`` and numericalised with the
        dataset vocabulary; it is returned as a float ``torch.Tensor`` of indices.
        """
        # Force three channels: a grayscale / palette / RGBA file would otherwise
        # yield a tensor whose channel count differs from the rest of the batch.
        # The context manager closes the underlying file as soon as it is decoded.
        with Image.open(os.path.join(self.root, self.img_ids[idx])) as raw_img:
            img = raw_img.convert('RGB')
        img = self.img_transform(img) if self.img_transform is not None else img
        caption = self.captions[idx]
        caption = self.txt_transform(caption) if self.txt_transform is not None else caption
        caption = self.vocab.numericalize('<sos> ' + caption + ' <eos>')
        return img, torch.Tensor(caption)


In [None]:
#Data loader for validation/testing: will return 5 sentences per image
class FlickrDatasetVal(Dataset):
    """Evaluation dataset that yields one image together with all of its captions.

    The annotation CSV must provide an ``image`` column and a ``captions`` column
    in which the reference captions of an image are joined with ``'|'``. A
    ``Vocabulary`` is built from the ``captions`` column at construction time.
    """

    def __init__(self, root, ann_file, img_transform=None, txt_transform=None):
        """
        Args:
            root: directory that contains the image files.
            ann_file: path of the aggregated annotation CSV.
            img_transform: optional callable applied to the loaded PIL image.
            txt_transform: optional callable applied to the *list* of captions
                (after the <sos>/<eos> markers have been added).
        """
        df = pd.read_csv(ann_file)
        self.root = root
        self.img_transform = img_transform
        self.txt_transform = txt_transform
        self.img_ids = df['image']
        self.captions = df['captions']
        self.vocab = Vocabulary()
        self.vocab.build_vocab(self.captions.tolist())

    def __len__(self):
        """Number of images (rows) in the annotation file."""
        return len(self.img_ids)

    def collate_captions(self, captions):
        """Stack variable-length index lists into one zero-padded long tensor.

        Returns a tensor of shape ``(len(captions), longest_caption)``.
        """
        lengths = [len(cap) for cap in captions]
        targets = torch.zeros(len(captions), max(lengths)).long()
        for i, cap in enumerate([torch.tensor(caption) for caption in captions]):
            end = lengths[i]
            targets[i, :end] = cap[:end]
        return targets

    def __getitem__(self, idx):
        """Return ``(image, captions)`` where ``captions`` is the padded index tensor."""
        # Force three channels so every sample can be stacked into one batch, and
        # close the file handle as soon as the pixels have been decoded.
        with Image.open(os.path.join(self.root, self.img_ids[idx])) as raw_img:
            img = raw_img.convert('RGB')
        img = self.img_transform(img) if self.img_transform is not None else img
        captions = self.captions[idx].split('|')
        captions = ['<sos> ' + caption + ' <eos>' for caption in captions]
        captions = self.txt_transform(captions) if self.txt_transform is not None else captions
        captions = [self.vocab.numericalize(caption) for caption in captions]
        return img, self.collate_captions(captions)

In [None]:
#Preprocessing for both training and validation data
def collate_fn(data):
    """Batch (image, caption) samples for training, longest caption first.

    Args:
        data: list of ``(image_tensor, caption_tensor)`` pairs. The list is
            sorted in place by caption length, descending.

    Returns:
        images: the image tensors stacked along a new leading dimension.
        targets: long tensor ``(batch, longest_caption)``, zero-padded on the right.
        lengths: caption lengths in the (sorted) batch order.
    """
    data.sort(key=lambda sample: len(sample[1]), reverse=True)
    images, captions = zip(*data)

    # Tuple of per-image tensors -> one batched tensor.
    images = torch.stack(images, 0)

    # Tuple of 1D captions -> one zero-padded 2D tensor.
    lengths = [len(caption) for caption in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for row, (caption, n_tokens) in enumerate(zip(captions, lengths)):
        targets[row, :n_tokens] = caption[:n_tokens]

    return images, targets, lengths


def collate_fn_val(data):
    """Batch (image, captions) samples where each image carries several captions.

    Args:
        data: list of ``(image_tensor, captions_tensor)`` pairs; each
            ``captions_tensor`` is a 2D long tensor ``(n_captions, n_tokens)``.

    Returns:
        images: the image tensors stacked along a new leading dimension.
        targets: long tensor ``(batch, max_n_captions, max_length)``, zero-padded
            both along the token axis and (if an image has fewer captions than
            the largest sample in the batch) along the caption axis.
        max_length: the longest token length found in the batch.
    """
    images, captions = zip(*data)
    images = torch.stack(images, 0)
    max_length = max(caps.shape[1] for caps in captions)
    # Size the caption axis from the data instead of assuming exactly 5 captions.
    n_captions = max(caps.shape[0] for caps in captions)
    targets = torch.zeros(len(captions), n_captions, max_length).long()
    for i, caps in enumerate(captions):
        n_rows, n_tokens = caps.shape
        # Copy the whole (n_rows, n_tokens) block. The previous `caps[:end]`
        # sliced the caption axis with a token count, which broke as soon as a
        # sample's captions were shorter than the number of captions.
        targets[i, :n_rows, :n_tokens] = caps
    return images, targets, max_length


In [None]:
#Building the vocabulary for a given Flickr dataset
spacy_eng = spacy.load("en_core_web_sm")
# Register each caption marker as a tokenizer special case that maps the marker
# string to a single token carrying exactly that text.
for _marker in ('<sos>', '<eos>', '<pad>', '<unk>'):
    spacy_eng.tokenizer.add_special_case(_marker, [{ORTH: _marker}])

class Vocabulary:
    """Two-way mapping between lower-cased tokens and integer indices.

    Indices 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``;
    every other token receives the next free index the first time it is seen.
    """

    def __init__(self):
        self.itos = {0:"<pad>",1:"<sos>",2:"<eos>",3:"<unk>"}
        self.stoi = {v:k for k,v in self.itos.items()}

    def __len__(self): return len(self.itos)

    @staticmethod
    def tokenize(text):
        """Split ``text`` with the notebook's spaCy tokenizer and lower-case each token."""
        return [token.text.lower() for token in spacy_eng.tokenizer(text)]

    def build_vocab(self, sentence_list):
        """Add every unseen token of ``sentence_list`` to the vocabulary.

        May be called more than once: new tokens continue after the highest
        index already assigned, so existing entries are never overwritten.
        """
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                if word not in self.stoi:
                    # Indices are contiguous from 0, so the current size is the
                    # next free index (a fixed start of 4 would collide with
                    # tokens added by an earlier call).
                    idx = len(self.itos)
                    self.stoi[word] = idx
                    self.itos[idx] = word

    def numericalize(self,text):
        """Return the index of each token in ``text``; unknown tokens map to ``<unk>``."""
        unk_idx = self.stoi["<unk>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenize(text)]


In [None]:
# Create train, validation and test data loaders for Flickr8k
transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
])


def _make_loader(ds, batch_size, collate, shuffle=False, pin_memory=False):
    """Wrap ``ds`` in a DataLoader using the worker count shared by every split."""
    return DataLoader(
        dataset=ds,
        batch_size=batch_size,
        num_workers=8,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=collate,
    )


# Training split: a single target sentence per image, reshuffled every epoch.
dataset = FlickrDataset(
    root='/content/Images',
    ann_file='/content/drive/MyDrive/GroupProjectNLP/flickr8k_train.csv',
    img_transform=transform,
)
train_loader = _make_loader(dataset, 100, collate_fn, shuffle=True, pin_memory=True)

# Training images again, but with all reference captions per image (for evaluation).
dataset_train_five = FlickrDatasetVal(
    root='/content/Images',
    ann_file='/content/drive/MyDrive/GroupProjectNLP/flickr8k_train_agg.csv',
    img_transform=transform,
)
train_loader_five = _make_loader(dataset_train_five, 100, collate_fn_val, pin_memory=True)

# Validation split.
valset = FlickrDatasetVal(
    root='/content/Images',
    ann_file='/content/drive/MyDrive/GroupProjectNLP/flickr8k_val.csv',
    img_transform=transform,
)
val_loader = _make_loader(valset, 100, collate_fn_val)

# Test split.
testset = FlickrDatasetVal(
    root='/content/Images',
    ann_file='/content/drive/MyDrive/GroupProjectNLP/flickr8k_test.csv',
    img_transform=transform,
)
test_loader = _make_loader(testset, 128, collate_fn_val)


# Every token of the training vocabulary, followed by one extra "<pad>" entry.
flickr_classes = [*dataset.vocab.stoi, "<pad>"]

  cpuset_checked))


# YOLOv5 model

In [None]:
# Download the YOLOv5-small model from torch hub.
# force_reload=True discards any cached copy of the hub repository, so the
# repository is fetched again on every run of this cell.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, force_reload=True)

Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /root/.cache/torch/hub/master.zip
Downloading https://ultralytics.com/assets/Arial.ttf to /root/.config/Ultralytics/Arial.ttf...
[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...
Collecting PyYAML>=5.3.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed PyYAML-6.0

[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-4-6 torch 1.10.0+cu111 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)



Downloading https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt to yolov5s.pt...


  0%|          | 0.00/14.1M [00:00<?, ?B/s]




Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [None]:
# Yolo classes
# `yolo_model.names` was a list in the hub release this notebook was recorded
# with; newer YOLOv5 releases are understood to expose it as an {index: name}
# dict. Because the model is loaded from the moving `master` branch, accept both
# and always end up with a plain list ordered by class index.
_yolo_names = yolo_model.names
if isinstance(_yolo_names, dict):
    yolo_classes = [_yolo_names[class_idx] for class_idx in sorted(_yolo_names)]
else:
    yolo_classes = list(_yolo_names)
# Extra class used to pad images that have fewer detections than the batch maximum.
yolo_classes.append("<pad>")
print(len(yolo_classes))
print(yolo_classes[-5:])

81
['scissors', 'teddy bear', 'hair drier', 'toothbrush', '<pad>']


In [None]:
# Set model parameters (optional): detections scoring below this confidence are dropped.
yolo_model.conf = 0.35 # confidence threshold (0-1)

In [None]:
def transform_yolo(imgs_list):
    '''
    Rescale data-loader images from [0,1] to [0,255] before they go to the YOLOv5 object detector.
    Input: sequence of images (e.g. a batch tensor), each holding normalised [0,1] RGB values
    Output: list with one numpy array per image, values multiplied by 255
    NOTE(review): no axes are moved here, so each array keeps the layout of its input image
    (channel-first tensors stay channel-first) - confirm the detector accepts that layout.
    '''
    return [np.array(img) * 255 for img in imgs_list]

In [None]:
def yolov5_detector(model, imgs_list):
    '''
    Run the object detector on a list of images without tracking gradients.

    Input:
    model = callable detector (the YOLOv5 model loaded from torch hub)
    imgs_list = list of images to pass to the detector in a single call

    Outputs:
    objects = one list per image with the detected class names (repetitions kept)
    boxes = one entry per image holding the first four columns of results.xyxy[i]
            (presumably the x1, y1, x2, y2 box corners of every detection)
    results = the raw detector output, returned unchanged
    '''
    with torch.no_grad():
        results = model(imgs_list)

    # Uncomment the line below if you want to print a summary for each image:
    # results.print()

    # One pandas frame per image; its 'name' column holds the class labels.
    detections = results.pandas().xyxy
    n_images = len(imgs_list)

    objects = [list(detections[i]['name']) for i in range(n_images)]
    boxes = [results.xyxy[i][:, :4] for i in range(n_images)]

    return objects, boxes, results

# Feature extraction

In [None]:
#Load model
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'efficientnet-b4'
# Pretrained EfficientNet; later cells call its extract_features() on image batches.
classifier_model = EfficientNet.from_pretrained(model_name).to(device)
image_size = EfficientNet.get_image_size(model_name)
# Put the network in evaluation mode. As the last expression of the cell this
# also echoes the full module repr into the cell output.
classifier_model.eval()

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b4-6ed6700e.pth


  0%|          | 0.00/74.4M [00:00<?, ?B/s]

Loaded pretrained weights for efficientnet-b4


EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
  )
  (_bn0): BatchNorm2d(48, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        48, 48, kernel_size=(3, 3), stride=[1, 1], groups=48, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(48, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        48, 12, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        12, 48, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        48, 24, kernel_siz

# Image Pre-processing

In [None]:
#Image preprocessing, adapted to Daniel's loader

def preprocessing_with_aug(imgs):
  """Augment and normalise `imgs` for training.

  Steps, in order: gamma adjustment with one value drawn from {0.8, 1.0, 1.2}
  for the whole call, random horizontal flip (p=0.5), random resized crop to
  224x224, then channel-wise normalisation with the mean/std constants below.
  NOTE(review): the transforms are applied to `imgs` as one object, so a batch
  tensor presumably receives the same flip/crop for every image - confirm.
  """
  gamma = np.random.choice([0.8, 1., 1.2])
  gamma_adjusted = T.functional.adjust_gamma(imgs, gamma)

  augment = T.Compose([T.RandomHorizontalFlip(p=0.5),
                       T.RandomResizedCrop((224,224))])
  normalise = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
  return normalise(augment(gamma_adjusted))

def preprocessing_no_aug(imgs):
  """Normalise `imgs` channel-wise with the same mean/std as the training path, without augmentation."""
  normalise = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
  return normalise(imgs)


# Word embeddings

In [None]:
#Load in pickled embeddings
glove_path = "/content/drive/MyDrive/GroupProjectNLP/"
vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load pickles that were produced by a trusted source.
# The context managers close the file handles that the bare open() calls leaked.
with open(f'{glove_path}/6B.50_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open(f'{glove_path}/6B.50_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)
# word -> embedding vector lookup
glove = {w: vectors[word2idx[w]] for w in words}

In [None]:
# Training-style dataset built from the full caption file; in this cell only its
# vocabulary is used.
dataset_embeddings = FlickrDataset(
    root='/content/Images',
    ann_file='/content/captions.txt',
    img_transform=transform
)

# Rebinds flickr_classes (first assigned in the data-loader cell) to the tokens
# of this larger vocabulary.
flickr_classes = list(dataset_embeddings.vocab.stoi.keys())
# NOTE(review): "<pad>" is already a key of vocab.stoi (Vocabulary.__init__ seeds
# it at index 0), so this appends a second "<pad>" entry at the end - confirm
# that the duplicate is intended.
flickr_classes.append("<pad>")

## Embedding matrix for Yolo classes

In [None]:
# Embedding matrix of Yolo classes: one 50-d GloVe row per entry of yolo_classes.
embedding_dim = 50
matrix_len = len(yolo_classes)
weights_matrix = np.zeros((matrix_len, embedding_dim))
words_found = 0

# Fill the numpy matrix row by row; class names without a GloVe entry fall back
# to the "<unk>" vector and are not counted in words_found.
for row, class_name in enumerate(yolo_classes):
    if class_name in glove:
        weights_matrix[row] = glove[class_name]
        words_found += 1
    else:
        weights_matrix[row] = glove["<unk>"]

# Wrap the matrix in a PyTorch embedding layer and freeze it.
weights_matrix = torch.Tensor(weights_matrix)
non_trainable = True
num_embeddings, embedding_dim = weights_matrix.shape
emb_layer_yolo = torch.nn.Embedding(num_embeddings, embedding_dim)
emb_layer_yolo.load_state_dict({'weight': weights_matrix})
emb_layer_yolo.weight.requires_grad = not non_trainable

emb_layer_yolo

Embedding(81, 50)

## GloVe embedding matrix for Flickr classes

In [None]:
# Embedding matrix of flickr classes: one 50-d GloVe row per entry of flickr_classes.
embedding_dim = 50
matrix_len = len(flickr_classes)
weights_matrix = np.zeros((matrix_len, embedding_dim))
words_found = 0

# Fill the numpy matrix row by row; tokens without a GloVe entry fall back to
# the "<unk>" vector and are not counted in words_found.
for row, token in enumerate(flickr_classes):
    if token in glove:
        weights_matrix[row] = glove[token]
        words_found += 1
    else:
        weights_matrix[row] = glove["<unk>"]

# Wrap the matrix in a PyTorch embedding layer and freeze it.
weights_matrix = torch.Tensor(weights_matrix)
non_trainable = True
num_embeddings, embedding_dim = weights_matrix.shape
emb_layer_flickr = torch.nn.Embedding(num_embeddings, embedding_dim)
emb_layer_flickr.load_state_dict({'weight': weights_matrix})
emb_layer_flickr.weight.requires_grad = not non_trainable

Embedding(8512, 50)

# Function that takes batches and outputs the image features, the object detector vectors, and the target caption indexes

In [None]:
#For training we do data augmentation
def _features_and_objects(data, preprocess):
    """Shared body of load_training / load_validation.

    Args:
        data: ``(imgs, targets, lengths)`` as produced by the collate functions;
            ``lengths`` is not used here.
        preprocess: callable applied to ``imgs`` before feature extraction.

    Returns:
        ``(features, embedded_objects, targets)`` where
        features = classifier feature maps with the last two dims flattened and
                   moved in front of the channel dim (assumes a 4D extractor
                   output, i.e. (batch, positions, channels) - TODO confirm),
        embedded_objects = per-detection class embedding concatenated with its 4
                   box values, padded to the largest detection count in the batch,
        targets = passed through unchanged.
    """
    (imgs, targets, lengths) = data

    with torch.no_grad():

      # ------------- FEATURES ----------------------
      features = classifier_model.extract_features(preprocess(imgs).to(device)).flatten(start_dim=-2).permute(0,2,1)

      # ------------- OBJECTS -----------------------
      # The detector always sees the original (non-preprocessed) images.
      yolo_imgs = transform_yolo(imgs)
      objects, boxes, results = yolov5_detector(yolo_model, yolo_imgs)

      # Pad every image to the largest number of detections in the batch:
      # "<pad>" class names and all-zero boxes.
      max_objects = len(max(objects, key=len))
      for idx, objects_list in enumerate(objects):
        pad_amount = (max_objects-len(objects_list))
        objects_list.extend(["<pad>"]*pad_amount)
        # NOTE(review): boxes are scaled by 255 although the data loaders resize
        # images to 224x224 - confirm which normaliser is intended.
        boxes[idx] = torch.cat((boxes[idx]/255, torch.zeros((pad_amount,4)).to(device)),0)

      indices_objects = torch.tensor([[yolo_classes.index(object_img) for object_img in objects_image] for objects_image in objects])
      embedded_objects = emb_layer_yolo(indices_objects)
      embedded_objects = torch.cat((embedded_objects.to(device), torch.stack(boxes).to(device)),axis=2)

      return(features, embedded_objects, targets)


def load_training(data):
    """Build model inputs for a training batch.

    Image features are extracted from the augmented images
    (preprocessing_with_aug); object detection runs on the un-augmented images.
    Returns ``(features, embedded_objects, targets)``.
    """
    return _features_and_objects(data, preprocessing_with_aug)


def load_validation(data):
    """Build model inputs for a validation/test batch (normalisation only, no augmentation).

    Returns ``(features, embedded_objects, targets)``.
    """
    return _features_and_objects(data, preprocessing_no_aug)

# Positional encoding for the transformer

In [None]:
# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence-first tensor, then applies dropout.

    Args:
        d_model: size of the last (feature) dimension; odd values are supported.
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed; inputs longer than this fail
            to broadcast in ``forward``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns,
        # so trim the frequencies; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over the batch dim.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """``x`` is indexed as (seq_len, batch, d_model); returns a tensor of the same shape."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Other helper functions needed by the transformer (input layer, target mask, etc.)

In [None]:
class InputLayer(nn.Module):
  """
  Projects image features to the word-vector size and appends the word vectors.

  forward() flattens the image features from dim 2 onward, maps that flattened
  dim from img_feature_size to word_size with a linear layer, and concatenates
  the result with batch_words along dim 1.

  For image features of shape B x C x ... (flattened trailing size =
  img_feature_size) and words of shape B x Num_of_words x word_size, the output
  is B x (C + Num_of_words) x word_size.
  """

  def __init__(self, img_feature_size, word_size):
    super().__init__()

    self.fc = nn.Linear(img_feature_size, word_size)

  def forward(self, batch_img_feat, batch_words):
    projected = self.fc(batch_img_feat.flatten(2))
    return torch.cat((projected, batch_words), dim=1)

In [None]:
def get_tgt_mask(size) -> torch.tensor:
    """Return the (size, size) additive causal mask: 0 on and below the diagonal, -inf above it.

    Row i therefore lets position i see positions 0..i only. Example for size=5:
        [[0., -inf, -inf, -inf, -inf],
         [0.,   0., -inf, -inf, -inf],
         [0.,   0.,   0., -inf, -inf],
         [0.,   0.,   0.,   0., -inf],
         [0.,   0.,   0.,   0.,   0.]]
    """
    blocked = torch.full((size, size), float('-inf'))
    # Keep -inf strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)
    
def create_pad_mask(matrix: torch.tensor, pad_token: int) -> torch.tensor:
    """Return a boolean tensor shaped like `matrix` that is True wherever the entry equals `pad_token`.

    Example: matrix = [1,2,3,0,0,0] with pad_token=0 gives
    [False, False, False, True, True, True].
    """
    return matrix == pad_token

# The TRANSFORMER model

In [None]:
class Transformer(nn.Module):
    """Caption generator: image/object source sequence in, per-token vocabulary logits out."""

    def __init__(self, vocab_size, d_model, nhead,
                 dim_feedforward=2048, max_seq_length=96, pos_dropout=0.1,
                 word_size=54, img_features_size=64, trans_dropout=0.1,
                 num_encoder_layers=6, num_decoder_layers=6):
        """
        Initializes the model
                Parameters:
                        vocab_size (int): Number of target tokens (size of the target embedding and of the output layer)
                        d_model (int): Expected number of features in the encoder/decoder inputs, also used in embeddings
                        nhead (int): Number of heads in the transformer
                        dim_feedforward (int): Dimension of the feedforward network in the transformer
                        max_seq_length (int): Number of positions precomputed by the positional encoding
                        pos_dropout (float): Dropout value in the positional encoding
                        word_size (int): Output size of the input layer's projection of the image features
                        img_features_size (int): Input size of the input layer's projection of the image features
                        trans_dropout (float): Dropout value in the transformer
                        num_encoder_layers (int): Number of sub-encoder layers in the transformer
                        num_decoder_layers (int): Number of sub-decoder layers in the transformer
        """
        super().__init__()
        self.d_model = d_model

        # Source side: projected image features concatenated with the object vectors.
        self.input = InputLayer(img_features_size, word_size)

        # Target side: learned token embedding (a pretrained layer such as
        # emb_layer_flickr could be swapped in here) plus positional encoding.
        self.embed_tgt = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=trans_dropout,
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src_full, tgt, tgt_mask):
        """
        src_full: pair (image features, object/word vectors) handed to the input layer
        tgt: batch x seq_len tensor of target token indices (index 0 is treated as padding)
        tgt_mask: seq_len x seq_len attention mask for the decoder
        Returns batch x seq_len x vocab_size logits.
        """
        # Merge image and word data, then go batch-first -> sequence-first.
        src_img, src_words = src_full
        src = self.input(src_img, src_words).permute(1, 0, 2)

        # The padding mask is computed on the batch-first indices, before the permute.
        tgt_key_padding_mask = create_pad_mask(tgt, 0)

        # Embed the targets, scale by sqrt(d_model), and add the positional encoding.
        tgt_embedded = self.embed_tgt(tgt.permute(1, 0)) * math.sqrt(self.d_model)
        tgt_encoded = self.pos_enc(tgt_embedded)

        decoded = self.transformer(
            src.float(),
            tgt_encoded.float(),
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
        )

        # Back to batch-first, then project every position onto the vocabulary.
        return self.fc(decoded.permute(1, 0, 2))

# Detokenizer and BLEU Score

In [None]:
# Sanity check of torchtext's corpus-level BLEU on a toy example.
# candidate_corpus: one token list per candidate sentence.
# references_corpus: for each candidate, a list of reference token lists.
from torchtext.data.metrics import bleu_score
candidate_corpus = [['My' , 'full', 'pytorch' ,'test'], ['Another', 'Sentence']]
references_corpus = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']]]
bleu_score(candidate_corpus, references_corpus)

0.8408964276313782

In [None]:
def detokenize(batch, vocab_dataset=None):
  """Turn a 2D tensor of token indices into lists of words, one list per row.

  Every '<pad>', '<eos>' and '<sos>' token is dropped, wherever it occurs.

  batch: (n_sentences, seq_len) tensor of vocabulary indices.
  vocab_dataset: object whose `.vocab.itos` maps index -> word. Defaults to the
      notebook-level training `dataset`, which is what callers passing only
      `batch` have always used.
  """
  source = dataset if vocab_dataset is None else vocab_dataset
  itos = source.vocab.itos
  special_tokens = {'<pad>', '<eos>', '<sos>'}

  return [
      [word for word in (itos[idx] for idx in row) if word not in special_tokens]
      for row in batch.tolist()
  ]

In [None]:
#When we have multiple target sentences
def detokenize_multiple(batch, dataset):
  """Turn a 3D tensor of token indices into nested lists of words.

  Every '<pad>', '<eos>' and '<sos>' token is dropped, wherever it occurs.

  batch: (batch_size, n_sentences, max_seq_length) tensor of vocabulary indices.
  dataset: object whose `.vocab.itos` maps index -> word.
  Returns: list (per image) of lists (per sentence) of word lists.
  """
  # Unpacking doubles as a check that the batch really is three-dimensional.
  batch_size, n_sentences, max_seq_length = batch.shape
  itos = dataset.vocab.itos
  special_tokens = {'<pad>', '<eos>', '<sos>'}

  return [
      [
          [word for word in (itos[idx] for idx in sentence) if word not in special_tokens]
          for sentence in image_sentences
      ]
      for image_sentences in batch.tolist()
  ]

In [None]:
#Compute the average BLEU score across a batch
def cal_bleu_score(predictions, targets, n=4, dataset=valset):
    """Corpus BLEU (in percent, rounded to 2 decimals) of a batch of predictions.

    predictions: (batch, seq_len) tensor of predicted token indices.
    targets: (batch, n_sentences, seq_len) tensor of reference token indices,
        detokenized with the vocabulary of `dataset`.
    n: highest n-gram order used; every order 1..n gets the same weight.
    dataset: dataset whose vocabulary decodes `targets`.
    """
    targets = detokenize_multiple(targets, dataset)
    predictions = detokenize(predictions)
    # bleu_score requires len(weights) == max_n and its default weights list has
    # four entries, so build uniform weights for the requested order. For n=4
    # this is the library default.
    weights = [1.0 / n] * n
    score = round(bleu_score(predictions, targets, max_n=n, weights=weights) * 100, 2)
    return score

# TRAINING FUNCTION

In [None]:
def train_loop(model, opt, loss_fn, train_loader):
    """Run one teacher-forced training epoch and return the mean loss per batch.

    model: called as model((features, objects), y_input, tgt_mask=...) and
        expected to return (batch, seq_len, vocab) logits.
    opt: optimizer stepping the model parameters.
    loss_fn: called with logits flattened to ((batch*seq_len), vocab) and
        targets flattened to (batch*seq_len).
    train_loader: iterable of batches accepted by load_training.
    """
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader,0):

      # load_training returns (image features, embedded objects, target captions)
      batch_img, batch_words, batch_target_seq = load_training(batch)
      batch_img, batch_words, batch_target_seq = batch_img.clone().detach().to(device), batch_words.clone().detach().to(device), batch_target_seq.clone().detach().to(device)
      X = batch_img, batch_words

      # Shift the target by one: the decoder input starts at <SOS>, the expected
      # output is the same sequence starting one token later.
      y_input = batch_target_seq[:,:-1]
      y_expected = batch_target_seq[:,1:]

      # Causal mask so that position t cannot look at later positions.
      sequence_length = y_input.size(1)
      tgt_mask = get_tgt_mask(sequence_length).to(device)

      # Standard training except we pass in y_input and tgt_mask
      pred = model(X, y_input, tgt_mask=tgt_mask)
      loss = loss_fn(rearrange(pred, 'b t v -> (b t) v'), rearrange(y_expected, 'b o -> (b o)'))

      opt.zero_grad()
      loss.backward()
      # Clip AFTER backward: clipping right after zero_grad() only sees
      # empty/zeroed gradients and leaves the real ones unbounded.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
      opt.step()

      total_loss += loss.detach().item()

    return total_loss/len(train_loader)

# VALIDATION FUNCTION

In [None]:
def validation_loop(model, loader, vocab_dataset, max_length=25, SOS_token=1, EOS_token=2):
    """Greedy-decode a caption for every image in `loader` and return the mean per-batch BLEU.

    model: captioning model, called as model((features, objects), y_input, tgt_mask).
    loader: iterable of batches accepted by load_validation.
    vocab_dataset: dataset whose vocabulary decodes the reference captions.
    max_length: number of tokens generated after the start token.
    SOS_token / EOS_token: indices of the start / end markers.
    """
    model.eval()
    total_bleu = 0.0

    with torch.no_grad():
        for batch in loader:

            # load_validation returns (image features, embedded objects, reference captions)
            batch_img, batch_words, batch_target_seq = load_validation(batch)
            batch_img, batch_words, batch_target_seq = batch_img.clone().detach().to(device), batch_words.clone().detach().to(device), batch_target_seq.clone().detach().to(device)
            X = batch_img, batch_words

            # Every sequence starts with the start-of-sentence token.
            y_input = torch.full((len(batch_img), 1), SOS_token, dtype=torch.long, device=device)

            for _ in range(max_length):
              # Same causal mask as in training; an all-zero mask would let
              # earlier positions attend to later tokens, which the model never
              # saw while being trained.
              sequence_length = y_input.size(1)
              tgt_mask = get_tgt_mask(sequence_length).to(device)

              pred = model(X, y_input, tgt_mask)

              # --- GREEDY --- most likely token at the last position
              next_item = pred.topk(k=1, dim=2).indices[:, -1, :]

              # Concatenate previous input with predicted best word
              y_input = torch.cat((y_input, next_item), dim=1)

            prediction = y_input

            # Keep each sequence up to and including its first EOS and overwrite
            # whatever was generated afterwards with EOS; sequences without an
            # EOS are kept as they are.
            preds = EOS_token * torch.ones_like(prediction)
            for row, sequence in enumerate(prediction):
                eos_positions = torch.where(sequence == EOS_token)[0]
                if eos_positions.numel() > 0:
                    cut = int(eos_positions[0]) + 1
                    preds[row, :cut] = sequence[:cut]
                else:
                    preds[row] = sequence

            #bleu-score
            bleu = cal_bleu_score(preds, batch_target_seq, 4, vocab_dataset)
            total_bleu += bleu

    return total_bleu/len(loader)

In [None]:
def beam_search_decoder(X, k=3, max_length=25, SOS_token=int(1), EOS_token=int(2), model=None):
  """Beam-search decode one caption and return it as a list of `max_length` ints.

  X: model source input for a single image (batch dimension of size 1).
  k: beam width (also the number of expansions tried per beam and step).
  max_length: length of the returned sequence, start token included.
  SOS_token / EOS_token: indices of the start / end markers.
  model: captioning model, called as model(X, y_input, tgt_mask). When omitted,
      the notebook-level variable named `model` is used, as before.

  A beam that has produced EOS is carried forward unchanged (padded with EOS),
  so all returned sequences have the same length. The score of a beam is the
  sum of negative log-probabilities of its tokens; lower is better.
  """
  if model is None:
    try:
      model = globals()["model"]
    except KeyError:
      raise NameError("name 'model' is not defined") from None

  sequences = [[[SOS_token], 0.0]]

  # walk over each step in sequence (building the sentence token by token)
  for _ in range(max_length-1):
    all_candidates = []

    # expand each current candidate (at most k of them)
    for seq, score in sequences:

      if seq[-1] == EOS_token:
        all_candidates.append([seq + [EOS_token], score])
        continue

      y_input = torch.tensor([seq], dtype=torch.long, device=device) # dim = (1, len(seq))
      # Causal mask, matching how the model is trained.
      tgt_mask = get_tgt_mask(y_input.size(1)).to(device)

      pred = model(X, y_input, tgt_mask)

      # Log-probabilities over the WHOLE vocabulary at the last position. A
      # softmax over only the top-k logits would renormalise each beam
      # separately and make scores of different beams incomparable.
      log_probs = torch.log_softmax(pred[0, -1, :].float(), dim=-1)
      top_log_probs, top_tokens = log_probs.topk(k)

      # Plain Python numbers, so the sequences stay lists of ints.
      for token, log_prob in zip(top_tokens.tolist(), top_log_probs.tolist()):
        all_candidates.append([seq + [token], score - log_prob])

    # order all candidates by score and keep the k best
    sequences = sorted(all_candidates, key=lambda candidate: candidate[1])[:k]

  # Lowest accumulated negative log-probability = most likely sequence.
  return sequences[0][0]

In [None]:
def beam_validation_loop(model, loader, vocab_dataset, k=3, max_length=25, SOS_token=1, EOS_token=2):
    """Return the BLEU score of beam-search decoding, averaged over the batches of `loader`.

    Every sample of every batch is decoded on its own with
    `beam_search_decoder`; the decoded batch is then scored against the batch
    targets with `cal_bleu_score`.

    Args:
        model: Network to evaluate; switched to eval mode here. Note that
            `beam_search_decoder` is called without a model argument, so it
            runs the notebook-level `model`.
        loader: Iterable of batches accepted by `load_validation`.
        vocab_dataset: Forwarded as the last argument of `cal_bleu_score`.
        k: Beam width.
        max_length: Length of each decoded sequence (columns of the prediction buffer).
        SOS_token: Start-of-sequence token id.
        EOS_token: End-of-sequence token id.

    Returns:
        Sum of the per-batch BLEU scores divided by `len(loader)`.
    """
    model.eval()
    total_bleu = 0.0

    with torch.no_grad():
        for batch in loader:
            batch_img, batch_words, batch_target_seq = load_validation(batch)
            batch_img = batch_img.clone().detach().to(device)
            batch_words = batch_words.clone().detach().to(device)
            batch_target_seq = batch_target_seq.clone().detach().to(device)

            # One row of `max_length` token ids per sample in the batch.
            batch_size = len(batch_target_seq)
            prediction = torch.zeros(batch_size, max_length, dtype=int).to(device)

            for b in range(batch_size):
                # [None, :] restores the leading batch dimension of size 1.
                X = batch_img[b][None, :].clone().detach().to(device), batch_words[b][None, :].clone().detach().to(device)
                # Forward the decoding settings: the decoder must produce exactly
                # `max_length` tokens or the row assignment below fails.
                decoded = beam_search_decoder(
                    X, k, max_length=max_length, SOS_token=SOS_token, EOS_token=EOS_token
                )
                prediction[b] = torch.tensor(decoded)

            total_bleu += cal_bleu_score(prediction, batch_target_seq, 4, vocab_dataset)

    return total_bleu / len(loader)

# CHECK WHICH GPU WE'RE USING

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr  6 22:34:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    36W / 250W |   1151MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# CHECK AMOUNT OF RAM WE HAVE

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


# Saving and loading checkpoints

In [None]:
def save_ckp(state, checkpoint_dir):
    """Write `state` to ``checkpoint_dir + 'checkpoint13.pt'`` with ``torch.save``.

    `checkpoint_dir` is concatenated as a plain string prefix, so it has to end
    with the path separator (or with whatever filename prefix the caller wants).
    Returns None.
    """
    torch.save(state, checkpoint_dir + 'checkpoint13.pt')

def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore `model` and `optimizer` from ``checkpoint_fpath + 'checkpoint13.pt'``.

    Args:
        checkpoint_fpath: String prefix of the checkpoint file (same convention
            as `save_ckp`: the file name is appended by plain concatenation).
        model: Module whose ``load_state_dict`` receives ``checkpoint['state_dict']``.
        optimizer: Optimizer whose ``load_state_dict`` receives ``checkpoint['optimizer']``.

    Returns:
        ``(model, optimizer, epoch)`` where `epoch` is the value stored under
        the ``'epoch'`` key of the checkpoint.
    """
    # Deserialize onto the CPU so a checkpoint written on a GPU runtime can also
    # be restored where CUDA is unavailable; the load_state_dict calls below
    # then hand the values over to the existing model and optimizer.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_fpath + 'checkpoint13.pt', map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch']

# EXECUTION FUNCTION

In [None]:
def fit(model, opt, loss_fn, train_loader, val_loader, epochs, saving_path, resume_training=False):
    """Train `model` for `epochs` epochs, validating with beam search after each one.

    Adapted from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1

    After every epoch the function
      * saves the current weights via ``save_ckp(..., saving_path + 'notvalidated_')``,
      * computes the validation BLEU with `beam_validation_loop`,
      * rewrites ``saving_path + 'results13.csv'`` with the history so far, and
      * saves to `saving_path` whenever the validation BLEU improves on the best
        value seen during this call.

    Args:
        model: Network to train.
        opt: Optimizer, passed to `train_loop` and stored in the checkpoints.
        loss_fn: Loss function passed to `train_loop`.
        train_loader: Training batches for `train_loop`.
        val_loader: Validation batches for `beam_validation_loop`.
        epochs: Number of epochs to run in this call.
        saving_path: String prefix for checkpoint and result files.
        resume_training: If True, restore model and optimizer with `load_ckp`
            and continue the epoch numbering after the restored epoch.

    Returns:
        ``(train_loss_list, validation_bleu_list)`` with one entry per epoch run
        in this call.

    Note:
        `valset` is read from notebook scope. The best BLEU is not stored in
        the checkpoint, so it starts from 0 again when resuming.
    """
    start_epoch = 0
    if resume_training:
        model, opt, last_epoch = load_ckp(saving_path, model, opt)
        # Checkpoints hold the zero-based index of the last completed epoch.
        start_epoch = last_epoch + 1

    # Used for plotting later on
    train_loss_list, validation_bleu_list = [], []

    print("Training and validating model")
    curr_best_bleu = 0.

    for epoch in range(start_epoch, start_epoch + epochs):

        train_loss = train_loop(model, opt, loss_fn, train_loader)
        train_loss_list.append(train_loss)

        # Save before validating so the epoch's weights survive a crash or
        # disconnect during the validation pass.
        print("Saving model")
        checkpoint = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': opt.state_dict()
        }
        save_ckp(checkpoint, saving_path + 'notvalidated_')

        validation_bleu = beam_validation_loop(model, val_loader, valset, max_length=25, SOS_token=1, EOS_token=2)
        validation_bleu_list.append(validation_bleu)

        # Persist the history next to the checkpoints instead of a fixed path.
        df = pd.DataFrame(data={"val_bleu": validation_bleu_list, "train_loss": train_loss_list})
        df.to_csv(saving_path + 'results13.csv', sep=',', index=False)

        if validation_bleu > curr_best_bleu:
            print("Improved on previous val bleu, saving new model.")
            # Weights are unchanged since the snapshot above, so reuse it.
            save_ckp(checkpoint, saving_path)
            curr_best_bleu = validation_bleu

        print("-"*25, f"Epoch {epoch + 1}","-"*25)
        print(f'{epoch+1}: TRAIN LOSS = {train_loss} | VAL BLEU = {validation_bleu}')
        print()

    return train_loss_list, validation_bleu_list

# SETTING THE MODEL

In [None]:
# Checkpoint location and whether training restarts from the stored checkpoint.
saving_path = '/content/drive/MyDrive/GroupProjectNLP/trained_models/'
resume_training = False

# Model dimensions; the vocabulary size comes from the dataset's token index.
vocab_size = len(dataset.vocab.stoi)
d_model = 54
img_feature_size = 1792
output_size = 54
dim_feedforward = 512

model = Transformer(
    d_model=d_model,
    nhead=2,
    num_encoder_layers=3,
    num_decoder_layers=3,
    pos_dropout=0.1,
    trans_dropout=0.1,
    dim_feedforward=dim_feedforward,
    img_features_size=img_feature_size,
    word_size=output_size,
    vocab_size=vocab_size,
)
model = model.to(device)

opt = torch.optim.Adam(model.parameters(), lr=0.001)

# Positions holding the "<pad>" token do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<pad>"])

In [None]:
# Launch training; the per-epoch histories are kept for plotting.
train_loss_list, validation_bleu_list = fit(
    model,
    opt,
    loss_fn,
    train_loader,
    val_loader,
    epochs=500,
    saving_path=saving_path,
    resume_training=resume_training,
)