# Download dataset *

In [None]:
!wget https://bashupload.com/-nc5I/train2014.zip
!unzip train2014.zip
!rm train2014.zip
!wget https://bashupload.com/baIo3/val2014.zip
!unzip val2014.zip
!rm val2014.zip
!wget http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
!unzip caption_datasets.zip
!rm caption_datasets.zip
!rm dataset_flickr30k.json
!rm dataset_flickr8k.json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/My\ Drive/bootcamp/ML\ fundamentals/image\ captioning/train2014.zip ./
!unzip train2014.zip
!rm train2014.zip
!cp /content/drive/My\ Drive/bootcamp/ML\ fundamentals/image\ captioning/val2014.zip ./
!unzip val2014.zip
!rm val2014.zip
!wget http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
!unzip caption_datasets.zip
!rm caption_datasets.zip
!rm dataset_flickr30k.json
!rm dataset_flickr8k.json

# Data preprocessing *

In [None]:
import os
import numpy as np
import h5py
import json
import torch
from PIL import Image
from tqdm import tqdm
from collections import Counter
from random import seed, choice, sample
import cv2

In [None]:
# Datasets:
#   train2014 is a folder of image files for training
#   val2014 is a folder of image files for validation
#   dataset_coco.json is a JSON file that tells {image -> captions}

## Data loading *


In [None]:
# Load JSON file into dict
# (dataset_coco.json maps every image to its human-written captions)
json_path = 'dataset_coco.json'
with open(json_path, 'r') as j:
    data = json.load(j)
# Print the first image record to illustrate the fields described below
print(data['images'][0])

# 'filename' is the image name
# 'filepath' is the folder name
# 'imgid' is the id of the image
# 'sentences' is a list of the human captioning
# 'tokens' is a list of words

{'filepath': 'val2014', 'sentids': [770337, 771687, 772707, 776154, 781998], 'filename': 'COCO_val2014_000000391895.jpg', 'imgid': 0, 'split': 'test', 'sentences': [{'tokens': ['a', 'man', 'with', 'a', 'red', 'helmet', 'on', 'a', 'small', 'moped', 'on', 'a', 'dirt', 'road'], 'raw': 'A man with a red helmet on a small moped on a dirt road. ', 'imgid': 0, 'sentid': 770337}, {'tokens': ['man', 'riding', 'a', 'motor', 'bike', 'on', 'a', 'dirt', 'road', 'on', 'the', 'countryside'], 'raw': 'Man riding a motor bike on a dirt road on the countryside.', 'imgid': 0, 'sentid': 771687}, {'tokens': ['a', 'man', 'riding', 'on', 'the', 'back', 'of', 'a', 'motorcycle'], 'raw': 'A man riding on the back of a motorcycle.', 'imgid': 0, 'sentid': 772707}, {'tokens': ['a', 'dirt', 'path', 'with', 'a', 'young', 'person', 'on', 'a', 'motor', 'bike', 'rests', 'to', 'the', 'foreground', 'of', 'a', 'verdant', 'area', 'with', 'a', 'bridge', 'and', 'a', 'background', 'of', 'cloud', 'wreathed', 'mountains'], 'raw'

In [None]:
# Preview only the first few records: displaying the whole list would dump
# every image entry of the annotation file into the notebook output.
data["images"][:3]

In [None]:
# Each image may have multiple captions
# to reduce the bias we are introducing,
# use the same number of captions per image
captions_per_image = 5

# Maximum number of words in a sentence
# If the sentence has more than max_len words, skip it
# If the sentence has less than max_len words, pad it with <pad>
max_len = 50

# From json object to a list of (image_path, captions) pairs
# note: captions should be a list of word lists
train_img_cap_pairs = []
val_img_cap_pairs = []
test_img_cap_pairs = []

# It contains all distinct words
word_set = set()

for img_obj in data['images']:
    captions = []
    for caption in img_obj['sentences']:
        # The vocabulary is collected from every caption, including the ones
        # that are skipped below for being longer than max_len.
        word_set.update(caption['tokens'])
        if len(caption['tokens']) <= max_len:
            captions.append(caption['tokens'])

    # Skip images that are left without any usable caption
    if len(captions) == 0:
        continue

    img_path = os.path.join(img_obj['filepath'], img_obj['filename'])

    # Skip records whose image file is not present on disk
    if not os.path.exists(img_path):
        continue

    # Append the pair to the list that matches the record's split.
    # Records with any other split value are ignored.
    if img_obj['split'] == 'train':
        train_img_cap_pairs.append([img_path, captions])
    elif img_obj['split'] == 'val':
        val_img_cap_pairs.append([img_path, captions])
    elif img_obj['split'] == 'test':
        # Previously the test list was declared but never filled.
        test_img_cap_pairs.append([img_path, captions])

## Data transformation *

In [None]:
# HDF5: HDF5 is a unique technology suite that makes possible the management
# of extremely large and complex data collections.

# 1. Will create 2 hdf5 files: 
#      train_images.hdf5, val_images.hdf5
# 2. Will create 5 json files: 
#      word_map.json -- contains a (word -> number) hash object
#      train_captions.json -- contains a list of encoded training captions
#      val_captions.json -- contains a list of encoded validation captions
#      train_caption_length.json -- contains a list of training caption lengths
#      val_caption_length.json -- contains a list of validation caption lengths

In [None]:
# Word Encoding
# word_map: word    -> number (starting from 1)
#           <pad>   -> 0
#           <start> -> the second highest number
#           <end>   -> the highest number
# The words are sorted before they are numbered: a set of strings has no
# stable iteration order across interpreter runs, so sorting keeps the
# word -> number assignment reproducible when this cell is re-run.
word_map = {k: idx + 1 for idx, k in enumerate(sorted(word_set))}
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

# Save word map to a JSON
with open('word_map.json', 'w') as j:
    json.dump(word_map, j)

In [None]:

# For every split, write
#   <split>_images.hdf5          -- the resized images (channel first, uint8)
#   <split>_captions.json        -- the encoded, padded captions
#   <split>_caption_length.json  -- the caption lengths (including <start>/<end>)
# The 'val' files are built from val_img_cap_pairs (they used to be built from
# the training pairs, which made validation run on training data).
for img_cap_pairs, split in [[train_img_cap_pairs, 'train'], [val_img_cap_pairs, 'val']]:
    h5py_path = os.path.join(split + '_images.hdf5')

    # remove it if the path exists
    if os.path.exists(h5py_path):
        os.remove(h5py_path)

    enc_captions = []
    caplens = []

    with h5py.File(h5py_path, 'a') as h:
        # Make a note of the number of captions we are sampling per image
        h.attrs['captions_per_image'] = captions_per_image

        # Create dataset inside HDF5 file to store images
        # do channel first for the image
        images = h.create_dataset('images', (len(img_cap_pairs), 3, 256, 256), dtype='uint8')

        for index, img_cap_pair in enumerate(img_cap_pairs):
            img_path, captions = img_cap_pair

            if len(captions) < captions_per_image:
                # add some captions by randomly sampling from captions
                captions = captions + [choice(captions) for _ in range(captions_per_image - len(captions))]
            else:
                # randomly sample k from captions
                captions = sample(captions, captions_per_image)

            # Sanity check
            assert len(captions) == captions_per_image

            # Read image and transform it into (3, 256, 256)
            # use cv2, need to read, resize and transpose.
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded, so fail here with a message that names the file.
            img = cv2.imread(img_path)
            if img is None:
                raise IOError('Could not read image: ' + img_path)
            # NOTE: cv2 loads the channels in BGR order and they are stored
            # as-is; the demo cell converts BGR -> RGB before plotting.
            img = cv2.resize(img, (256, 256))
            img = img.transpose(2, 0, 1)

            assert img.shape == (3, 256, 256)

            # Save image to HDF5 file
            images[index] = img
            for idx, caption in enumerate(captions):
                # Encode captions
                #   a list of numbers
                #   Format is <start> word1 word2 ... wordN <end> <pad> <pad>...
                #   The total length is max_len + 2 (max_len words plus <start> and <end>)
                enc_c = [word_map['<start>']] + [word_map[word] for word in caption] + \
                    [word_map['<end>']] + [word_map['<pad>']] * (max_len - len(caption))

                enc_captions.append(enc_c)
                # Length counts the words plus <start> and <end>
                caplens.append(len(caption) + 2)

    with open(os.path.join(split + '_captions.json'), 'w') as j:
        json.dump(enc_captions, j)

    with open(os.path.join(split + '_caption_length.json'), 'w') as j:
        json.dump(caplens, j)

# Sanity check: show the last encoded caption together with ITS length
# (the first length used to be printed next to the last caption).
if caplens:
    print('caption length:', caplens[-1])
    print('caption:', caption)
    print('caption encoding:', enc_c)

caption length: 13
caption: ['a', 'plate', 'of', 'crinkle', 'fries', 'and', 'panini', 'sandwiches']
caption encoding: [27930, 6413, 26063, 8787, 2378, 26541, 27042, 3220, 6744, 27931, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Helper functions *

In [None]:
def save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer,
                    bleu4, is_best):
    """
    Saves the training state to 'checkpoint_<data_name>.pth.tar'.

    :param data_name: name used to build the checkpoint file name
    :param epoch: epoch number stored in the checkpoint
    :param epochs_since_improvement: value stored under 'epochs_since_improvement'
    :param encoder: encoder model
    :param decoder: decoder model
    :param encoder_optimizer: optimizer of the encoder (stored as given)
    :param decoder_optimizer: optimizer of the decoder
    :param bleu4: score stored under 'bleu-4'
    :param is_best: when truthy, a second copy prefixed with 'BEST_' is written
    """
    keys = ('epoch', 'epochs_since_improvement', 'bleu-4', 'encoder', 'decoder',
            'encoder_optimizer', 'decoder_optimizer')
    values = (epoch, epochs_since_improvement, bleu4, encoder, decoder,
              encoder_optimizer, decoder_optimizer)
    state = dict(zip(keys, values))

    filename = ''.join(['checkpoint_', data_name, '.pth.tar'])

    # The regular checkpoint is always written; the 'BEST_' copy keeps the best
    # state from being overwritten by a later, worse checkpoint.
    targets = [filename]
    if is_best:
        targets.append('BEST_' + filename)
    for target in targets:
        torch.save(state, target)


class AverageMeter(object):
    """
    Keeps the most recent value, the weighted sum, the count and the running
    average of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Sets all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """
        Records a new observation.

        :param val: latest value of the metric
        :param n: weight of the value (number of items it was computed over)
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def accuracy(scores, targets, k):
    """
    Computes the top-k accuracy in percent.

    :param scores: score tensor with one row per sample and one column per class
    :param targets: tensor with the true class index of every sample
    :param k: number of highest-scoring classes that count as a hit
    :return: percentage of samples whose target is among their k best scores
    """
    n_samples = targets.size(0)
    topk_indices = scores.topk(k, 1, True, True)[1]
    # Compare every one of the k candidates of a row with that row's target
    target_grid = targets.view(-1, 1).expand_as(topk_indices)
    n_hits = topk_indices.eq(target_grid).view(-1).float().sum()  # 0D tensor
    return n_hits.item() * (100.0 / n_samples)

# Model Architecture *

## Encoder *

In [None]:
import torch
from torch import nn
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Encoder(nn.Module):
    """
    Image encoder: a pretrained ResNet-101 without its last two layers,
    followed by adaptive average pooling to a fixed spatial size.
    """

    def __init__(self, encoded_image_size=14):
        """
        :param encoded_image_size: height and width of the pooled feature map
        """
        super().__init__()
        self.enc_image_size = encoded_image_size

        backbone = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101

        # Keep everything except the last two children (pool and linear layers)
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

        # Pool to a fixed size so that input images may have variable size
        pooled_size = (encoded_image_size, encoded_image_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d(pooled_size)

        # The encoder is not trained: freeze every ResNet parameter
        for param in self.resnet.parameters():
            param.requires_grad = False

    def forward(self, images):
        """
        :param images: batch of images, channel first
        :return: encoded images with the channel dimension moved last,
                 (batch_size, encoded_image_size, encoded_image_size, 2048)
        """
        features = self.resnet(images)  # (batch_size, 2048, image_size/32, image_size/32)
        pooled = self.adaptive_pool(features)  # (batch_size, 2048, encoded_image_size, encoded_image_size)
        return pooled.permute(0, 2, 3, 1)



## Attention *

In [None]:
class Attention(nn.Module):
    """
    Additive attention over the pixels of an encoded image.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        """
        :param encoder_dim: feature size of encoded images
        :param decoder_dim: size of decoder's RNN
        :param attention_dim: size of the attention network
        """
        super().__init__()
        # Projections of the image features and of the decoder state into the attention space
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)
        # Maps every attention vector to a single score per pixel
        self.full_att = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        # Normalizes the scores over the pixel dimension
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        """
        Forward propagation.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
        :return: attention weighted encoding, weights
        """
        image_proj = self.encoder_att(encoder_out)  # (batch_size, num_pixels, attention_dim)
        hidden_proj = self.decoder_att(decoder_hidden)  # (batch_size, attention_dim)

        # Broadcast the decoder projection over all pixels before scoring
        combined = self.relu(image_proj + hidden_proj.unsqueeze(1))
        pixel_scores = self.full_att(combined).squeeze(2)  # (batch_size, num_pixels)

        alpha = self.softmax(pixel_scores)  # (batch_size, num_pixels)
        weighted = encoder_out * alpha.unsqueeze(2)
        attention_weighted_encoding = weighted.sum(dim=1)  # (batch_size, encoder_dim)

        return attention_weighted_encoding, alpha


## Decoder *

In [None]:

class DecoderWithAttention(nn.Module):
    """
    LSTM decoder that attends over the encoded image at every time-step.
    """

    def __init__(self, attention_dim, embed_dim, decoder_dim, vocab_size, encoder_dim=2048, dropout=0.5):
        """
        :param attention_dim: size of attention network
        :param embed_dim: embedding size
        :param decoder_dim: size of decoder's RNN
        :param vocab_size: size of vocabulary
        :param encoder_dim: feature size of encoded images
        :param dropout: dropout probability applied before the output layer
        """
        super(DecoderWithAttention, self).__init__()

        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size

        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)  # attention network

        self.embedding = nn.Embedding(vocab_size, embed_dim)  # embedding layer
        # self.dropout is the dropout MODULE (it is no longer assigned the float first)
        self.dropout = nn.Dropout(p=dropout)
        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)  # decoding LSTMCell
        self.init_h = nn.Linear(encoder_dim, decoder_dim)  # linear layer to find initial hidden state of LSTMCell
        self.init_c = nn.Linear(encoder_dim, decoder_dim)  # linear layer to find initial cell state of LSTMCell
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)  # linear layer to create a sigmoid-activated gate
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(decoder_dim, vocab_size)  # linear layer to find scores over vocabulary
        self.init_weights()  # initialize some layers with the uniform distribution

    def init_weights(self):
        """
        Initializes the embedding and the output layer with values from the
        uniform distribution (the output bias is set to zero).
        """
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def init_hidden_state(self, encoder_out):
        """
        Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        :return: hidden state, cell state
        """
        # Average over the pixels, then project to the decoder size
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)  # (batch_size, decoder_dim)
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Forward propagation.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim)
        :param encoded_captions: encoded captions, a tensor of dimension (batch_size, max_caption_length)
        :param caption_lengths: caption lengths, a tensor of dimension (batch_size, 1)
        :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices
        """

        batch_size = encoder_out.size(0)
        encoder_dim = encoder_out.size(-1)
        vocab_size = self.vocab_size

        # Flatten image
        encoder_out = encoder_out.view(batch_size, -1, encoder_dim)  # (batch_size, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # Sort input data by decreasing lengths, so that at time-step t the
        # still-active captions are exactly the first batch_size_t rows
        caption_lengths, sort_ind = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        encoder_out = encoder_out[sort_ind]
        encoded_captions = encoded_captions[sort_ind]

        # Embedding
        embeddings = self.embedding(encoded_captions)  # (batch_size, max_caption_length, embed_dim)

        # Initialize LSTM state
        h, c = self.init_hidden_state(encoder_out)  # (batch_size, decoder_dim)

        # We won't decode at the <end> position, since generation is finished as soon as <end> is produced
        # So, decoding lengths are actual lengths - 1
        decode_lengths = (caption_lengths - 1).tolist()
        max_decode_length = max(decode_lengths)

        # Create tensors to hold word prediction scores and alphas.
        # They are allocated on the device of the inputs instead of relying on
        # a notebook-level `device` variable.
        out_device = encoder_out.device
        predictions = torch.zeros(batch_size, max_decode_length, vocab_size, device=out_device)
        alphas = torch.zeros(batch_size, max_decode_length, num_pixels, device=out_device)

        # At each time-step, decode by
        # attention-weighing the encoder's output based on the decoder's previous hidden state output
        # then generate a new word in the decoder with the previous word and the attention weighted encoding
        for t in range(max_decode_length):
            # Number of captions that still have a word to decode at step t
            batch_size_t = sum(l > t for l in decode_lengths)
            attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                                h[:batch_size_t])
            gate = self.sigmoid(self.f_beta(h[:batch_size_t]))  # gating scalar, (batch_size_t, encoder_dim)
            attention_weighted_encoding = gate * attention_weighted_encoding
            h, c = self.decode_step(
                torch.cat([embeddings[:batch_size_t, t, :], attention_weighted_encoding], dim=1),
                (h[:batch_size_t], c[:batch_size_t]))  # (batch_size_t, decoder_dim)
            preds = self.fc(self.dropout(h))  # (batch_size_t, vocab_size)
            predictions[:batch_size_t, t, :] = preds
            alphas[:batch_size_t, t, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_ind

# PyTorch dataset transformation *


In [None]:
import torch
from torch.utils.data import Dataset
import h5py
import json
import os


class CaptionDataset(Dataset):
    """
    Dataset over the files written by the preprocessing step:
    '<split>_images.hdf5', '<split>_captions.json' and '<split>_caption_length.json'.
    Every item is one (image, caption) pair; an image is repeated once per caption.
    """

    # Per-channel mean / std applied to the [0, 1]-scaled images, shaped
    # (3, 1, 1) so that they broadcast over a (3, H, W) image. Applying them
    # with plain tensor arithmetic keeps this class independent of
    # torchvision.transforms, which the notebook only imports in a later cell.
    _MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    _STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __init__(self, split):
        """
        :param split: prefix of the data files to read, e.g. 'train' or 'val'
        """
        self.split = split

        # Open hdf5 file where images are stored
        self.h = h5py.File(os.path.join(self.split + '_images.hdf5'), 'r')
        self.imgs = self.h['images']

        # Captions per image
        self.cpi = self.h.attrs['captions_per_image']

        # Load encoded captions
        with open(os.path.join(self.split + '_captions.json'), 'r') as j:
            self.captions = json.load(j)

        # Load caption lengths
        with open(os.path.join(self.split + '_caption_length.json'), 'r') as j:
            self.caplens = json.load(j)

        # Total number of datapoints
        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        """
        :param i: index of the caption
        :return: (img, caption, caplen) for the 'train' split, otherwise
                 (img, caption, caplen, all_captions) where all_captions holds
                 every caption of the same image
        """
        # The Nth caption corresponds to the (N // captions_per_image)th image
        img_index = i // self.cpi
        img = torch.FloatTensor(self.imgs[img_index] / 255.)
        img = (img - self._MEAN) / self._STD

        caption = torch.LongTensor(self.captions[i])
        caplen = torch.LongTensor([self.caplens[i]])

        # Compare by value: `is` tests object identity, which is not
        # guaranteed to hold for two equal strings.
        if self.split == 'train':
            return img, caption, caplen

        # For validation or testing, also return all 'captions_per_image' captions to find BLEU-4 score
        first = img_index * self.cpi
        all_captions = torch.LongTensor(self.captions[first:first + self.cpi])
        return img, caption, caplen, all_captions

    def __len__(self):
        return self.dataset_size

# Model training *

## Initialize parameters *

In [None]:
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu

# Model hyper-parameters
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5  # dropout value passed to the decoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
cudnn.benchmark = True  # NOTE(review): presumably most useful when input sizes stay fixed -- confirm

# Training parameters
epochs = 30  # number of epochs run by the training loop
batch_size = 32  # batch size of the train and val data loaders
decoder_lr = 4e-4  # learning rate of the decoder's optimizer
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention'
best_bleu4 = 0.  # best BLEU-4 score so far; updated by the training loop

## Training per epoch *

In [None]:
def train(train_loader, encoder, decoder, loss_function, decoder_optimizer, epoch):
    """
    Performs one epoch's training.
    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param loss_function: loss layer
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """

    decoder.train()  # train mode (dropout is used)
    # Only the decoder is optimized here. Keeping the encoder in eval mode
    # stops its normalization layers from switching to batch statistics and
    # from updating their running statistics while its weights stay fixed,
    # and matches the mode used in validate().
    encoder.eval()

    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    # Load by batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        # Remember to use GPU
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Encoding
        encoded_imgs = encoder(imgs)
        # Decoding
        preds, caps_sorted, decode_lengths, alphas, sort_ind = decoder(encoded_imgs, caps, caplens)

        # Since decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads:
        # the `.data` of a packed sequence holds only the valid time-steps
        preds = pack_padded_sequence(preds, decode_lengths, batch_first=True).data
        targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

        # Calculate loss
        loss = loss_function(preds, targets)

        # Add doubly stochastic attention regularization
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

        # Back prop on decoder only (clear gradients of the previous batch first)
        decoder_optimizer.zero_grad()
        loss.backward()

        # Update weights
        decoder_optimizer.step()

        # Keep track of metrics
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(accuracy(preds, targets, 5), sum(decode_lengths))

        # Print status
        if i % 100 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          loss=losses,
                                                                          top5=top5accs))


## Validation per epoch *

In [None]:
def validate(val_loader, encoder, decoder, loss_function):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param loss_function: loss layer
    :return: BLEU-4 score
    """
    # eval mode (no dropout or batchnorm)
    decoder.eval()
    encoder.eval()

    losses = AverageMeter()
    top5accs = AverageMeter()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Tokens stripped from the references (<end> is kept, as before)
    skipped_tokens = {word_map['<start>'], word_map['<pad>']}

    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            encoded_imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(encoded_imgs, caps, caplens)
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()

            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            # Calculate loss
            loss = loss_function(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))

            if i % 100 == 0:
                print('Validation: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                loss=losses, top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            # allcaps was never moved to the model's device while sort_ind comes
            # from the decoder, so bring the index to allcaps' device before
            # indexing (mixed-device indexing is rejected by recent PyTorch).
            allcaps = allcaps[sort_ind.to(allcaps.device)]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = [[w for w in c if w not in skipped_tokens]
                                for c in img_caps]  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            # Cut every prediction at its decode length to remove pads
            hypotheses.extend(p[:decode_lengths[j]] for j, p in enumerate(preds))

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
                loss=losses,
                top5=top5accs,
                bleu=bleu4))

    return bleu4


## Start training *

In [None]:
# Read word map
word_map_file = os.path.join('word_map.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

# Initialize encoder, and we don't train it
encoder = Encoder()
encoder_optimizer = None

# Initialize decoder, and adam optimizer
decoder = DecoderWithAttention(attention_dim=attention_dim,
                               embed_dim=emb_dim,
                               decoder_dim=decoder_dim,
                               vocab_size=len(word_map),
                               dropout=dropout)
decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                     lr=decoder_lr)


# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

# Loss function
loss_function = nn.CrossEntropyLoss().to(device)

# Custom dataloaders
train_loader = torch.utils.data.DataLoader(
    CaptionDataset('train'),
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True
)
val_loader = torch.utils.data.DataLoader(
    CaptionDataset('val'),
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True
)

# Number of consecutive epochs without a new best BLEU-4 score. It is stored in
# every checkpoint (a constant 0 used to be stored instead).
epochs_since_improvement = 0

# Epochs
for epoch in range(epochs):
    # One epoch's training
    train(train_loader=train_loader,
          encoder=encoder,
          decoder=decoder,
          loss_function=loss_function,
          decoder_optimizer=decoder_optimizer,
          epoch=epoch)

    # One epoch's validation
    recent_bleu4 = validate(val_loader=val_loader,
                            encoder=encoder,
                            decoder=decoder,
                            loss_function=loss_function)

    # Save checkpoint
    is_best = recent_bleu4 > best_bleu4
    best_bleu4 = max(recent_bleu4, best_bleu4)
    epochs_since_improvement = 0 if is_best else epochs_since_improvement + 1

    save_checkpoint('coco', epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
                    decoder_optimizer, recent_bleu4, is_best)


Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


HBox(children=(FloatProgress(value=0.0, max=178728960.0), HTML(value='')))


Epoch: [0][0/126]	Loss 11.2181 (11.2181)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/126]	Loss 5.9757 (7.0003)	Top-5 Accuracy 37.883 (33.418)
Validation: [0/126]	Loss 5.5259 (5.5259)	Top-5 Accuracy 42.234 (42.234)	
Validation: [100/126]	Loss 5.6910 (5.6864)	Top-5 Accuracy 43.646 (41.719)	

 * LOSS - 5.696, TOP-5 ACCURACY - 41.665, BLEU-4 - 0.019798742846755937

Epoch: [1][0/126]	Loss 5.9955 (5.9955)	Top-5 Accuracy 39.437 (39.437)
Epoch: [1][100/126]	Loss 5.1200 (5.6183)	Top-5 Accuracy 50.125 (44.326)
Validation: [0/126]	Loss 4.6494 (4.6494)	Top-5 Accuracy 57.265 (57.265)	
Validation: [100/126]	Loss 4.9912 (5.0468)	Top-5 Accuracy 50.000 (51.468)	

 * LOSS - 5.051, TOP-5 ACCURACY - 51.452, BLEU-4 - 0.0685348286502511

Epoch: [2][0/126]	Loss 5.1345 (5.1345)	Top-5 Accuracy 50.000 (50.000)
Epoch: [2][100/126]	Loss 4.7788 (5.1082)	Top-5 Accuracy 56.647 (50.961)
Validation: [0/126]	Loss 4.7505 (4.7505)	Top-5 Accuracy 57.880 (57.880)	
Validation: [100/126]	Loss 4.7280 (4.6619)	Top-5 Accuracy 

In [None]:
!cp BEST_checkpoint_coco.pth.tar /tmp/BEST_checkpoint_coco.pth.tar

In [None]:
!curl https://bashupload.com/BEST_checkpoint_coco.pth.tar --data-binary @/tmp/BEST_checkpoint_coco.pth.tar

tcmalloc: large alloc 1073750016 bytes == 0x561298612000 @  0x7f94363362a4 0x561255d08c81 0x561255d017bc 0x561255d03098 0x561255d083d8 0x561255cfb5d0 0x7f9435877b97 0x561255cfb71a

Uploaded 1 file, 631369728 bytes

wget https://bashupload.com/gZLIi/IG9DU.tar




# Inference


In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!cp /content/drive/My\ Drive/bootcamp/ML\ fundamentals/image\ captioning/BEST_checkpoint_coco.pth.tar ./
!cp /content/drive/My\ Drive/bootcamp/ML\ fundamentals/image\ captioning/word_map.json ./

In [None]:
# Read word map
# (the same word -> number mapping that was saved during preprocessing;
#  `os` and `json` must already be imported when this cell runs)
word_map_file = os.path.join('word_map.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

In [None]:
import os
import numpy as np
import h5py
import json
import torch
from PIL import Image
from tqdm import tqdm
from collections import Counter
from random import seed, choice, sample
import cv2

In [None]:
import torch.nn.functional as F
vocab_size = len(word_map)
def inference(img, allcaps):
    """
    Captions a single image with beam search (beam size 3) using the models
    stored in 'BEST_checkpoint_coco.pth.tar', and prints the result.

    :param img: image tensor that can be reshaped to (1, 3, 256, 256)
    :param allcaps: encoded reference captions of a batch; only allcaps[0]
                    (the captions of the first image) is used, for display
    :return: the predicted caption as a list of words, without special tokens
    """
    special_tokens = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
    # Built locally so that the function does not depend on a variable that is
    # only defined in a later cell.
    idx_to_word = {idx: word for word, idx in word_map.items()}
    vocab_size = len(word_map)

    # NOTE(security): the checkpoint holds whole pickled model objects; only
    # load checkpoint files from a trusted source.
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    checkpoint = torch.load('BEST_checkpoint_coco.pth.tar', map_location=device)
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()

    k = 3  # beam size

    # No gradients are needed for inference
    with torch.no_grad():
        encoder_out = encoder(img.reshape([1, 3, 256, 256]).to(device))  # (1, enc_image_size, enc_image_size, encoder_dim)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add the score of every sequence so far to the scores of its possible next words
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds].tolist())
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

    if complete_seqs_scores:
        best = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[best]
    else:
        # No beam produced <end> before the step limit: fall back to the
        # best-scoring unfinished sequence instead of failing on an empty list.
        seq = seqs[top_k_scores.view(-1).argmax().item()].tolist()

    # Hypothesis: the predicted words without <start>, <end> and pads
    predicted_words = [idx_to_word[w] for w in seq if w not in special_tokens]
    print('Model prediction:')
    print(' '.join(predicted_words))

    # References: the human captions of the first image, without special tokens.
    # (They used to be printed under the 'Model prediction:' header in place
    # of the prediction.)
    print('Reference captions:')
    for ref in allcaps[0].tolist():
        print(' '.join(idx_to_word[w] for w in ref if w not in special_tokens))

    return predicted_words

In [None]:
import matplotlib. pyplot as plt 
from sklearn.preprocessing import MinMaxScaler

# Validation loader; every batch yields (imgs, caps, caplens, allcaps)
val_loader = torch.utils.data.DataLoader(
    CaptionDataset('val'),
    batch_size=batch_size,  # batch size from the hyper-parameter cell
    shuffle=True,  # a different image is shown on every run
    num_workers=1,  # one loader worker
    pin_memory=True
)

# Inverse vocabulary: number -> word
rev_word_map = {v: k for k, v in word_map.items()}

# Show the first image of the first batch with its label and the model's caption
for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
  # Channel first -> channel last for plotting
  img = imgs[0].detach().numpy().transpose([1,2,0])
  # Rescale the normalized image to [0, 1] so that it can be displayed
  normalized_img = (img-np.min(img))/(np.max(img)-np.min(img))
  # The stored images were read with cv2 (BGR), so convert to RGB for matplotlib
  plt.imshow(cv2.cvtColor(normalized_img, cv2.COLOR_BGR2RGB))
  plt.show()
  print('Label:')
  # Decode the first caption word by word, up to and including <end>
  sentence = []
  for num in caps[0]:
    word = rev_word_map[int(num.numpy())]
    sentence.append(word)
    if word == '<end>': break
  print(' '.join(sentence))
  inference(imgs[0], allcaps)
  # Only the first batch is needed for the demo
  break

In [None]:
!pip install ffmpeg-python
import ffmpeg