In [1]:
# https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning
import os
import numpy as np
import h5py
import json
import torch
import imageio
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision
from tqdm import tqdm
from collections import Counter
from random import seed, choice, sample
from torch.utils.data import Dataset
from PIL import Image
from torchvision.transforms import transforms
from torch.nn.utils.rnn import pack_padded_sequence



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.11/site-p

In [2]:
def save_checkpoint(epoch, encoder, decoder, decoder_optimizer,
                    bleu4, is_best):
    """Write the training state for one epoch to the current directory.

    The state is saved to ``checkpoint_<epoch>.pth.tar``. When ``is_best`` is
    true, a second copy is written to ``BEST_checkpoint_<epoch>.pth.tar`` so the
    best-scoring checkpoints can be told apart from the regular ones.

    Args:
        epoch: epoch number; used in the file name and stored in the state.
        encoder: encoder object, stored as-is under ``'encoder'``.
        decoder: decoder object, stored as-is under ``'decoder'``.
        decoder_optimizer: optimizer object, stored under ``'decoder_optimizer'``.
        bleu4: validation score for this epoch, stored under ``'bleu-4'``.
        is_best: whether this checkpoint is the best one so far.
    """
    state = {'epoch': epoch,
             'bleu-4': bleu4,  # previously accepted but silently dropped
             'encoder': encoder,
             'decoder': decoder,
             'decoder_optimizer': decoder_optimizer}
    filename = 'checkpoint_' + str(epoch) + '.pth.tar'
    torch.save(state, filename)
    # Keep a separately named copy of the best checkpoint. The original code
    # re-saved to the very same file name, so no distinct copy was produced.
    if is_best:
        torch.save(state, 'BEST_' + filename)

In [3]:
class AverageMeter(object):
    """Keeps the most recent value, the weighted sum, the count and the mean of a metric."""

    def __init__(self):
        # All statistics are initialised in one place: reset().
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` occurrences.

        ``val * n`` is added to the running sum and ``n`` to the running
        count; the average is then recomputed as ``sum / count``.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        total, seen = self.sum, self.count
        self.avg = total / seen

In [4]:
# Adjust learning rate
def adjust_learning_rate(optimizer, shrink_factor):
    """Multiply the learning rate of every parameter group by ``shrink_factor``.

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts that each
            hold an ``'lr'`` entry.
        shrink_factor: factor the learning rates are multiplied by.
    """
    print("\nDECAYING learning rate.")
    # Decay every group, not just the first one, so optimizers built with
    # several parameter groups do not keep a stale rate on the remaining groups.
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    if optimizer.param_groups:
        # Six decimals: with three, a rate such as 2e-4 was reported as 0.000.
        print("The new learning rate is: {:.6f}".format(optimizer.param_groups[0]['lr']))

In [5]:
def accuracy(predictions, targets, k):
    """Top-k accuracy, returned as a percentage.

    predictions: 2-D tensor of scores, one row per target; the ``k`` highest
        scoring column indices of each row are compared with the target.
        (Per the original note this is the flattened
        ``(sum of decode_lengths, vocab_size)`` form.)
    targets: tensor with one entry per row of ``predictions``.
    k: how many of the top predictions per row count as a hit.
    """
    num_targets = targets.size(0)
    # Indices of the k largest scores in each row, best first.
    _, top_indices = predictions.topk(k, dim=1, largest=True, sorted=True)
    # Repeat each target across the k columns so it can be matched element-wise.
    target_grid = targets.view(-1, 1).expand_as(top_indices)
    hits = top_indices.eq(target_grid)
    hit_count = hits.view(-1).float().sum()
    return hit_count.item() * (100.0 / num_targets)

In [6]:
def example_1():
    """Demo of the tensor reshaping used in ``accuracy``.

    Prints random scores and targets, the top-3 column indices of each score
    row, and the targets after ``view(-1, 1).expand_as(...)`` has repeated
    them across those three columns.
    """
    scores = torch.randn(4, 7)
    labels = torch.randn(4)
    print("predictions: ", scores)
    print("targets: ", labels)
    # topk returns (values, indices); only the indices are needed here.
    top_indices = scores.topk(3, dim=1, largest=True, sorted=True)[1]
    print("ind: ", top_indices)
    print("ind.shape: ", top_indices.shape)
    labels = labels.view(-1, 1).expand_as(top_indices)
    print("transforms: ", labels)


example_1()

predictions:  tensor([[-0.0576,  0.6382,  0.5079,  0.6549,  1.9575,  0.0989,  1.0490],
        [ 0.0279, -0.2752, -1.0813, -0.7613,  1.6496,  1.0647,  1.6779],
        [ 0.3651, -1.9554,  0.3273,  2.1166,  1.8529, -0.3284,  1.2625],
        [-0.8952,  0.5363,  1.9846, -1.0545,  1.0618, -1.4901, -0.8384]])
targets:  tensor([-0.3005,  0.4374,  0.0802, -1.9800])
ind:  tensor([[4, 6, 3],
        [6, 4, 5],
        [3, 4, 6],
        [2, 4, 1]])
ind.shape:  torch.Size([4, 3])
transforms:  tensor([[-0.3005, -0.3005, -0.3005],
        [ 0.4374,  0.4374,  0.4374],
        [ 0.0802,  0.0802,  0.0802],
        [-1.9800, -1.9800, -1.9800]])


In [7]:
class Flicker8kDataset(Dataset):
    """Image/caption pairs read from ``TRAIN_IMAGES.hdf5`` and the TRAIN_* JSON files.

    One item is produced per caption; several consecutive captions share the
    same image (``captions_per_image`` of them, as stored in the HDF5 file).
    """

    def __init__(self, data_transforms):
        """Open the image store and load captions and caption lengths.

        data_transforms: callable applied to each image tensor, or None to
            return the image untransformed.
        """
        # Image store: kept open for the lifetime of the dataset object.
        self.h = h5py.File('TRAIN_IMAGES.hdf5', mode='r')
        self.img = self.h['images']
        self.cpi = self.h.attrs['captions_per_image']
        # Captions, one entry per datapoint.
        with open('TRAIN_CAPTIONS.json', 'r') as captions_file:
            self.captions = json.load(captions_file)
        # Length of each caption.
        with open('TRAIN_CAPLENS.json', 'r') as caplens_file:
            self.caplens = json.load(caplens_file)
        # The number of captions defines the dataset size.
        self.dataset_size = len(self.captions)
        self.transform = data_transforms

    def __getitem__(self, i):
        """Return ``(image, caption, caption_length)`` for caption index ``i``."""
        # Caption i belongs to image i // cpi because each image has cpi captions.
        image_index = i // self.cpi
        # Scale by 255 (presumably 8-bit pixel data, giving values in [0, 1]).
        scaled_pixels = self.img[image_index] / 255.
        img = torch.FloatTensor(scaled_pixels)
        if self.transform is not None:
            img = self.transform(img)
        caption = torch.LongTensor(self.captions[i])
        caplen = torch.LongTensor([self.caplens[i]])
        return img, caption, caplen

    def __len__(self):
        """Number of datapoints (captions) in the dataset."""
        return self.dataset_size
       

In [8]:
# Shuffled batches of 10 over the training data, without any image transform.
train_loader = torch.utils.data.DataLoader(
    dataset=Flicker8kDataset(data_transforms=None),
    batch_size=10,
    shuffle=True,
    pin_memory=True,
)

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'TRAIN_IMAGES.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
#img, caption, caplen = next(iter(train_loader)) 

In [9]:
class Encoder(nn.Module):
    """Image encoder: a pretrained ResNet-101 trunk followed by global average pooling.

    Training note: when a pretrained CNN is used in an encoder-decoder setup,
    the CNN layers are usually frozen for the first training steps so that only
    the LSTM decoder is trained; the CNN is then fine-tuned together with the
    decoder for a few more epochs. The encoder therefore starts frozen; call
    ``fine_tune(True)`` to make its weights trainable.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        resnet = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101
        # Drop the last two children (the pooling layer and the fully connected
        # classifier) so the trunk outputs a spatial feature map.
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        # Adaptive pooling always reduces the feature map to 1x1, whatever the
        # input resolution. The previous fixed 14x14 window only produced a
        # 1x1 output for feature maps of exactly that size.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # Start frozen. This call used to raise TypeError because the method
        # had no default for its argument.
        self.fine_tune()

    def fine_tune(self, fine_tune=False):
        """Enable (True) or disable (False) gradients for every ResNet parameter."""
        for p in self.resnet.parameters():
            p.requires_grad = fine_tune

    def forward(self, images):
        """Encode a batch of images into one feature vector per image.

        images: (batch_size, 3, height, width)
        returns: (batch_size, channels), where channels is the channel count of
            the trunk output (2048 per the original notes for ResNet-101).
        """
        batch_size = images.shape[0]
        encoded_image = self.resnet(images)
        global_features = self.avgpool(encoded_image)  # (batch_size, channels, 1, 1)
        return global_features.view(batch_size, -1)  # (batch_size, channels)




In [None]:
class Decoder(nn.Module):
    """LSTM caption decoder conditioned on a global image feature vector.

    At every time step the LSTM cell receives the embedding of the current
    caption token concatenated with the image feature vector, and a linear
    layer maps the hidden state to scores over the vocabulary.
    """

    def __init__(self, embed_dim, decoder_dim, vocab_size, encoder_dim=2048):
        """
        embed_dim: size of the word embeddings
        decoder_dim: size of the LSTM hidden and cell states
        vocab_size: number of tokens in the vocabulary
        encoder_dim: size of the image feature vector
        """
        super(Decoder, self).__init__()
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim
        # Define layers
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # embedding layer
        # The LSTM input is the word embedding concatenated with the image
        # feature, hence embed_dim + encoder_dim.
        self.lstm = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)
        self.fc = nn.Linear(decoder_dim, vocab_size)  # classification layer
        # Initialize the weights
        self.init_weights()

    def init_weights(self):
        """Uniform(-0.1, 0.1) init for the embedding and output weights; zero output bias."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)

    def init_hidden_state(self, batch_size):
        """Return zero-filled hidden and cell states of shape (batch_size, decoder_dim).

        The states are created on the device that holds this module's
        parameters, so no module-level ``device`` variable is required.
        """
        module_device = self.fc.weight.device
        h = torch.zeros(batch_size, self.decoder_dim, device=module_device)  # hidden state
        c = torch.zeros(batch_size, self.decoder_dim, device=module_device)  # cell state
        return h, c

    def forward(self, global_iamge, encoded_captions, caption_lengths):
        """Run the decoder over a batch of captions.

        global_iamge: (batch_size, encoder_dim) image features. The parameter
            name is misspelled; it is kept so the signature stays unchanged.
        encoded_captions: (batch_size, max_caption_length) token indices
        caption_lengths: (batch_size, 1)

        Returns ``(predictions, encoded_captions, decode_lengths, sort_indx)``:
        predictions is (batch_size, max(decode_lengths), vocab_size) and stays
        zero at the steps beyond a caption's decode length; encoded_captions is
        reordered by decreasing caption length; decode_lengths is a list of
        ints; sort_indx is the permutation that was applied to the batch.
        """
        batch_size = global_iamge.size(0)
        # Sort by decreasing caption length so that, at step t, the samples
        # still being decoded are exactly the first batch_size_t rows.
        caption_lengths, sort_indx = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        # The sort changed the sample order, so the image features and the
        # captions are reordered the same way. (Previously this line read an
        # unassigned local because of the misspelled parameter name.)
        global_image = global_iamge[sort_indx]
        encoded_captions = encoded_captions[sort_indx]  # (batch_size, max_caption_length)
        # Run embeddings layer
        embeddings = self.embedding(encoded_captions)  # (batch_size, max_caption_length, embed_dim)
        # Initialize LSTM hidden state
        h, c = self.init_hidden_state(batch_size)
        # One decoding step fewer than the caption length (presumably because
        # nothing is predicted after the final <end> token - confirm against
        # the caption encoding).
        decode_lengths = (caption_lengths - 1).tolist()
        max_decode_length = max(decode_lengths)
        predictions = torch.zeros(batch_size, max_decode_length, self.vocab_size,
                                  device=h.device)

        for t in range(max_decode_length):
            # Number of captions that still have a token to decode at step t.
            batch_size_t = sum(l > t for l in decode_lengths)
            lstm_input = torch.cat([embeddings[:batch_size_t, t, :],
                                    global_image[:batch_size_t]], dim=-1)
            h, c = self.lstm(lstm_input, (h[:batch_size_t], c[:batch_size_t]))
            predictions[:batch_size_t, t, :] = self.fc(h)

        return predictions, encoded_captions, decode_lengths, sort_indx



        