# Image Captioning

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import spacy
import re
import random
import json

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
%load_ext autoreload
%autoreload 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [3]:
extracted_data_train_dir = 'extracted_data/train/'
extracted_data_test_dir = 'extracted_data/test/'

In [4]:
def load_json(file):
    with open(file, 'r') as f1:
        return json.loads(f1.read())

In [5]:
train_captions = load_json(extracted_data_train_dir + 'captions.json')
train_image_paths = load_json(extracted_data_train_dir + 'image_paths.json')
print(train_image_paths[0], train_captions[0])

mscoco/train/img/COCO_train2014_000000318556 <START> A very clean and well decorated empty bathroom <END>


## Building a vocabulary

In [6]:
max_vocab_size = 10000

In [7]:
from collections import defaultdict

word_freq = defaultdict(int)
word_to_idx = {"<pad>": 0, "<start>": 1, "<end>": 2, '<unk>': 3}
idx_to_word = {0: "<pad>", 1: "<start>", 2: "<end>", 3: '<unk>'}

unk_idx = word_to_idx['<unk>']

In [8]:
def preprocess_caption(caption):
    return caption.lower().split()

In [9]:
for caption in train_captions:
    caption = preprocess_caption(caption)
    for word in caption:
        if word in word_freq: word_freq[word] += 1
        else: word_freq[word] = 1

word_freq.pop("<start>", None)
word_freq.pop("<end>", None)

249454

In [10]:
start_idx = len(word_to_idx)
print(start_idx)

# get top K words
for word in sorted(word_freq, key=word_freq.get, reverse=True):
    word_to_idx[word] = start_idx
    idx_to_word[start_idx] = word
    start_idx += 1
    if start_idx >= max_vocab_size: break

len(word_to_idx)

4


10000

In [11]:
def encode_sentence(sentence):
    return [word_to_idx.get(x, unk_idx) for x in preprocess_caption(sentence)]

def decode_sentence(indices):
    return " ".join([idx_to_word.get(x, '<unk>') for x in indices])

In [12]:
sentence = 'This is not fun'
encoded = encode_sentence(sentence)
decoded = decode_sentence(encoded)

print(encoded)
print(decoded)

[132, 11, 665, 1931]
this is not fun


Create encoded captions for each caption in training

In [13]:
encoded_captions_train = [encode_sentence(x) for x in train_captions]

print(encoded_captions_train[:1])
print(decode_sentence(encoded_captions_train[0]))
print(train_captions[:1])

[[1, 4, 125, 473, 10, 742, 480, 230, 38, 2]]
<start> a very clean and well decorated empty bathroom <end>
['<START> A very clean and well decorated empty bathroom <END>']


In [14]:
len(encoded_captions_train)

249454

## Create Dataset

In [153]:
from torch.utils.data import DataLoader
from MyDataset import MyDataset

In [154]:
from torch.nn.utils.rnn import pad_sequence
wat = [torch.tensor(x, dtype=torch.int16) for x in encoded_captions_train]
padded = pad_sequence(wat).permute(1, 0)
print(len(padded))

249454


In [155]:
dataset = MyDataset(enc_captions=padded,
                    image_paths=train_image_paths,
                   data_dir=extracted_data_train_dir + 'vecs/')

In [156]:
x = dataset[5]

In [157]:
dataset[0][0].shape

torch.Size([64, 2048])

In [158]:
dataloader = DataLoader(dataset=dataset, batch_size=64, 
                         num_workers = 3)

In [159]:
for idx, data in enumerate(dataloader):
    imgs, labels = data[0], data[1]
    print(idx, imgs.shape)
    print(labels[0].shape)
    print(labels[0].tolist())
    print(decode_sentence(labels[0].tolist()))
    break

0 torch.Size([64, 64, 2048])
torch.Size([51])
[1, 4, 125, 473, 10, 742, 480, 230, 38, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
<start> a very clean and well decorated empty bathroom <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


## Train model

In [327]:
from layers.Encoder import Encoder
from layers.Decoder import Decoder
from layers.Attention import Attention
from layers.End2End import End2End

In [419]:
# INPUT_DIM = len(SRC.vocab)
# OUTPUT_DIM = len(TRG.vocab)
DEC_EMB_DIM = 256
ENC_INPUT = 2048
ENC_OUTPUT = 256
DEC_HID_DIM = 512
ATTN_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [426]:
encoder = Encoder(ENC_INPUT, ENC_OUTPUT)
decoder = Decoder(ENC_OUTPUT, DEC_HID_DIM, DEC_HID_DIM, ATTN_DIM, device, vocab_size=max_vocab_size)

model = End2End(enc, dec, device).to(device)

In [427]:
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,216,833 trainable parameters


In [428]:
optimizer = optim.Adam(model.parameters())

In [429]:
PAD_IDX = 0

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [441]:
def train(img_tensor, target):

    model.train()

    loss = 0
    
    hidden = decoder.reset_state(batch_size=target.shape[0])
    dec_input = [word_to_idx['<start>']] * target.shape[0]
    dec_input = torch.as_tensor(dec_input).unsqueeze(1)
    
    features = encoder(img_tensor)
    for i in range(target.shape[0]):
        
        predictions, hidden = decoder(dec_input, features, hidden)
#         print("PREDS: ", predictions.shape)
#         print("CONCR: ", target[:,i].unsqueeze(-1).shape)
#         return
#         loss += criterion(target[:,i].unsqueeze(-1), predictions)
        loss += cire
    
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()
    total_loss = (loss / int(target.shape[1]))

    return loss, total_loss

In [442]:
import time

loss_plot = []
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    
    for idx, batch in enumerate(dataloader):
        img_tensor, target = batch[0], batch[1]
        
        batch_loss, t_loss = train(img_tensor, target)
        total_loss += t_loss
        
        
        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
    
        # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/num_steps))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

AttributeError: 'int' object has no attribute 'backward'

In [None]:
plt.plot(loss_plot)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.show()