# Image Captioning

In [17]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import spacy
import re
import random
import json

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [16]:
# Relative directory prefixes for the extracted train/test data.
# The trailing slash matters: file names are appended to the train prefix
# by plain string concatenation when the JSON files are loaded.
extracted_data_train_dir = 'extracted_data/train/'
extracted_data_test_dir = 'extracted_data/test/'

In [18]:
def load_json(file):
    """Read a JSON document from disk and return the parsed Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized top-level JSON value (list, dict, ...).
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the
    # platform default so non-ASCII captions decode the same everywhere.
    with open(file, 'r', encoding='utf-8') as f1:
        # Parse straight from the file handle.
        return json.load(f1)

In [20]:
# Load the training captions and image paths (in that order) from the
# extracted-data directory, then preview the first image/caption pair.
train_captions, train_image_paths = (
    load_json(extracted_data_train_dir + json_name)
    for json_name in ('captions.json', 'image_paths.json')
)
print(train_image_paths[0], train_captions[0])

mscoco/train/img/COCO_train2014_000000318556 <START> A very clean and well decorated empty bathroom <END>


## Building a vocabulary

In [23]:
# Upper bound on the vocabulary size, counting the reserved special tokens.
max_vocab_size = 10000

In [125]:
from collections import defaultdict

# Per-token occurrence counter; missing tokens read as 0.
word_freq = defaultdict(int)

# Reserved tokens take the lowest indices, in this fixed order.
word_to_idx = {
    token: index
    for index, token in enumerate(("<pad>", "<start>", "<end>", '<unk>'))
}
# Reverse lookup derived from the forward mapping so the two cannot drift.
idx_to_word = {index: token for token, index in word_to_idx.items()}

# Index used for out-of-vocabulary tokens.
unk_idx = word_to_idx['<unk>']

In [126]:
def preprocess_caption(caption):
    """Lowercase a caption and split it into whitespace-delimited tokens.

    Args:
        caption: The caption text.

    Returns:
        List of lowercase tokens; empty when the caption has no tokens.
    """
    lowered = caption.lower()
    tokens = lowered.split()
    return tokens

In [127]:
# Start from an empty counter so that re-running this cell does not add
# the counts on top of those from a previous run.
word_freq.clear()

for caption in train_captions:
    for word in preprocess_caption(caption):
        # word_freq is a defaultdict(int), so unseen tokens start at 0.
        word_freq[word] += 1

# The sentence markers already have reserved vocabulary indices, so they
# must not compete for a frequency-ranked slot.
word_freq.pop("<start>", None)
word_freq.pop("<end>", None)

249454

In [128]:
# Next free vocabulary index: everything below it is already assigned.
start_idx = len(word_to_idx)
print(start_idx)

# Add the most frequent words until the vocabulary holds max_vocab_size
# entries (reserved tokens included).
for word in sorted(word_freq, key=word_freq.get, reverse=True):
    # Check capacity *before* inserting: this keeps the cell safe to re-run
    # (a full vocabulary is left untouched) and never exceeds the limit.
    if len(word_to_idx) >= max_vocab_size:
        break
    # Never remap a token that already has an index (e.g. a reserved token
    # that happens to appear literally in a caption).
    if word in word_to_idx:
        continue
    word_to_idx[word] = start_idx
    idx_to_word[start_idx] = word
    start_idx += 1

len(word_to_idx)

4


10000

In [130]:
def encode_sentence(sentence):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is tokenized with preprocess_caption; tokens missing from
    word_to_idx are mapped to unk_idx.
    """
    indices = []
    for token in preprocess_caption(sentence):
        indices.append(word_to_idx.get(token, unk_idx))
    return indices

def decode_sentence(indices):
    """Turn a sequence of vocabulary indices back into a space-joined string.

    Indices missing from idx_to_word are rendered as '<unk>'.
    """
    words = (idx_to_word.get(index, '<unk>') for index in indices)
    return " ".join(words)

In [131]:
# Sanity check: encode a sample sentence, decode it back, and show both.
sentence = 'This is not fun'
encoded = encode_sentence(sentence)
decoded = decode_sentence(encoded)

print(encoded, decoded, sep='\n')

[132, 11, 665, 1931]
this is not fun


Encode every training caption as a list of vocabulary indices.

In [133]:
# One list of vocabulary indices per training caption, in the same order.
encoded_captions_train = list(map(encode_sentence, train_captions))

# Show the first encoded caption next to its original text.
print(encoded_captions_train[:1])
print(train_captions[:1])

[[1, 4, 125, 473, 10, 742, 480, 230, 38, 2]]
['<START> A very clean and well decorated empty bathroom <END>']
