In [119]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import re
import cv2
import json
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the old names, so prefer the new name
# when it is available and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

### Loading Captions

In [120]:
# Read the raw caption file and keep one record per non-empty line.
# Filtering empty lines (instead of blindly dropping the last entry) keeps the
# final caption even when the file does not end with a newline.
with open('../Flickr8k/Flickr_TextData/Flickr8k.token.txt') as f:
    raw_text=f.read()
data=[line for line in raw_text.split('\n') if line]

In [121]:
# Inspect the first raw record: '<image file>#<caption index>\t<caption text>'.
data[0]

'1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .'

### Data Cleaning
Stemming, lemmatisation and stopword removal are deliberately not applied while cleaning the captions, so that the model learns to write captions in correct English. Otherwise, for a caption like 'A child in a pink dress is climbing up a set of stairs in an entry way', the model would predict a caption of the form 'child pink dress climb set of stairs entry way', which doesn't make sense.

In [122]:
def clean_text(sentence):
    """Lower-case a caption and collapse every run of non-letters into one space.

    Anything that is not in a-z after lower-casing (digits, punctuation,
    whitespace, ...) is replaced, so a trailing ' .' becomes a single
    trailing space.
    """
    lowered = sentence.lower()
    return re.sub("[^a-z]+", " ", lowered)

In [123]:
# Group the cleaned captions by image id, i.e. the part of the first
# tab-separated field that precedes '.jpg'.
captions={}
for record in data:
    raw_name,raw_caption=record.split('\t')
    image_id=raw_name.split('.jpg')[0]
    # setdefault creates the list the first time an image id is seen.
    captions.setdefault(image_id,[]).append(clean_text(raw_caption))

In [124]:
# Write the cleaned captions (image id -> list of captions) to disk as JSON.
with open("cleaned_caption_description.json", "w") as out_file:
    out_file.write(json.dumps(captions))

In [125]:
# Read the cleaned captions back from the JSON file written above.
with open("cleaned_caption_description.json", "r") as in_file:
    captions=json.loads(in_file.read())

### Creating Vocab

In [126]:
# Flatten every caption of every image into one list of whitespace-separated words.
total_words=[word for sents in captions.values() for sent in sents for word in sent.split()]
print('total words= {}'.format(len(total_words)))
# np.unique returns the sorted distinct words and their frequencies as two
# parallel arrays. Pair them up directly so the frequencies stay integers
# instead of being coerced to strings by stacking both arrays in one np.array.
unique_words,frequencies=np.unique(total_words,return_counts=True)
count=list(zip(unique_words.tolist(),frequencies.tolist()))
print('no of unique words= {}'.format(len(count))) #no of unique words

total words= 437466
no of unique words= 8441


In [127]:
# Shrink the vocabulary: keep only the words whose frequency is strictly
# greater than `size` (words occurring `size` times or fewer are dropped).
size=10
vocab=[word for word,freq in count if int(freq)>size]
print('no of unique words= {}'.format(len(vocab)))

no of unique words= 1850


### Load train and test data

In [108]:
# Read the train/test split files; each entry is kept exactly as it appears
# in the file, including its line ending.
train,test=None,[]
with open('../Flickr8k/Flickr_TextData/Flickr_8k.trainImages.txt') as train_file:
    train=list(train_file)
with open('../Flickr8k/Flickr_TextData/Flickr_8k.testImages.txt') as test_file:
    test=list(test_file)

In [112]:
# Reduce each entry to the text before '.jpg'; the extension and anything
# after it (such as the line ending) is discarded.
train=[entry.split('.jpg',1)[0] for entry in train]
test=[entry.split('.jpg',1)[0] for entry in test]

In [110]:
# Show the first five training image ids as a quick sanity check.
print(train[:5])

['2513260012_03d33305cf', '2903617548_d3e38d7f88', '3338291921_fe7ae0c8f8', '488416045_1c6d903fe0', '2644326817_8f45080b87']


<start\> (start-of-sequence) and <end\> (end-of-sequence) tokens are added to every caption to help our model determine when to start and - more importantly - when to end a sequence.
While generating text (captions), the model must know when to stop; otherwise it will keep predicting words and appending them to the caption. That's why we teach the model to decide the length of a caption by itself: when the model predicts <end\> as the next word of the caption, it should stop predicting more words.

In [130]:
#prepare train descriptions
# Wrap every training caption in <start> / <end> markers. The markers are
# separated from the caption by a single space so that splitting on whitespace
# yields them as standalone tokens (plain concatenation fused the first word
# into '<start>word'); strip() removes the stray leading/trailing space left
# by caption cleaning so no double spaces are produced.
train_descriptions={}
for t in train:
    train_descriptions[t]=['<start> '+sent.strip()+' <end>' for sent in captions[t]]

In [133]:
# Write the wrapped training captions to disk as JSON.
with open('train_descriptions.json','w') as out_file:
    out_file.write(json.dumps(train_descriptions))

In [134]:
# Read the training captions back from the JSON file.
with open('train_descriptions.json','r') as in_file:
    train_descriptions=json.loads(in_file.read())