In [1]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
# small library for seeing the progress of loops.
from tqdm.autonotebook import tqdm
tqdm.pandas()

Using TensorFlow backend.
  from pandas import Panel


In [2]:
# Loading a text file into memory
def load_doc(filename):
    file=open(filename,'r')
    text=file.read()
    file.close()
    return text
load_doc('Flickr8k.token.txt')



In [4]:
# get all imgs with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions
all_img_captions('Flickr8k.token.txt')

{'1000268201_693b08cb0e.j': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building',
  'little girl climbing into wooden playhouse',
  'little girl climbing the stairs to her playhouse',
  'little girl in pink dress going into wooden cabin'],
 '1001773457_577c3a7d70.j': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road',
  'black dog and white dog with brown spots are staring at each other in the street',
  'two dogs of different breeds looking at each other on the road',
  'two dogs on pavement moving toward each other'],
 '1002674143_1b742ab4b8.j': ['little girl covered in paint sits in front of painted rainbow with her hands in bowl',
  'little girl is sitting in front of large painted rainbow',
  'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it',
  'there is girl with pigtails sitting in front of rainbow painting',
  'young girl w

In [5]:
#Data cleaning- lower casing, removing puntuations and words containing numbers
captions=all_img_captions('Flickr8k.token.txt')
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc = img_caption.split()
            #converts to lowercase
            desc = [word.lower() for word in desc]
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            #convert back to string
            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions
cleaning_text(captions)

{'1000268201_693b08cb0e.j': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building',
  'little girl climbing into wooden playhouse',
  'little girl climbing the stairs to her playhouse',
  'little girl in pink dress going into wooden cabin'],
 '1001773457_577c3a7d70.j': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road',
  'black dog and white dog with brown spots are staring at each other in the street',
  'two dogs of different breeds looking at each other on the road',
  'two dogs on pavement moving toward each other'],
 '1002674143_1b742ab4b8.j': ['little girl covered in paint sits in front of painted rainbow with her hands in bowl',
  'little girl is sitting in front of large painted rainbow',
  'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it',
  'there is girl with pigtails sitting in front of rainbow painting',
  'young girl w

In [6]:
#vocabulary list
captions=all_img_captions('Flickr8k.token.txt')
descriptions=cleaning_text(captions)
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab
text_vocabulary(descriptions)

{'licked',
 'lie',
 'skimpy',
 'beside',
 'bloody',
 'begin',
 'differentcolored',
 'bug',
 'stood',
 'backpacks',
 'roddick',
 'countryside',
 'stoppie',
 'bro',
 'reached',
 'avoiding',
 'constructions',
 'necks',
 'buildings',
 'canes',
 'judo',
 'miasto',
 'teenager',
 'shown',
 'uniforms',
 'vests',
 'amid',
 'cocacola',
 'nearby',
 'tricycles',
 'snowgear',
 'pieces',
 'partake',
 'took',
 'locker',
 'gokart',
 'bmx',
 'winning',
 'getting',
 'mirror',
 'beaten',
 'hit',
 'ledges',
 'chess',
 'alike',
 'filled',
 'videotaped',
 'mans',
 'safely',
 'stripes',
 'celebrities',
 'rounds',
 'couple',
 'presses',
 'torii',
 'intently',
 'clay',
 'tigger',
 'water',
 'duel',
 'strong',
 'arched',
 'winter',
 'mouths',
 'treetops',
 'travelling',
 'multicolored',
 'attaching',
 'melted',
 'pillowcase',
 'grows',
 'headband',
 'handicap',
 'graze',
 'slipping',
 'strapped',
 'wicket',
 'leaves',
 'cds',
 'india',
 'continues',
 'nat',
 'press',
 'used',
 'ride',
 'streaming',
 'inflating'

In [7]:
#All descriptions in one file 
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()
    return file
save_descriptions(descriptions, 'Flickr8k.token.txt')

<_io.TextIOWrapper name='Flickr8k.token.txt' mode='w' encoding='cp1252'>

In [21]:
# Set these path according to project folder in your system
from pathlib import Path
from zipfile import ZipFile
dataset_text = ZipFile("C:/Image Caption Generator/Flickr8k_text.zip")
dataset_images = ZipFile("C:/Image Caption Generator/Flickr8k_Dataset.zip")

#we prepare our text data
filename = dataset_text/"Flickr8k.token.txt"
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =" ,len(descriptions))

#cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)
#building vocabulary 
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))
#saving each description to file 
save_descriptions(clean_descriptions, "descriptions.txt")

TypeError: unsupported operand type(s) for /: 'ZipFile' and 'str'