<a href="https://colab.research.google.com/github/jaindivij12/Automatic_image_captioning/blob/main/Automatic_image_captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data files read later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string

In [19]:
def load_document(filename, encoding=None):
	"""Read a text file and return its entire contents as a single string.

	Args:
		filename: Path of the file to read.
		encoding: Text encoding handed to ``open``. The default ``None``
			keeps the platform default, i.e. the behaviour of calling
			``open(filename, 'r')`` without an encoding.

	Returns:
		The full text of the file.

	Raises:
		OSError: If the file cannot be opened or read.
	"""
	# The context manager closes the handle even when read() raises.
	with open(filename, 'r', encoding=encoding) as file:
		return file.read()

# Caption file: each line is "<image>.jpg#<n><TAB><caption>".
filename = "/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr8k.token.txt"
# Call load_document(), the loader defined in this cell; the name load_doc is
# not defined anywhere before this point and raises NameError on a fresh kernel.
doc = load_document(filename)
# Peek at the first 300 characters to check the file format.
print(doc[:300])


1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the s


In [12]:

def load_descriptions(doc):
	"""Parse raw caption text into a mapping of image id -> list of captions.

	Every usable line starts with an image identifier token (for example
	``1000268201_693b08cb0e.jpg#0``) followed by the caption words. The
	identifier is cut at its first ``.``, which drops both the file
	extension and the ``#n`` caption index. Caption words are re-joined
	with single spaces.

	Args:
		doc: Full text of the caption file, one caption per line.

	Returns:
		dict mapping each image id (str) to the list of its caption strings,
		in the order they appear in ``doc``.
	"""
	mapping = dict()
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# Skip lines shorter than two characters, and whitespace-only lines:
		# the latter produce no tokens and would raise IndexError on tokens[0].
		if len(line) < 2 or not tokens:
			continue
		# take the first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# extract filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list on first sight of this image id, then store the description
		mapping.setdefault(image_id, []).append(image_desc)
	return mapping

# Build the image-id -> captions mapping from the raw caption text
descriptions = load_descriptions(doc)
# Report how many distinct image ids were found
print('Loaded: %d ' % (len(descriptions),))


Loaded: 8092 


In [13]:
# Peek at five image ids (positions 20-24 in the dict's iteration order)
list(descriptions)[20:25]

['1024138940_f1fefbdce1',
 '102455176_5f8ead62d5',
 '1026685415_0431cbf574',
 '1028205764_7e8df9a2ea',
 '1030985833_b0902ea560']

In [14]:
# Show every caption stored for one example image id
descriptions['1024138940_f1fefbdce1']

['Two different breeds of brown and white dogs play on the beach .',
 'Two dogs are making a turn on a soft sand beach .',
 'Two dogs playing in the sand at the beach .',
 'Two dogs playing together on a beach .',
 'Two large tan dogs play along a sandy beach .']

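We apply a basic cleaning step to every caption: convert all words to lower case, remove punctuation, and drop single-character tokens and tokens that are not purely alphabetic (e.g. those containing digits).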

In [15]:
def clean_descriptions(descriptions):
	"""Normalise every caption in ``descriptions`` in place.

	Each caption is split on whitespace; every token is lower-cased and
	stripped of ``string.punctuation`` characters; tokens that end up
	shorter than two characters or that are not purely alphabetic are
	discarded. The surviving tokens are re-joined with single spaces and
	written back into the same list slot.

	Args:
		descriptions: dict mapping an image id to a list of caption strings.
			The lists are modified in place.

	Returns:
		None.
	"""
	# translation table that deletes all punctuation characters
	strip_punct = str.maketrans('', '', string.punctuation)
	for captions in descriptions.values():
		for idx, caption in enumerate(captions):
			# lower-case first, then remove punctuation from each token
			words = (token.lower().translate(strip_punct) for token in caption.split())
			# keep only alphabetic tokens of length >= 2 (drops hanging 's'/'a' and numbers)
			captions[idx] = ' '.join(w for w in words if len(w) > 1 and w.isalpha())

# clean descriptions (the caption lists are rewritten in place; the call returns nothing)
clean_descriptions(descriptions)

In [16]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
	"""Collect the set of distinct words used across all descriptions.

	Args:
		descriptions: dict mapping an image id to a list of caption strings.

	Returns:
		set of every whitespace-separated token that occurs in any caption.
	"""
	# accumulate into a set so each word is kept once
	all_desc = set()
	# plain loops instead of a list comprehension used only for its side
	# effect, which built and discarded a list of None values
	for desc_list in descriptions.values():
		for desc in desc_list:
			all_desc.update(desc.split())
	return all_desc

# Gather the distinct words over all captions and report how many there are
vocabulary = to_vocabulary(descriptions)
print('Original Vocabulary Size: %d' % (len(vocabulary),))

Original Vocabulary Size: 8763


In [17]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename, encoding=None):
	"""Write all captions to a text file, one ``"<image_id> <caption>"`` per line.

	Lines are separated by ``\\n``; no trailing newline is written.

	Args:
		descriptions: dict mapping an image id to a list of caption strings.
		filename: Path of the file to create or overwrite.
		encoding: Text encoding handed to ``open``. The default ``None``
			keeps the platform default, i.e. the behaviour of calling
			``open(filename, 'w')`` without an encoding.

	Returns:
		None.

	Raises:
		OSError: If the file cannot be opened or written.
	"""
	lines = [
		key + ' ' + desc
		for key, desc_list in descriptions.items()
		for desc in desc_list
	]
	data = '\n'.join(lines)
	# The context manager flushes and closes the handle even when write() raises.
	with open(filename, 'w', encoding=encoding) as file:
		file.write(data)

# Write the captions to 'descriptions.txt' (a relative path, so it is created in the current working directory)
save_descriptions(descriptions, 'descriptions.txt')

In [20]:
# load a pre-defined list of photo identifiers
def load_set(filename):
	"""Return the set of identifiers listed in ``filename``.

	The file is read with ``load_document`` and split on newlines. Empty
	lines are ignored; for every other line the text before its first
	``.`` is taken as the identifier.

	Args:
		filename: Path of a text file with one entry per line.

	Returns:
		set of identifier strings (duplicates collapse).
	"""
	listing = load_document(filename)
	# everything before the first '.' is the identifier; empty lines are skipped
	return {entry.split('.')[0] for entry in listing.split('\n') if len(entry) >= 1}

# Training split (6K): read the identifiers listed in the trainImages file
filename = '/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
train = load_set(filename)
# Report how many distinct identifiers were loaded
print('Dataset: %d' % (len(train),))

Dataset: 6000


In [21]:
# NOTE(review): this prints the entire set (6000 identifiers according to the preceding cell's output) and floods the notebook; consider showing only a small sample.
print(train)

{'1193116658_c0161c35b5', '1295669416_21cabf594d', '2742426734_291df6da08', '3533775651_9d7e93dacf', '2854291706_d4c31dbf56', '2564663851_3a9832e4fc', '3747543364_bf5b548527', '3271084924_4778d556cc', '2272426567_9e9fb79db0', '792362827_5ab5281b99', '3681172959_6674c118d2', '2860202109_97b2b22652', '397451339_76a84bd310', '3657503733_9888ccf05e', '247652942_29ede19352', '487487795_54705c406e', '2273591668_069dcb4641', '823697339_aadbeef495', '2892992529_f3335d0a71', '1311132744_5ffd03f831', '2952320230_26601173be', '3662871327_b128d25f04', '2374179071_af22170d62', '2392460773_2aa01eb340', '3219210794_4324df188b', '3687996569_99163a41c3', '3122938209_2b2c6c1fab', '2432038587_5e4148e277', '485566887_57eac33bd1', '2633201394_ee4a7666ed', '3050976633_9c25cf6fa0', '2186139563_e60c1d4b8b', '428483413_b9370baf72', '3324056835_84904fe2f8', '1022975728_75515238d8', '3656906086_7034f69ab6', '1229756013_94663527d7', '2775744946_1ab5d500a2', '267164457_2e8b4d30aa', '431282339_0aa60dd78e', '3517466