In [1]:
import pandas as pd
import string
import glob

In [2]:
def save_descriptions(descriptions, filename):
    """Write every caption to ``filename``, one ``<key> <caption>`` line each.

    Args:
        descriptions: Mapping of image id to a list of caption strings.
        filename: Path of the text file to create or overwrite.

    Lines are joined with a newline character and no trailing newline is
    written, so an empty mapping produces an empty file.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

In [3]:
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in ``dataset`` and wrap them in sequence markers.

    Args:
        filename: Path to a text file with one ``<image_id> <caption words...>``
            entry per line (the layout written by ``save_descriptions``).
        dataset: Container of image ids to keep; only membership tests
            (``in``) are performed on it.

    Returns:
        dict mapping each retained image id to a list of captions, each of
        the form ``'startseq <caption> endseq'``.
    """
    # The context manager closes the handle even if the read raises.
    with open(filename, 'r') as file:
        doc = file.read()
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # Blank or whitespace-only lines (e.g. after a trailing newline, or an
        # empty file) have no id; skip them instead of failing on tokens[0].
        if not tokens:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # wrap description in start/end markers and store it
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, list()).append(desc)
    return descriptions

## Load all captions

In [4]:
# Path to the caption file that is read and parsed below (relative to the working directory).
all_captions_filename = "../../datasets/flickr8k/Flickr8k_text/Flickr8k.token.txt"

In [5]:
# Read the whole caption file into memory; the with-block closes the handle
# even if the read fails.
with open(all_captions_filename, 'r') as file:
    all_captions = file.read()

In [6]:
# Group the captions by image id: the id is the text before the first '.' of
# the first whitespace-separated token, the caption is the remaining tokens.
captions = dict()
for line in all_captions.split('\n'):
    tokens = line.split()
    # Skip very short lines, and also whitespace-only lines: those have no
    # tokens and would otherwise raise IndexError on tokens[0].
    if len(line) < 2 or not tokens:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    captions.setdefault(image_id, list()).append(image_desc)

print('Loaded: %d ' % len(captions))

Loaded: 8092 


In [7]:
# Display the captions stored for one example image id.
captions["1000268201_693b08cb0e"]

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

### Preprocess all captions

In [8]:
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is split on whitespace; every token is lower-cased and has
    ASCII punctuation removed; tokens that are then one character or shorter,
    or that are not purely alphabetic, are dropped; the survivors are joined
    with single spaces and written back into the same list slot.

    Args:
        descriptions: Mapping of key to a list of caption strings. The lists
            are modified in place.

    Returns:
        None.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for caption_list in descriptions.values():
        for idx, caption in enumerate(caption_list):
            # lower-case first, then remove punctuation from each token
            words = (tok.lower().translate(strip_punct) for tok in caption.split())
            # drop one-letter leftovers (e.g. 's', 'a') and tokens containing non-letters
            caption_list[idx] = ' '.join(
                w for w in words if len(w) > 1 and w.isalpha()
            )

# clean descriptions: clean_descriptions rewrites the caption lists inside
# `captions` in place and returns nothing
clean_descriptions(captions)

### Save all captions

In [9]:
# Write the cleaned captions to 'captions.txt', one "<image_id> <caption>" line per caption.
save_descriptions(captions, 'captions.txt')

## Load train captions

In [10]:
train_images_names_filepath = '../../datasets/flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
# Read the training image list as one string; the with-block closes the
# handle even if the read fails.
with open(train_images_names_filepath, 'r') as file:
    train_images_names = file.read()

In [11]:
# Training image identifiers: for every non-empty line, the text before its
# first '.'.
train_images = [
    entry.split('.')[0]
    for entry in train_images_names.split('\n')
    if entry
]
# Set form of the same identifiers, used for membership tests.
train_images_all = set(train_images)
print('Dataset: %d' % len(train_images))

Dataset: 6000


In [12]:
# Load from 'captions.txt' only the captions whose image id is in the training
# set; each caption comes back wrapped as 'startseq ... endseq'.
train_descriptions = load_clean_descriptions('captions.txt', train_images_all)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


### Save train captions

In [13]:
# Write the wrapped training captions to 'train_captions.txt'.
save_descriptions(train_descriptions, 'train_captions.txt')

## Load test captions

In [14]:
test_images_names_filepath = '../../datasets/flickr8k/Flickr8k_text/Flickr_8k.testImages.txt'
# Read the test image list as one string; the with-block closes the handle
# even if the read fails.
with open(test_images_names_filepath, 'r') as file:
    test_images_names = file.read()

In [15]:
# Test image identifiers: for every non-empty line, the text before its
# first '.'.
test_images = [
    entry.split('.')[0]
    for entry in test_images_names.split('\n')
    if entry
]
# Set form of the same identifiers, used for membership tests.
test_images_all = set(test_images)
print('Dataset: %d' % len(test_images))

Dataset: 1000


In [16]:
# Load from 'captions.txt' only the captions whose image id is in the test
# set; each caption comes back wrapped as 'startseq ... endseq'.
test_descriptions = load_clean_descriptions('captions.txt', test_images_all)
print('Descriptions: test=%d' % len(test_descriptions))

Descriptions: test=1000


### Save test captions

In [17]:
# Write the wrapped test captions to 'test_captions.txt'.
save_descriptions(test_descriptions, 'test_captions.txt')

## Process and save lemma_captions

In [20]:
# Path to the 'lemma' variant of the caption file (relative to the working directory).
lemma_captions_filename = '../../datasets/flickr8k/Flickr8k_text/Flickr8k.lemma.token.txt'

In [21]:
# Read the lemma caption file; the with-block closes the handle even if the
# read fails.
with open(lemma_captions_filename, 'r') as file:
    lemma_captions = file.read()

# NOTE: this rebinds `captions`, replacing the dictionary built earlier in the
# notebook from the non-lemma caption file.
captions = dict()
for line in lemma_captions.split('\n'):
    tokens = line.split()
    # Skip very short lines, and also whitespace-only lines: those have no
    # tokens and would otherwise raise IndexError on tokens[0].
    if len(line) < 2 or not tokens:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    captions.setdefault(image_id, list()).append(image_desc)

print('Loaded: %d ' % len(captions))

Loaded: 8092 


In [23]:
# Normalise the lemma captions in place (lower-case; punctuation, one-character
# and non-alphabetic tokens removed).
clean_descriptions(captions)

In [25]:
# Write the cleaned lemma captions to 'lemma_captions.txt'.
save_descriptions(captions, 'lemma_captions.txt')

# TEST

In [None]:
# Read the caption file as a tab-separated table with columns "image" and "caption".
df = pd.read_csv(all_captions_filename,sep="\t",names=["image","caption"])

In [None]:
# Write the table to 'descriptions.csv', space-separated, without index or header row.
df.to_csv("descriptions.csv",sep=' ', index=False,header=False)

In [None]:
# NOTE(review): no notebook-level `descriptions` is assigned above this cell
# (it is first bound in a later cell) — confirm the intended cell order.
descriptions

In [None]:
# NOTE(review): repeats the display in the cell above, and `descriptions` is
# still not bound at notebook level at this point — confirm whether this cell is needed.
descriptions

In [None]:
# Build a table with one row per key of `descriptions` (orient='index').
df = pd.DataFrame.from_dict(descriptions, orient='index')

In [None]:
# Write the table to 'descriptions.csv' with pandas' default CSV settings.
df.to_csv('descriptions.csv',)

In [None]:
# Read 'descriptions.csv' back into a separate DataFrame.
df2 = pd.read_csv('descriptions.csv')

In [None]:
# Convert the re-read table to a dict keyed by row index.
df2.to_dict("index")

In [None]:
# Group captions by image id, parsing `all_captions` (the caption file text
# read near the top of the notebook) the same way `captions` is built there.
# `all_captions` is used because it is the only notebook-level variable that
# holds that text, so this cell runs on a fresh kernel.
descriptions = dict()
for line in all_captions.split('\n'):
    tokens = line.split()
    # Skip blank lines anywhere in the text rather than assuming that only
    # the final element is empty.
    if not tokens:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    descriptions.setdefault(image_id, list()).append(image_desc)

In [None]:
# Build a table with one row per image id of `descriptions` (orient='index').
df = pd.DataFrame.from_dict(descriptions, orient='index')

## Process all descriptions

In [None]:
# To lowercase: cast every column to str, then lower-case its values.
df = df.apply(lambda x: x.astype(str).str.lower())

In [None]:
# Remove punctuation (every run of characters that are neither word
# characters nor whitespace) from caption columns 0-4.
# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default, which would leave the
# captions unchanged.
for column in range(5):
    df[column] = df[column].str.replace(r'[^\w\s]+', '', regex=True)

In [None]:
# strip leading and trailing whitespace from the strings in every column
df = df.apply(lambda x: x.str.strip())

In [None]:
# NOTE(review): bare reference to pd.read_csv — nothing is called or assigned
# here; looks like a leftover, confirm it can be removed.
pd.read_csv

In [None]:
# Display the processed caption table.
df

## Load Train descriptions

In [None]:
train_images_filename = "../../datasets/flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt"
# Read the training image list; the with-block closes the handle even if the
# read fails.
with open(train_images_filename, 'r') as train_images_file:
    train_doc = train_images_file.read()
# Build the identifier list from the text just read (`train_doc`), keeping the
# part of each line before its first '.'. Blank lines are skipped wherever
# they occur, so the result does not depend on a trailing newline.
train = list()
for line in train_doc.split('\n'):
    if not line.strip():
        continue
    identifier = line.split('.')[0]
    train.append(identifier)
print(len(train))

In [None]:
# For every training image id, take its row of captions from `df` and wrap
# each caption in the 'startseq' / 'endseq' markers.
train_descriptions = {}
for train_image_id in train:
    caption_row = df.loc[[train_image_id]].values.tolist()[0]
    train_descriptions[train_image_id] = [
        "startseq " + caption + " endseq" for caption in caption_row
    ]


In [None]:
# Number of image ids in train_descriptions.
len(train_descriptions)