In [5]:
import pandas as pd

# Read in and clean up the captions file

In [8]:
# Load the raw captions; the file has no header row, so pandas labels the
# columns with integers (0, 1, 2, ...).
captions_csv = "./data/captions.csv"
captions = pd.read_csv(captions_csv, header=None)

In [14]:
# Combine captions split across multiple columns due to presence of comma in caption.
# read_csv broke such captions apart at every comma, so the pieces are glued back
# together WITH the comma restored (plain concatenation would silently drop it).
def _rejoin_caption(pieces):
    """Join caption pieces with commas, ignoring trailing empty (missing) pieces."""
    pieces = [str(piece) for piece in pieces]
    # Missing trailing fields were filled with "", so they carry no caption text.
    while pieces and pieces[-1] == "":
        pieces.pop()
    return ",".join(pieces)


captions = captions.fillna("")
# Every column after the first two (file name, caption) holds spilled caption text.
overflow_columns = list(captions.columns[2:])
if overflow_columns:  # nothing to merge when the cell is re-run on an already-merged frame
    captions[1] = [
        _rejoin_caption(row)
        for row in captions[[1] + overflow_columns].itertuples(index=False, name=None)
    ]
    captions = captions.drop(columns=overflow_columns)

In [18]:
# Remove rows that are exact repeats of an earlier row (all columns equal),
# keeping the first occurrence.
captions = captions.drop_duplicates()

In [22]:
# Reduce each file name to its image ID: keep only the text before the first "."
# (e.g. "01235.xyz" -> "01235").
captions[0] = captions[0].map(lambda file_name: file_name.partition(".")[0])

In [24]:
# Normalise the caption text to lowercase
captions[1] = captions[1].map(str.lower)

In [34]:
# Give the positional columns meaningful names: 0 -> id, 1 -> caption
column_names = {0: 'id', 1: 'caption'}
captions = captions.rename(columns=column_names)

In [54]:
# Use the image id as the row index so other frames can be joined against it
captions = captions.set_index('id')

In [55]:
# Display the cleaned captions frame (indexed by id, single "caption" column);
# pandas elides the middle rows of a long frame in the rendered output.
captions

Unnamed: 0_level_0,caption
id,Unnamed: 1_level_1
01235,a man with a beard is holding a pair of scissors
01236,a group of people with some animals on their h...
01243,a dog with a pink collar and a red collar
01245,a young child wearing a hat and a hat.
01247,a man with a beard and a beard wearing a tie.
...,...
98752,a dog laying on a bed with a picture of a cat.
98754,a poster of a woman with a mustache and a pict...
98756,a woman with a smile on her face with a smile ...
98762,a young boy holding a blue frisbee in his hands.


# Add captions to train

In [29]:
# Load the training split: one JSON record per line, with every column
# requested as a string via dtype=str.
train = pd.read_json(
    "./data/train.jsonl",
    lines=True,
    dtype=str,
)

In [56]:
# Index the training rows by id so they line up with the captions index
train = train.set_index("id")

In [58]:
# Attach the caption column by matching index values; a left join keeps every
# training row, and rows without a matching caption id get a missing caption.
# NOTE(review): caption ids keep the leading zeros of the file names — confirm the
# training ids are formatted the same way, otherwise those rows will not match.
train = train.join(captions, how="left")

In [61]:
# Move id back from the index into a regular column
train = train.reset_index()

In [62]:
# Show the first five captioned training rows
train.head(n=5)

Unnamed: 0,id,img,label,text,caption
0,42953,img/42953.png,0,its their character not their color that matters,a man in a suit and tie with a picture of a man.
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,a woman and man standing next to each other.
2,13894,img/13894.png,0,putting bows on your pet,a cat with a red collar and red tie.
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,a black dog with a collar and a brown collar
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",a man in a suit and tie standing in front of a...


In [63]:
# Persist the captioned training split. index=True is the to_csv default, so the
# row index is written as an extra, unnamed first column of the CSV.
train.to_csv("./data/train_captioned.csv", index=True)

# Add captions to all other data files

In [65]:
# Caption every remaining split the same way: load the JSON-lines file, left-join
# the captions on id, and save the result as "<split>_captioned.csv".
# NOTE(review): ids are read as strings here while the caption ids keep the leading
# zeros of the image file names — confirm both sides are formatted identically,
# since rows without a matching id silently end up with a missing caption.
files = ['dev_seen.jsonl', 'dev_unseen.jsonl', 'test_seen.jsonl', 'test_unseen.jsonl']
for file in files:
    file_name = file.split('.')[0]  # e.g. "dev_seen.jsonl" -> "dev_seen"
    data = (
        pd.read_json(f"./data/{file}", dtype=str, lines=True)
        .set_index("id")
        .join(captions)
        .reset_index()
    )
    data.to_csv(f"./data/{file_name}_captioned.csv")