In [38]:
from os import listdir
import torchvision
import torch

from pathlib import Path

print(torchvision.__version__, torch.__version__)

0.5.0 1.4.0


In [39]:
# Dataset locations used throughout the notebook.
# NOTE(review): both are absolute paths on one specific machine; edit them
# before running the notebook anywhere else.
# Directory that holds the Flickr8k .jpg image files.
flickr8k_dir = Path('/home/jithin/datasets/imageCaptioning/flicker8k/Flicker8k_Dataset')
# JSON file with the captions and the train/val/test split of every image.
captions_file = Path('/home/jithin/datasets/imageCaptioning/captions/dataset_flickr8k.json')

In [3]:
listdir(flickr8k_dir)[:10]

['444881000_bba92e585c.jpg',
 '241345905_5826a72da1.jpg',
 '2318721455_80c6644441.jpg',
 '3365602213_dd3287a633.jpg',
 '539801139_7258ee437f.jpg',
 '3697378565_7060d9281a.jpg',
 '2147199188_d2d70b88ec.jpg',
 '143684568_3c59299bae.jpg',
 '1499495021_d295ce577c.jpg',
 '3081182021_22cfa18dd4.jpg']

In [40]:
import json

# Decode the captions file into plain Python objects (dicts / lists).
with captions_file.open('r') as captions_fp:
    parsed_json = json.load(captions_fp)

## json format

There are 6k training images, 1k validation images and 1k test images.  
  
  
file  
|- images: list of all the sentences for each image  
&nbsp;&nbsp;    |- sentids: list()  
&nbsp;&nbsp;    |- imgid: int  
&nbsp;&nbsp;    |- sentences: list()  
&nbsp;&nbsp;&nbsp;&nbsp;        |- tokens: list(str) -> tokenized sentences  
&nbsp;&nbsp;&nbsp;&nbsp;        |- raw: str  
&nbsp;&nbsp;&nbsp;&nbsp;        |- imgid: int  
&nbsp;&nbsp;&nbsp;&nbsp;        |- sentid: int  
&nbsp;&nbsp;    |- split: ['train', 'val', 'test']  
&nbsp;&nbsp;    |- filename: str -> only the filename of the image eg: 2513260012_03d33305cf.jpg  

In [5]:
parsed_json['images'][6999]

{'sentids': [34995, 34996, 34997, 34998, 34999],
 'imgid': 6999,
 'sentences': [{'tokens': ['a',
    'girl',
    'playing',
    'is',
    'a',
    'pile',
    'of',
    'colorful',
    'balls'],
   'raw': 'A girl playing is a pile of colorful balls .',
   'imgid': 6999,
   'sentid': 34995},
  {'tokens': ['a', 'little', 'girl', 'plays', 'in', 'a', 'ball', 'pit'],
   'raw': 'A little girl plays in a ball pit .',
   'imgid': 6999,
   'sentid': 34996},
  {'tokens': ['a',
    'little',
    'girl',
    'plays',
    'in',
    'a',
    'pit',
    'of',
    'colorful',
    'balls'],
   'raw': 'A little girl plays in a pit of colorful balls .',
   'imgid': 6999,
   'sentid': 34997},
  {'tokens': ['a', 'small', 'girl', 'is', 'playing', 'in', 'a', 'ball', 'pit'],
   'raw': 'A small girl is playing in a ball pit',
   'imgid': 6999,
   'sentid': 34998},
  {'tokens': ['a',
    'young',
    'girl',
    'with',
    'a',
    'white',
    'shirt',
    'and',
    'pink',
    'shorts',
    'rolling',
    '

In [6]:
from collections import namedtuple

# Lightweight record for one image: its file name plus the list of its
# tokenized captions.
annotations = namedtuple('Annotations',['image_id','sentences'])

In [13]:
# Build one Annotations record per training image from the first 6000 entries.
train = list()
for image in parsed_json['images'][:6000]:
    # Check the split before storing anything, so that an entry from another
    # partition is reported without ending up in the training list.
    if image['split'] != 'train':
        print(image)
        break
    # Keep only the tokenized form of every caption of this image.
    sentences_list = [sentence['tokens'] for sentence in image['sentences']]
    train.append(annotations(image['filename'], sentences_list))

In [14]:
train[0]

Annotations(image_id='2513260012_03d33305cf.jpg', sentences=[['a', 'black', 'dog', 'is', 'running', 'after', 'a', 'white', 'dog', 'in', 'the', 'snow'], ['black', 'dog', 'chasing', 'brown', 'dog', 'through', 'snow'], ['two', 'dogs', 'chase', 'each', 'other', 'across', 'the', 'snowy', 'ground'], ['two', 'dogs', 'play', 'together', 'in', 'the', 'snow'], ['two', 'dogs', 'running', 'through', 'a', 'low', 'lying', 'body', 'of', 'water']])

In [42]:
from torch.utils.data import Dataset
from PIL import Image

import json
from collections import namedtuple
from pathlib import Path

# Record for one image: its file name plus the list of its tokenized captions.
annotations = namedtuple('Annotations',['image_id','sentences'])

class Flickr8k(Dataset):
    """Flickr8k captioning dataset read from an image directory and a JSON annotation file.

    The annotation file must contain a top-level ``'images'`` list whose entries
    provide ``'filename'``, ``'split'`` and ``'sentences'`` (each sentence
    carrying a ``'tokens'`` entry). Indexing the dataset yields
    ``(image, captions)`` for one image of the selected split.
    """

    def __init__(self, img_dir, ann_file, split='train', transform=None, target_transform=None):
        """
        Args:
            img_dir (str or Path): Directory that contains the Flickr image files.
            ann_file (str or Path): The file that contains the annotations for the images.
            split (str): One of 'train', 'val' or 'test'; decides which partition to load.
            transform (callable, optional): Applied to the loaded RGB image.
            target_transform (callable, optional): Applied to the list of
                tokenized captions of the image.

        Raises:
            AssertionError: If ``split`` is not one of the supported partitions.
        """
        self.img_dir = Path(img_dir)
        assert split in ['train', 'test', 'val'], 'unknown split: %r' % (split,)
        self.split = split
        self.transform = transform
        self.target_transform = target_transform

        # Use a separate name for the handle so the ``ann_file`` argument is
        # not rebound to a (soon closed) file object.
        with open(ann_file, 'r') as ann_fp:
            ann_json = json.load(ann_fp)

        # Pick the records by their own 'split' field rather than by fixed
        # index windows, so the result does not depend on the number or the
        # ordering of the entries in the annotation file.
        self.annotations = [
            annotations(
                image['filename'],
                [sentence['tokens'] for sentence in image['sentences']],
            )
            for image in ann_json['images']
            if image['split'] == self.split
        ]

        print('loading %s complete'%(self.split))

    def __len__(self):
        """Return the number of images in the loaded split."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(image, captions)`` for the record at ``index``.

        The image is opened from ``img_dir`` and converted to RGB; ``captions``
        is the list of tokenized sentences stored for that image. Each value is
        passed through its transform when one was supplied.
        """
        record = self.annotations[index]

        img = Image.open(self.img_dir/record.image_id).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)

        # Captions
        target = record.sentences
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

In [43]:
dataset = Flickr8k(flickr8k_dir, captions_file, split='val')
len(dataset)

loading val complete


1000

In [44]:
dataset[0]

(<PIL.Image.Image image mode=RGB size=500x333 at 0x7F7E8E42ECC0>,
 [['the',
   'boy',
   'laying',
   'face',
   'down',
   'on',
   'a',
   'skateboard',
   'is',
   'being',
   'pushed',
   'along',
   'the',
   'ground',
   'by',
   'another',
   'boy'],
  ['two', 'girls', 'play', 'on', 'a', 'skateboard', 'in', 'a', 'courtyard'],
  ['two', 'people', 'play', 'on', 'a', 'long', 'skateboard'],
  ['two',
   'small',
   'children',
   'in',
   'red',
   'shirts',
   'playing',
   'on',
   'a',
   'skateboard'],
  ['two',
   'young',
   'children',
   'on',
   'a',
   'skateboard',
   'going',
   'across',
   'a',
   'sidewalk']])

In [26]:
# NOTE(review): stale cell. The output recorded below is an Annotations record,
# while the Flickr8k class defined above returns an (image, captions) tuple
# from __getitem__, so this result appears to come from an earlier revision.
dataset[0]

Annotations(image_id='2090545563_a4e66ec76b.jpg', sentences=[['the', 'boy', 'laying', 'face', 'down', 'on', 'a', 'skateboard', 'is', 'being', 'pushed', 'along', 'the', 'ground', 'by', 'another', 'boy'], ['two', 'girls', 'play', 'on', 'a', 'skateboard', 'in', 'a', 'courtyard'], ['two', 'people', 'play', 'on', 'a', 'long', 'skateboard'], ['two', 'small', 'children', 'in', 'red', 'shirts', 'playing', 'on', 'a', 'skateboard'], ['two', 'young', 'children', 'on', 'a', 'skateboard', 'going', 'across', 'a', 'sidewalk']])