In [5]:
import json
import argparse
import funcy
from sklearn.model_selection import train_test_split

In [8]:
def save_coco(file, info, licenses, images, annotations, categories):
    """Write a COCO-style dataset to ``file`` as pretty-printed JSON.

    Args:
        file: Destination path; it is opened in text mode with UTF-8 encoding
            and overwritten if it already exists.
        info: Value stored under the ``'info'`` key.
        licenses: Value stored under the ``'licenses'`` key.
        images: Value stored under the ``'images'`` key.
        annotations: Value stored under the ``'annotations'`` key.
        categories: Value stored under the ``'categories'`` key.
    """
    # Assemble the top-level document first so the write itself is one call.
    dataset = {
        'info': info,
        'licenses': licenses,
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }
    with open(file, 'wt', encoding='UTF-8') as out_file:
        # sort_keys keeps the output deterministic regardless of dict order.
        json.dump(dataset, out_file, indent=2, sort_keys=True)

def filter_annotations(annotations, images):
    """Return the annotations that belong to one of the given images.

    Args:
        annotations: Iterable of annotation dicts, each with an ``'image_id'``
            entry convertible with ``int()``.
        images: Iterable of image dicts, each with an ``'id'`` entry
            convertible with ``int()``.

    Returns:
        A new list with the annotations whose ``int(image_id)`` equals the
        ``int(id)`` of some image, in their original order.
    """
    # A set gives O(1) membership tests; scanning a list of ids for every
    # annotation is quadratic and dominates the runtime on full COCO.
    image_ids = {int(image['id']) for image in images}
    return [
        annotation
        for annotation in annotations
        if int(annotation['image_id']) in image_ids
    ]

def main(random_state=None):
    """Split the COCO train2017 instance annotations into two image subsets.

    Reads ``../datasets/coco/annotations/instances_train2017.json``, drops
    images that have no annotations, splits the remaining images in half with
    ``train_test_split`` and writes each half, together with the annotations
    that reference it, to ``instances_train2017_subset1.json`` and
    ``instances_train2017_subset2.json`` in the same directory. The ``info``,
    ``licenses`` and ``categories`` sections are copied unchanged into both
    files.

    Args:
        random_state: Forwarded to ``train_test_split``. The default ``None``
            keeps the previous behavior (an unseeded split); pass an int to
            make the split reproducible.
    """
    # Close the input file as soon as it is parsed; nothing below needs it.
    with open('../datasets/coco/annotations/instances_train2017.json', 'rt', encoding='UTF-8') as annotations_file:
        coco = json.load(annotations_file)

    info = coco['info']
    licenses = coco['licenses']
    images = coco['images']
    annotations = coco['annotations']
    categories = coco['categories']

    # Ignore images without annotations. The ids are collected into a set so
    # each lookup is O(1); a list here makes this step quadratic.
    images_with_annotations = {int(a['image_id']) for a in annotations}
    images = [image for image in images if image['id'] in images_with_annotations]

    x, y = train_test_split(images, train_size=0.5, random_state=random_state)

    save_coco('../datasets/coco/annotations/instances_train2017_subset1.json', info,
              licenses, x, filter_annotations(annotations, x), categories)

    save_coco('../datasets/coco/annotations/instances_train2017_subset2.json', info,
              licenses, y, filter_annotations(annotations, y), categories)

    print("Saved {} entries in training and {} in testing".format(len(x), len(y)))


In [9]:
# Run the split: reads instances_train2017.json and writes the two subset files.
main()

Saved 58633 entries in training and 58633 in testing
