In [1]:
import sys
import os
import time
import json
import copy
import shutil
import logging

In [2]:
def fix_category_id(coco_dict: dict):
    """Return a copy of ``coco_dict`` whose category ids are renumbered 1..N.

    Categories are renumbered in list order (the first entry becomes 1, the
    second 2, ...) and every annotation's ``category_id`` is rewritten through
    the same old-id -> new-id mapping. The dictionary passed in is not
    modified.

    Args:
        coco_dict: Dictionary holding a ``'categories'`` list (entries carry an
            ``'id'``) and an ``'annotations'`` list (entries carry a
            ``'category_id'``).

    Returns:
        A deep copy of ``coco_dict`` with the renumbered ids.

    Raises:
        SystemExit: If the last category id is not 90, which is treated as a
            sign that the ids were already renumbered.
        KeyError: If a required key is missing, or an annotation refers to a
            category id that does not appear in ``'categories'``.
        IndexError: If ``'categories'`` is empty.
    """
    # Run the guard on the caller's data first: the deep copy below is
    # expensive for large annotation files and pointless if we are about to exit.
    if coco_dict['categories'][-1]['id'] != 90:
        logging.error("Idempotency Warning: It looks like you have already run this script. Do not run this script more than once.")
        sys.exit(1)

    # Work on a private copy so the caller's dictionary stays intact.
    coco_dict = copy.deepcopy(coco_dict)

    # Map each original category id to its new one-indexed position and
    # renumber the category entries themselves in the same pass.
    cat_convert = {}
    for new_id, cat in enumerate(coco_dict['categories'], start=1):
        cat_convert[cat['id']] = new_id
        cat['id'] = new_id

    # Rewrite the 'category_id' of every annotation through the mapping.
    for annotation in coco_dict['annotations']:
        annotation['category_id'] = cat_convert[annotation['category_id']]

    return coco_dict

In [3]:
def move_empty_images(coco_dict: dict, coco_root: str, dataset_type: str):
    """Move every image that has no annotation into a separate directory.

    Images listed in ``coco_dict['images']`` whose id never appears as an
    ``image_id`` in ``coco_dict['annotations']`` are moved from
    ``<coco_root>/<dataset_type>2017`` to
    ``<coco_root>/no_annotations_<dataset_type>2017``. ``coco_dict`` is only
    read; it is neither modified nor returned.

    Args:
        coco_dict: Dictionary with an ``'images'`` list (entries carry ``'id'``
            and ``'file_name'``) and an ``'annotations'`` list (entries carry
            ``'image_id'``).
        coco_root: Directory that contains the ``<dataset_type>2017`` image
            folder.
        dataset_type: Split name used to build the folder names
            (e.g. ``'train'`` or ``'val'``).

    Raises:
        SystemExit: If the destination directory already exists (taken as a
            sign of a previous run) or cannot be created for another reason.
        KeyError: If a required key is missing from ``coco_dict``.
        OSError: Errors raised by ``shutil.move`` propagate unchanged.
    """
    # image id -> file name; its keys double as the set of all image ids.
    id_to_filename = {image['id']: image['file_name'] for image in coco_dict['images']}
    all_images = set(id_to_filename)
    logging.info(f"Total number of images: {len(all_images)}")

    annotated_images = {annotation['image_id'] for annotation in coco_dict['annotations']}
    logging.info(f"Images with at least one annotation: {len(annotated_images)}")

    no_annotation_images = all_images - annotated_images
    logging.info(f"Images without any annotations: {len(no_annotation_images)}")

    source_dir = os.path.join(coco_root, f"{dataset_type}2017")
    rejects_dir = os.path.join(coco_root, 'no_annotations_'+dataset_type+'2017')

    # Creating the destination doubles as the "already run" guard. Only an
    # existing directory indicates a previous run; any other failure
    # (missing root, permissions, ...) is reported for what it is.
    try:
        os.mkdir(rejects_dir)
    except FileExistsError:
        logging.error("Idempotency Warning: It appears you have already run this script. Do not run this script more than once.")
        sys.exit(1)
    except OSError as exc:
        logging.error(f"Could not create {rejects_dir}: {exc}")
        sys.exit(1)

    counter = 0
    for image_id in no_annotation_images:
        shutil.move(os.path.join(source_dir, id_to_filename[image_id]), rejects_dir)
        counter += 1
    # Report the source directory (always defined), not the last moved file,
    # so the summary also works when nothing had to be moved.
    logging.warning(f"{counter} images have been moved out of {source_dir}.")


In [4]:
# Directory that holds the 'annotations' folder and the '<split>2017' image folders.
coco_root = "coco-data"
# Dataset splits this notebook knows about; one of them is selected below.
dataset_types = ["train", "val"]

In [22]:
# Split processed by the cells below: index 1 of dataset_types, i.e. 'val'.
dataset_type = dataset_types[1]

In [23]:
# Location of the instances annotation file for the selected split.
path = os.path.join(coco_root, 'annotations', f'instances_{dataset_type}2017.json')
# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default encoding, and let json parse straight from the file.
with open(path, 'r', encoding='utf-8') as annotations_file:
    json_file = json.load(annotations_file)



In [16]:
# Show the top-level sections of the loaded annotation dictionary.
json_file.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [24]:
# Id of the last category in the loaded file. fix_category_id() exits unless
# this value is 90.
# NOTE(review): the recorded output below is 80, which would trip that guard;
# presumably this cell was last run after the converted file had already been
# written back — confirm by re-running on a fresh kernel.
json_file['categories'][-1]['id']

80

In [18]:
# Renumber the category ids. fix_category_id() works on a deep copy, so
# json_file keeps the ids as loaded.
coco_dict = fix_category_id(json_file)

In [19]:
# Move images without any annotation out of the split's image folder.
# move_empty_images() returns nothing and does not edit coco_dict, so the
# dictionary written out later still lists the moved images.
# NOTE(review): this may be related to the "moving images causes problem"
# warning in the last cell — confirm.
move_empty_images(coco_dict, coco_root, dataset_type)



In [20]:
# Keep the original annotation file by renaming it with an '_old' suffix
# before the converted one is written to `path`.
shutil.move(path, path+'_old')

'coco-data/annotations/instances_val2017.json_old'

In [21]:
# Write the converted annotations to the original file location.
with open(path, 'w') as converted_file:
    json.dump(coco_dict, converted_file)

In [13]:
#print("Fixing COCO label indexes...")
#coco_dict = fix_category_id(json_file)
#print("Moving images with no annotations to new folder...")
# WARNING: moving images causes problems.
#move_empty_images(coco_dict, coco_root, dataset_type)

#rename old json
#shutil.move(path, path+'_old')
