# Oversample a small dataset and merge it into a large one (COCO style)

In [55]:
import json
import copy
from pathlib import Path

# Paths to the two COCO-style annotation files (the values here look like
# redacted placeholders — set them before running).
large = '/....json'
small = '/....json'

# Parse each file inside a ``with`` block so the handle is closed right away,
# and read as UTF-8 explicitly to match the encoding used when the merged
# file is written, instead of relying on the platform's locale encoding.
with open(large, encoding='utf-8') as f:
    large1 = json.load(f)
with open(small, encoding='utf-8') as f:
    small1 = json.load(f)
print(f"{len(large1['images'])=}; {len(small1['images'])=}")

len(large1['images'])=118287; len(small1['images'])=200


In [52]:
def oversample(large, small, percentage):
    """Append repeated copies of a small COCO-style dataset to a large one.

    The small dataset is repeated a whole number of times (rounded down) so
    that its images make up roughly ``percentage`` of the combined image
    list. Only images of ``large`` that have at least one annotation count
    towards the size of the large dataset when the repeat count is computed.

    Args:
        large: Loaded dataset dict with ``'images'`` and ``'annotations'``
            lists.
        small: Loaded dataset dict with ``'images'`` and ``'annotations'``
            lists.
        percentage: Target share of small-dataset images, in ``[0, 1)``.

    Returns:
        A ``(images, annotations)`` tuple: deep copies of the entries of
        ``large`` followed by re-numbered copies of the entries of ``small``.
        Neither input is modified.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 1)``.
    """
    if not 0 <= percentage < 1:
        raise ValueError(f"percentage must be in [0, 1), got {percentage!r}")

    new_images = copy.deepcopy(large['images'])
    new_annos = copy.deepcopy(large['annotations'])
    if not small['images']:
        # Nothing to oversample; also avoids dividing by zero below.
        return new_images, new_annos

    annotated_ids = {anno['image_id'] for anno in large['annotations']}
    len_large = sum(
        1 for image in large['images'] if image['id'] in annotated_ids
    )
    len_small = len(small['images'])
    # small_total / (large + small_total) == percentage
    #   =>  small_total == large * percentage / (1 - percentage)
    repeats = int(len_large / len_small * percentage / (1 - percentage))

    # Group the small annotations by image in a single pass.
    annos_by_image = {}
    for anno in small['annotations']:
        annos_by_image.setdefault(anno['image_id'], []).append(anno)
    # Keyed by id so a duplicated image id in ``small`` is emitted only once.
    lookup = {image['id']: image for image in small['images']}

    # Fresh ids start above every id already used in either dataset. All
    # large images are considered (not only annotated ones) because all of
    # them end up in the output.
    next_img_id = max(
        [image['id'] for image in large['images']]
        + [image['id'] for image in small['images']]
    ) + 1
    # Annotation ids come from ``anno['id']``, not ``anno['image_id']``.
    next_anno_id = max(
        [anno['id'] for anno in large['annotations']]
        + [anno['id'] for anno in small['annotations']],
        default=0,
    ) + 1

    for _ in range(repeats):
        for image_id, image in lookup.items():
            new_image = copy.deepcopy(image)
            new_image['id'] = next_img_id
            new_images.append(new_image)

            for anno in annos_by_image.get(image_id, ()):
                new_anno = copy.deepcopy(anno)
                # Point the copy at the image copy created just above.
                new_anno['image_id'] = next_img_id
                new_anno['id'] = next_anno_id
                next_anno_id += 1
                new_annos.append(new_anno)

            next_img_id += 1

    return new_images, new_annos


# Target share of small-dataset images in the merged dataset.
percentage = 0.3
images, annos = oversample(large1, small1, percentage)

# ``oversample`` already returns freshly copied image/annotation lists, so
# deep-copy only the remaining sections of ``large1`` instead of copying the
# two largest lists a second time just to discard them. Key order is kept.
replaced = {'images': images, 'annotations': annos}
new = {
    key: replaced[key] if key in replaced else copy.deepcopy(value)
    for key, value in large1.items()
}
# Record the oversampling in the description; tolerate a file that has no
# ``info`` section or no ``description`` entry.
new.setdefault('info', {})
new['info']['description'] = (
    f"{new['info'].get('description', '')}+ oversample {percentage*100:.2f}%"
)

In [1]:
# print(new['info'])

In [57]:
# Write the merged dataset next to the small annotation file, keeping
# non-ASCII characters as-is rather than escaping them.
with (Path(small).parent / 'oversample_coco.json').open('w', encoding='utf-8') as f:
    json.dump(new, f, ensure_ascii=False)