# split datasets
version: 1

info:
- split json into train.json, val.json and test.json 

### WARNING: splitbyimages is not ideal (use splitbyannotations) -> because you can fail to have all the classes in train (or val,or test). This was done, because some datasets like tao are missing annotations and images


author: nuno costa

In [23]:
from annotate_v5 import *
import platform 
import numpy as np
import time
import pandas as pd
from IPython.display import Image, display
import copy
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
from matplotlib.patches import Rectangle
import random

In [2]:
#Define root dir dependent on OS
rdir='D:/external_datasets/MOLA/annotations/' 
if str(platform.platform()).find('linux')>-1: rdir=rdir.replace('D:/','/mnt/d/')
print('OS: {}'.format(platform.platform()))
print('root dir: {}'.format(rdir))

OS: Windows-10-10.0.21292-SP0
root dir: D:/external_datasets/MOLA/annotations/


## 1. Init vars

In [24]:
train=70
val=20
test=100-(train+val)
injsonfile='coco2017_reorder_cleanclass.json'
infilename=injsonfile.split('.')[0]

In [25]:
# init json
molajson =  json.load(open(rdir+injsonfile))
for k in molajson:
    print(k, len(molajson[k]))

info 6
licenses 8
images 123287
annotations 1170251
categories 80


## 2. Import ids
#### #NOTE: work with ids and index so you can use numpy for faster operations

In [26]:
# categories id
cats=[]
catids=[]
for c in molajson['categories']:
    catids.append(c['id'])
    cats.append(c['name'])
#print(cats)

In [27]:
# images filepath and id
imgs=[]
imgids=[]
for c in molajson['images']:
    imgs.append(c['file_name'])
    imgids.append(c['id'])

In [28]:
# annotations category_id
ann_catids=[]
ann_ids=[]
ann_imgids=[]
for an in tqdm(molajson['annotations']):
    ann_catids.append(an['category_id'])
    ann_ids.append(an['id'])
    ann_imgids.append(an['image_id'])
print(len(ann_ids))

100%|█████████████████████████████████████████████████████████████████| 1170251/1170251 [00:01<00:00, 1079964.86it/s]

1170251





In [29]:
#TEST dupplicates v1 - slow
# duplicates_l=list(set([x for x in ann_ids if ann_ids.count(x) > 1])) # duplicates l 
#TEST dupplicates v2 - fast
#from collections import Counter
#duplicates_l=[item for item, count in Counter(ann_ids).items() if count > 1]
#TEST duplicates v3 -faster
u, c = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))

273469


## 3. split by images
#QUESTION Seeded random or not?

In [30]:
#init
train_imgids=[]
val_imgids=[]
test_imgids=[]

#size
train_size=len(imgids) * train // 100 #floor division
val_size=len(imgids) * val // 100
test_size=len(imgids) * test // 100

#select images
random.shuffle(imgids) 
train_imgids.extend(imgids[:train_size])
val_imgids.extend(imgids[train_size+1:train_size+val_size-1])
test_imgids.extend(imgids[train_size+val_size+1:train_size+val_size+test_size])


print((len(train_imgids)/len(imgids))*100)
print((len(val_imgids)/len(imgids))*100)
print((len(test_imgids)/len(imgids))*100)

69.99926999602553
19.99805332273476
9.998621103603785


In [31]:
ann_catids_np=np.array(ann_catids)
train_ann_catidx=[]
val_ann_catidx=[]
test_ann_catidx=[]
for imgid in tqdm(train_imgids):
    ann_idx_np = np.where(ann_catids_np==imgid)[0] #annotation index of ids
    if not ann_idx_np.any(): continue    
    train_ann_catidx.extend(ann_idx_np.tolist())
for imgid in tqdm(val_imgids):
    ann_idx_np = np.where(ann_catids_np==imgid)[0] #annotation index of ids
    if not ann_idx_np.any(): continue    
    val_ann_catidx.extend(ann_idx_np.tolist())
for imgid in tqdm(test_imgids):
    ann_idx_np = np.where(ann_catids_np==imgid)[0] #annotation index of ids
    if not ann_idx_np.any(): continue    
    test_ann_catidx.extend(ann_idx_np.tolist())

print((len(train_ann_catidx)/len(ann_catids))*100)
print((len(val_ann_catidx)/len(ann_catids))*100)
print((len(test_ann_catidx)/len(ann_catids))*100)

100%|█████████████████████████████████████████████████████████████████████████| 86300/86300 [02:17<00:00, 627.94it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24655/24655 [00:40<00:00, 614.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 12327/12327 [00:19<00:00, 628.88it/s]

9.601017217673816
0.6805377649752061
0.4258915395073365





In [32]:
l_dup=[train_ann_catidx, val_ann_catidx,test_ann_catidx ]
for i in l_dup:
    print('original: ', len(i))
    u, c = np.unique(np.array(i), return_counts=True)
    duplicates_l= u[c > 1].tolist()
    print('duplicate: ',len(duplicates_l))

original:  112356
duplicate:  0
original:  7964
duplicate:  0
original:  4984
duplicate:  0


### 4. Save splited jsons

In [33]:
percent_idx=[train_ann_catidx,val_ann_catidx, test_ann_catidx]
percent_names=['train', 'val', 'test']

In [34]:
newjson=copy.copy(molajson)

In [35]:
annotations=copy.copy(molajson['annotations']) 
for i, percent_i in enumerate(tqdm(percent_idx)):
    #get new annotations
    newjson['annotations']=[annotations[index] for index in percent_i]
    # save
    print('\n >> SAVING {}...'.format(percent_names[i]))
    outpath=rdir+'splitimg_{}/'.format(infilename)
    assure_path_exists(outpath)
    outjsonfile=outpath+'{}.json'.format(percent_names[i]) #rdir+'{}_{}.json'.format(percent_names[i],infilename)
    with open(outjsonfile, 'w') as f:
        json.dump(newjson, f)
    print("JSON SAVED : {} \n".format(outjsonfile))
    for k in molajson:
        print(k, len(newjson[k]))

  0%|                                                                                          | 0/3 [00:00<?, ?it/s]


 >> SAVING train...


 33%|███████████████████████████▎                                                      | 1/3 [00:15<00:31, 15.59s/it]

JSON SAVED : D:/external_datasets/MOLA/annotations/splitimg_coco2017_reorder_cleanclass/train.json 

info 6
licenses 8
images 123287
annotations 112356
categories 80

 >> SAVING val...


 67%|██████████████████████████████████████████████████████▋                           | 2/3 [00:20<00:12, 12.32s/it]

JSON SAVED : D:/external_datasets/MOLA/annotations/splitimg_coco2017_reorder_cleanclass/val.json 

info 6
licenses 8
images 123287
annotations 7964
categories 80

 >> SAVING test...


100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:25<00:00,  8.49s/it]

JSON SAVED : D:/external_datasets/MOLA/annotations/splitimg_coco2017_reorder_cleanclass/test.json 

info 6
licenses 8
images 123287
annotations 4984
categories 80





### 5. TEST SPLIT ANNOTATIONS DUPLICATES

In [71]:
injsonfile='mola_mix_aggressive.json'
outjsonfile=rdir+'split_{}/'.format(infilename)+'test.json'
# init json
molajson =  json.load(open(outjsonfile))
for k in molajson:
    print(k, len(molajson[k]))

info 5
licenses 9
categories 1261
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 133266
datasets 2


In [72]:
# annotations category_id
ann_ids=[]
for an in tqdm(molajson['annotations']):
    ann_ids.append(an['id'])
print(len(ann_ids))

#TEST duplicates v3 -faster
u, c = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))

100%|██████████████████████████████████████████████████████████████████| 133266/133266 [00:00<00:00, 1497403.10it/s]

133266
0



