# split datasets
version: 3

info:
- split json into train.json, val.json and test.json 

author: nuno costa

In [39]:
from annotate_v5 import *
import platform 
import numpy as np
import time
import pandas as pd
from IPython.display import Image, display
import copy
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
from matplotlib.patches import Rectangle
import random

In [40]:
#Define root dir dependent on OS
rdir='D:/external_datasets/MOLA/annotations/' 
if str(platform.platform()).find('Linux')>-1:
    rdir=rdir.replace('D:/external_datasets/MOLA/annotations/','/mnt/Data/Work/EASYRIDE/P19/NC/yolov5/JSONS/annotations/')
print('OS: {}'.format(platform.platform()))
print('root dir: {}'.format(rdir))

OS: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
root dir: /mnt/Data/Work/EASYRIDE/P19/NC/yolov5/JSONS/annotations/


## 1. Init vars

In [41]:
train=70
val=20
test=100-(train+val)
injsonfile='cocotaolbo_fix_equal_reorder_cleanclass_cleanimg.json'
infilename=injsonfile.split('.')[0]

In [42]:
# init json
molajson =  json.load(open(rdir+injsonfile))
for k in molajson:
    print(k, len(molajson[k]))

info 5
licenses 9
categories 1298
videos 1488
images 194943
tracks 8132
segment_info 0
annotations 1348451
datasets 3


## 2. Import ids
#### #NOTE: work with ids and index so you can use numpy for faster operations

In [43]:
# categories id
catids=[]
cats=[]
for c in molajson['categories']:
    catids.append(c['id'])
    cats.append(c['name'])
print(cats)

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'aerosol_can', 'apricot', 'armchair', 'atomizer', 'ax', 'baby_buggy', 'bagpipe', 'ball', 'balloon', 'barbell', 'baseball', 'baseball_bat',

In [44]:
# annotations category_id
ann_catids=[]
ann_ids=[]
for an in tqdm(molajson['annotations']):
    ann_catids.append(an['category_id'])
    ann_ids.append(an['id'])
print(len(ann_ids))

100%|██████████| 1348451/1348451 [00:00<00:00, 1831337.64it/s]

1348451





In [45]:
#TEST dupplicates v1 - slow
# duplicates_l=list(set([x for x in ann_ids if ann_ids.count(x) > 1])) # duplicates l 
#TEST dupplicates v2 - fast
#from collections import Counter
#duplicates_l=[item for item, count in Counter(ann_ids).items() if count > 1]
#TEST duplicates v3 -faster
u, c = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))
u, c = np.unique(np.array(catids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))

0
0


## 3. split by annotations
#QUESTION Seeded random or not?

In [46]:
ann_catids_np=np.array(ann_catids)
train_ann_catidx=[]
val_ann_catidx=[]
test_ann_catidx=[]
for catid in tqdm(catids):
    ann_idx_np = np.where(ann_catids_np==catid)[0] #annotation index of ids
    if not ann_idx_np.any(): continue
    #print("\n>> ", catid)
    
    #assert ann_idx_np
    u, c = np.unique(ann_idx_np, return_counts=True)
    duplicates_l= u[c > 1].tolist()
    assert len(duplicates_l)==0 #assert duplicates (above is already)
    assert all([True if ann_catids[i]==catid else False for i in ann_idx_np] ) #assert index belongs to catid
    
    #parameters
    train_size=len(ann_idx_np) * train // 100 #floor division
    val_size=len(ann_idx_np) * val // 100
    test_size=len(ann_idx_np) * test // 100
    
    #select data
    random.shuffle(ann_idx_np) 
    train_ann_catidx.extend(ann_idx_np[:train_size].tolist())
    val_ann_catidx.extend(ann_idx_np[train_size+1:train_size+val_size-1].tolist())
    test_ann_catidx.extend(ann_idx_np[train_size+val_size+1:train_size+val_size+test_size].tolist())
    #assert 
    u, c = np.unique(train_ann_catidx, return_counts=True)
    duplicates_l= u[c > 1].tolist()
    assert len(duplicates_l)==0 #assert duplicates (above is already)



print((len(train_ann_catidx)/len(ann_catids))*100)
print((len(val_ann_catidx)/len(ann_catids))*100)
print((len(test_ann_catidx)/len(ann_catids))*100)

100%|██████████| 1298/1298 [02:23<00:00,  9.05it/s]

69.95434020220237
19.85848948163485
9.918788298573697





In [47]:
l_dup=[train_ann_catidx, val_ann_catidx,test_ann_catidx ]
for i in l_dup:
    print('original: ', len(i))
    u, c = np.unique(np.array(i), return_counts=True)
    duplicates_l= u[c > 1].tolist()
    print('duplicate: ',len(duplicates_l))

original:  943300
duplicate:  0
original:  267782
duplicate:  0
original:  133750
duplicate:  0


### 4. Save splited jsons

In [48]:
percent_idx=[train_ann_catidx,val_ann_catidx, test_ann_catidx]
percent_names=['train', 'val', 'test']

In [49]:
newjson=copy.copy(molajson)

In [50]:
annotations=copy.copy(molajson['annotations']) 
for i, percent_i in enumerate(tqdm(percent_idx)):
    #get new annotations
    newjson['annotations']=[annotations[index] for index in percent_i]
    # save
    print('\n >> SAVING {}...'.format(percent_names[i]))
    outpath=rdir+'splitann_{}/'.format(infilename)
    assure_path_exists(outpath)
    outjsonfile=outpath+'{}.json'.format(percent_names[i]) #rdir+'{}_{}.json'.format(percent_names[i],infilename)
    with open(outjsonfile, 'w') as f:
        json.dump(newjson, f)
    print("JSON SAVED : {} \n".format(outjsonfile))
    for k in molajson:
        print(k, len(newjson[k]))

  0%|          | 0/3 [00:00<?, ?it/s]


 >> SAVING train...


 33%|███▎      | 1/3 [00:58<01:57, 58.87s/it]

JSON SAVED : /mnt/Data/Work/EASYRIDE/P19/NC/yolov5/JSONS/annotations/splitann_cocotaolbo_fix_equal_reorder_cleanclass_cleanimg/train.json 

info 5
licenses 9
categories 1298
videos 1488
images 194943
tracks 8132
segment_info 0
annotations 943300
datasets 3

 >> SAVING val...


 67%|██████▋   | 2/3 [01:18<00:36, 36.01s/it]

JSON SAVED : /mnt/Data/Work/EASYRIDE/P19/NC/yolov5/JSONS/annotations/splitann_cocotaolbo_fix_equal_reorder_cleanclass_cleanimg/val.json 

info 5
licenses 9
categories 1298
videos 1488
images 194943
tracks 8132
segment_info 0
annotations 267782
datasets 3

 >> SAVING test...


100%|██████████| 3/3 [01:30<00:00, 30.20s/it]

JSON SAVED : /mnt/Data/Work/EASYRIDE/P19/NC/yolov5/JSONS/annotations/splitann_cocotaolbo_fix_equal_reorder_cleanclass_cleanimg/test.json 

info 5
licenses 9
categories 1298
videos 1488
images 194943
tracks 8132
segment_info 0
annotations 133750
datasets 3





### 5. TEST SPLIT ANNOTATIONS DUPLICATES

In [51]:
outjsonfile=rdir+'splitann_{}/'.format(infilename)+'test.json'
# init json
molajson =  json.load(open(outjsonfile))
for k in molajson:
    print(k, len(molajson[k]))

info 5
licenses 9
categories 1298
videos 1488
images 194943
tracks 8132
segment_info 0
annotations 133750
datasets 3


In [52]:
# annotations category_id
ann_ids=[]
for an in tqdm(molajson['annotations']):
    ann_ids.append(an['id'])
print(len(ann_ids))

#TEST duplicates v3 -faster
u, c = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))

100%|██████████| 133750/133750 [00:00<00:00, 1980323.99it/s]

133750
0



