# Filter datasets
version: 1

info:
- filter json into train.json, val.json and test.json 

author: nuno costa

In [10]:
from annotate_v5 import *
import platform 
import numpy as np
import pandas as pd
from IPython.display import Image, display
import copy
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
from matplotlib.patches import Rectangle
import random

#Define root dir dependent on OS
rdir='D:/external_datasets/' 
if str(platform.platform()).find('linux')>-1: rdir='/mnt/d/external_datasets/' 
print('OS: {}'.format(platform.platform()))
print('root dir: {}'.format(rdir))

OS: Windows-10-10.0.20241-SP0
root dir: D:/external_datasets/


## 1. Init vars

In [11]:
train=70
val=20
test=100-(train+val)
injsonfile='mlab_fix_equal.json'
infilename=injsonfile.split('.')[0]

In [12]:
# init json
mlabjson =  json.load(open(rdir+injsonfile))
for k in mlabjson:
    print(k, len(mlabjson[k]))

info 5
licenses 9
categories 1261
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 1338002
datasets 2


## 2. Import ids
#### #NOTE: work with ids and index so you can use numpy for faster operations

In [13]:
# categories id
catids=[]
for c in mlabjson['categories']:
    catids.append(c['id'])

In [14]:
# annotations category_id
ann_catids=[]
for an in tqdm(mlabjson['annotations']):
    ann_catids.append(an['category_id'])

100%|█████████████████████████████████████████████████████████████████| 1338002/1338002 [00:00<00:00, 1737663.90it/s]


## 3. Filter annotations
#QUESTION Seeded random or not?

In [15]:
ann_catids_np=np.array(ann_catids)
train_ann_catidx=[]
val_ann_catidx=[]
test_ann_catidx=[]
for catid in tqdm(catids):
    ann_idx_np = np.where(ann_catids_np==catid)[0] #annotation index of ids
    train_size=len(ann_idx_np) * train // 100 #floor division
    val_size=len(ann_idx_np) * val // 100
    test_size=len(ann_idx_np) * test // 100
    remain_idx_np=ann_idx_np #start 100%
    #train
    train_idx_np = np.random.choice(remain_idx_np, train_size)
    train_ann_catidx.extend(train_idx_np.tolist())
    remain_idx_np=remain_idx_np[~np.in1d(remain_idx_np,train_idx_np)]
    #val
    val_idx_np = np.random.choice(remain_idx_np, val_size)
    val_ann_catidx.extend(val_idx_np.tolist())
    remain_idx_np=remain_idx_np[~np.in1d(remain_idx_np,val_idx_np)]
    #test
    test_idx_np = np.random.choice(remain_idx_np, test_size)
    test_ann_catidx.extend(test_idx_np.tolist())
    remain_idx_np=remain_idx_np[~np.in1d(remain_idx_np,test_idx_np)]

print((len(train_ann_catidx)/len(ann_catids))*100)
print((len(val_ann_catidx)/len(ann_catids))*100)
print((len(test_ann_catidx)/len(ann_catids))*100)

100%|███████████████████████████████████████████████████████████████████████████| 1261/1261 [00:02<00:00, 444.12it/s]

69.98823619097729
19.989730957053876
9.9881016620304





### 4. Save filtered jsons

In [16]:
percent_idx=[train_ann_catidx,val_ann_catidx, test_ann_catidx]
percent_names=['train', 'val', 'test']

In [17]:
annotations=copy.copy(mlabjson['annotations']) #reset annotations
for i, percent_i in enumerate(tqdm(percent_idx)):
    #get new annotations
    mlabjson['annotations']=[annotations[index] for index in percent_i]
    # save
    print('\n >> SAVING {}...'.format(percent_names[i]))
    outjsonfile=rdir+'{}_{}.json'.format(percent_names[i],infilename)
    with open(outjsonfile, 'w') as f:
        json.dump(mlabjson, f)
    print("JSON SAVED : {} \n".format(outjsonfile))
    for k in mlabjson:
        print(k, len(mlabjson[k]))

  0%|                                                                                          | 0/3 [00:00<?, ?it/s]


 >> SAVING train...


 33%|███████████████████████████                                                      | 1/3 [02:34<05:09, 154.72s/it]

JSON SAVED : D:/external_datasets/train_mlab_fix_equal.json 

info 5
licenses 9
categories 1261
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 936444
datasets 2

 >> SAVING val...


 67%|██████████████████████████████████████████████████████                           | 2/3 [03:21<02:02, 122.48s/it]

JSON SAVED : D:/external_datasets/val_mlab_fix_equal.json 

info 5
licenses 9
categories 1261
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 267463
datasets 2

 >> SAVING test...


100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [03:48<00:00, 76.13s/it]

JSON SAVED : D:/external_datasets/test_mlab_fix_equal.json 

info 5
licenses 9
categories 1261
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 133641
datasets 2



