# Mix/Fusion of Classes=Categories
version: 1

info: 
- mix/fusion: of classes into other classes
    0. If the input is not a mola.json (i.e. it has no `datasets` entry): manually add the dataset descriptors when importing the ids, e.g. for COCO (see below)
- reorder_ids: It also reorders category id

author: nuno costa

In [1]:
from annotate_v5 import *
import platform 
import numpy as np
import pandas as pd
from IPython.display import Image, display
import copy
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
from matplotlib.patches import Rectangle
import random

In [2]:
#Define root dir dependent on OS
# Both roots are declared as Windows paths and remapped to the WSL mount
# point (/mnt/d/) when the notebook runs on Linux.
rdir_dsets='D:/external_datasets/' #WARNING: DATASETS ROOT is OK?
rdir='D:/external_datasets/MOLA/' 
# platform.platform() starts with a capitalised 'Linux-...' on Linux, so the
# comparison must be case-insensitive or the remap never happens.
if 'linux' in platform.platform().lower():
    rdir_dsets=rdir_dsets.replace('D:/','/mnt/d/')
    rdir=rdir.replace('D:/','/mnt/d/')
print('OS: {}'.format(platform.platform()))
print('root datasets dir: {}'.format(rdir_dsets))
print('root dir: {}'.format(rdir))

OS: Windows-10-10.0.21332-SP0
root datasets dir: D:/external_datasets/
root dir: D:/external_datasets/MOLA/


In [48]:
#jsonfile
# Annotation file to mix, relative to <rdir>/annotations/ and without extension.
injsonfile="split_tao_original/train" #"split_mola_fix_equal/test"
# Use a context manager so the file handle is closed as soon as parsing ends.
with open(rdir+'annotations/'+injsonfile+'.json') as f:
    molajson = json.load(f)
# Summary: number of entries under each top-level key.
for k in molajson:
    print(k, len(molajson[k]))

info 6
images 54649
annotations 83875
categories 1230
licenses 1
videos 1488
tracks 8132


## 1. Import ids
#### #NOTE: work with ids and index so you can use numpy for faster operations

In [49]:
# datasets name and id
dset_l=[]
dset_l_id=[]
try:
    for d in molajson['datasets']:
        dset_l.append(d['name'])
        dset_l_id.append(d['id'])
except: #manually add for example for only COCO
    dset_l=['TAO']
    dset_l_id=[1]
print(dset_l, dset_l_id)

['TAO'] [1]


In [50]:
# categories name and id
cat_l=[]
cat_l_id=[]
cat_l_dset=[]
for c in molajson['categories']:
    cat_l.append(c['name'])
    cat_l_id.append(c['id'])
    try:
        cat_l_dset.append(dset_l[c['dataset']-1]) # dset_l index is same as id-1
    except:
        cat_l_dset.append(dset_l[0])
#print(cat_l_id)

In [51]:
# images filepath and id
img_l=[]
img_l_id=[]
for c in molajson['images']:
    img_l.append(c['file_name'])
    img_l_id.append(c['id'])

In [52]:
# annotations category_id, image_id, bbox, and dataset
ann_catid=[]
ann_imgid=[]
ann_bbox=[]
ann_dset=[]
for an in tqdm(molajson['annotations']):
    ann_catid.append(an['category_id'])
    ann_imgid.append(an['image_id'])
    ann_bbox.append(an['bbox'])
    try:
        ann_dset.append(an['dataset'])
    except:
        ann_dset.append(dset_l_id[0])

100%|████████████████████████████████████████████████████████████████████████| 83875/83875 [00:00<00:00, 670763.31it/s]


## 2. Find mixers cat_ids
mixers example
categories= [{name:cow, id:1, dataset:1},...,{name:cow, id:200, dataset:2},...,{name:cow, id:101, dataset:3}]

In [53]:
#mixers #TODO: SORT alphabetically
# A "mixer" is one candidate row: a category name plus the list of category
# ids / dataset names that belong to it.
mixers_l, mixers_l_catid, mixers_l_catdset = [], [], []
mixer_method="all_cats"
if mixer_method=="all_cats": #Do for all category names, even with equal 
    # One mixer per category; ids and datasets are wrapped in single-item lists.
    mixers_l=cat_l
    mixers_l_catid=[[cat_id] for cat_id in cat_l_id]
    mixers_l_catdset=[[dset_name] for dset_name in cat_l_dset]

# Preview the first rows, then the sizes (all three must agree).
for mixer_col in (mixers_l, mixers_l_catid, mixers_l_catdset):
    print(mixer_col[0:5])
for mixer_col in (mixers_l, mixers_l_catid, mixers_l_catdset):
    print(len(mixer_col))

['acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock']
[[1], [2], [3], [4], [5]]
[['TAO'], ['TAO'], ['TAO'], ['TAO'], ['TAO']]
1230
1230
1230


In [54]:
# get annotations mixers
# numpy views of the annotation columns so they can be fancy-indexed.
ann_catid_np=np.array(ann_catid)
ann_imgid_np=np.array(ann_imgid)
ann_bbox_np=np.array(ann_bbox)
ann_dset_np=np.array(ann_dset)
mixers_l_imgid, mixers_l_bbox, mixers_l_dset = [], [], []
for catids in tqdm(mixers_l_catid):
    # For every category id of this mixer: positions of its annotations.
    ann_idx_per_cat=[np.where(ann_catid_np==catid)[0].tolist() for catid in catids]
    mixers_l_imgid.append([ann_imgid_np[ann_idx].tolist() for ann_idx in ann_idx_per_cat])
    mixers_l_bbox.append([ann_bbox_np[ann_idx].tolist() for ann_idx in ann_idx_per_cat])
    mixers_l_dset.append([ann_dset_np[ann_idx].tolist() for ann_idx in ann_idx_per_cat])

100%|████████████████████████████████████████████████████████████████████████████| 1230/1230 [00:00<00:00, 6440.82it/s]


## 3. Classes|categories to mix w/ EXCEL report

In [55]:
#INIT VARS
classtomix_l=[]
classtomix_l_catid=[]
method="" #"save_images": to save new images and excel for manual inspection; "": already saved don't need to repeat and the excel are
datadir="mixers/"+injsonfile+"/" #root folder to save mixer method . #WARNING mixers/original json that was used to save images and excel
folder=mixer_method+'/' #folder to save images and excel 
showimage=False #show images
startidx=0 # start index of image to save from each dataset
imgnr=1 # total number of images to save from each dataset
imgstep='random' # step between images: int | 'random' - int steps between images; 'rand' gets random list
#paths
path=os.path.join(rdir,datadir,folder) #path to folder
assure_path_exists(path)
excelpath=path+mixer_method+"_v1.xlsx"#path+mixer_method+"_classtomix_report.xlsx"#path to excel
#fixed path - uncomment and change to method=""
# Built from rdir (not a literal 'D:/...') so it follows the OS-dependent root
# chosen at the top of the notebook; on Windows the resulting string is unchanged.
excelpath=rdir+"mixers/split_tao_original/tao_cats_aggressive_v1.xlsx"

In [56]:
#METHODS
# When method=="save_images": build the report table (one row per mixer), export
# sample images per mixer through save_imgs, and write the table to excelpath so
# the classtomix_* columns can be filled in by hand.
if method=="save_images": # save images and excel to folder for manual edit
    # classtomix_* and rules start as NaN; they are the columns edited manually in Excel.
    df=pd.DataFrame({'mixers_l': mixers_l,'mixers_l_catid': mixers_l_catid, 'mixers_l_catdset': mixers_l_catdset, 'classtomix_l': np.nan, 'classtomix_l_catid':np.nan, 'rules':np.nan })
    # The editing instructions are stored in the first row of the 'rules' column.
    df.loc[0, 'rules']="To fix classes: 1) You need to fill the column classtomix_l and/or classtomix_l_catid with the information from the respective mixer columns; 2) When copy/pasting or changing, make sure the same structure maintains:  ['car', 'carrot'], [3, 52], beware of the spaces ['car', '  and always maintain the first class in the list;  3) You have 3 possibilities of filling the columns : 1-the 2 columns empty, meaning the row will not be used for classtomix; 2-only one column empty, e.g. fill the classtotix_l row with the class labels from mixers_l, then during the importing the classtomix_l_catid is filled, and vice-versa; 3-If you want to change the name of the first class in the list,e.g ['car', 'carrot'] for ['automobile', 'carrot'] you need to provide the ids to classtomix_l_catid."
    # np.empty((len(df), 0)).tolist() yields one independent empty list per row.
    df['annotations_missing'] = np.empty((len(df), 0)).tolist()
    df['images_missing'] = np.empty((len(df), 0)).tolist()
    #save image for each mixer
    for i, mixer in enumerate(tqdm(mixers_l)): #run for each mixer category
        firstclass=mixer
        if isinstance(firstclass, list): firstclass=firstclass[0] #first class
        print('\n>> '+firstclass+'...') #class
        classpath=os.path.join(path, firstclass) # path to folder for images of  firstclass
        classpath=parse_path(classpath)+'/' #make it a folder
        assure_path_exists(classpath)
        # NOTE(review): save_imgs is defined outside this notebook; it returns the
        # (presumably updated) report frame, which replaces df - confirm what it fills in.
        df=save_imgs(df, rdir_dsets, classpath, i, dset_l, mixers_l, mixers_l_catid, mixers_l_bbox, mixers_l_dset,
              mixers_l_imgid, img_l, img_l_id, startidx=startidx, imgnr=imgnr, imgstep=imgstep, showimage=showimage)    
    df.to_excel(excelpath, index=False)   

In [57]:
#IMPORT EXCEL MANUAL EDIT #WARNING: CHECK EXCEL FIRST (#NOTE: don't use classes with missing annotations and images)
df=pd.read_excel(excelpath)
classtomix_df=df.loc[:,'classtomix_l']
classtomix_df_catid=df.loc[:,'classtomix_l_catid']
new_cat_l=copy.deepcopy(cat_l)
new_cat_l_id=copy.deepcopy(cat_l_id)
display(df)

# PARSE COLUMNS TO FIX
classtomix_l=classtomix_df.tolist()
classtomix_l_catid=classtomix_df_catid.tolist()
#convert strings to lists
for icl, cl in enumerate(classtomix_l): 
    if isinstance(classtomix_l[icl], str): classtomix_l[icl]=convert_unicode(classtomix_l[icl], method='liststr')
    if isinstance(classtomix_l_catid[icl], str): classtomix_l_catid[icl]=convert_unicode(classtomix_l_catid[icl], method='listnum')


def _remaining_cat_ids(all_ids, chosen_entries):
    """Return, sorted, the ids of `all_ids` that were not chosen in any entry.

    `chosen_entries` is the classtomix_l_catid column: each entry is either a
    list of category ids or an empty marker (NaN / []), which is skipped.
    Every non-empty entry is taken into account, not just the first one,
    so ids already assigned to any group are never handed out a second time.
    """
    chosen=[np.atleast_1d(x) for x in chosen_entries if str(x) != 'nan' and str(x) !='[]']
    chosen_id=np.concatenate(chosen) if chosen else np.array([], dtype=int)
    return np.setdiff1d(np.array(all_ids), chosen_id).tolist()


#parse the columns(classtomix_l, classtomix_l_catid) based on the rules
#0. if either column is entirely empty there is nothing to mix
if classtomix_df.isnull().all() or classtomix_df_catid.isnull().all():
    raise RuntimeError("Go Back to the excel and add something to classtomix_l and classtomix_l_catid")
else:
    for ic, classes in enumerate(classtomix_df):
        #1. if only classtomix_l_catid empty - handle the special keywords
        # NOTE(review): a plain class name without ids is left untouched here,
        # although the 'rules' text says the ids are filled on import - confirm.
        if not pd.isnull(classtomix_df.iloc[ic]) and pd.isnull(classtomix_df_catid.iloc[ic]):
            if classtomix_df.iloc[ic] == 'group_remain_classes': # group the remaining ids 
                # All not-yet-chosen ids become ONE group on this row.
                classtomix_l_catid[ic]=_remaining_cat_ids(new_cat_l_id, classtomix_l_catid)
                #classtomix_l[ic]="remain_group" #use this to give another name 
                break #NOTE : break because this should be the last entry
            if classtomix_df.iloc[ic] == 'add_remain_classes': # add the remaining 
                del classtomix_l[ic] #remove "add_remain_classes" from classes list
                # Every not-yet-chosen id is appended as its own single-class group,
                # keeping its original name.
                remain_id_l=_remaining_cat_ids(new_cat_l_id, classtomix_l_catid)
                for remain_id in remain_id_l:
                    classtomix_l_catid.append([remain_id])
                    classtomix_l.append([new_cat_l[new_cat_l_id.index(remain_id)]]) #id to name
                break #NOTE : break because this should be the last entry 
        #2. if only classtomix_l empty - raise
        if pd.isnull(classtomix_df.iloc[ic]) and not pd.isnull(classtomix_df_catid.iloc[ic]): 
            raise RuntimeError('only classtomix_l empty')
        #3. if classtomix_l and classtomix_l_catid not empty - maintain
        #3. if classtomix_l and classtomix_l_catid not empty - mantain

        

Unnamed: 0,mixers_l,mixers_l_catid,mixers_l_catdset,classtomix_l,classtomix_l_catid,rules,annotations_missing,images_missing,aggressive_keywords,class
0,acorn,[1],['TAO'],aggressive,"[59, 137, 177, 263, 312, 360, 407, 460, 476, 5...",To fix classes: 1) You need to fill the column...,[1],[0],knife,"knife(626), butcher_knife(177), pocketknife(83..."
1,aerosol_can,[2],['TAO'],group_remain_classes,,,[1],[0],weapon,"bow_(weapon)(137), projectile_(weapon)(861)"
2,air_conditioner,[3],['TAO'],,,,[1],[0],gun,"gun(534),machine_gun(672),water_gun(1191)"
3,airplane,[4],['TAO'],,,,[0],[0],bat,baseball bat(59)
4,alarm_clock,[5],['TAO'],,,,[1],[0],fork,"fork(476),pitchfork(829)"
...,...,...,...,...,...,...,...,...,...,...
1225,yak,[1226],['TAO'],,,,[1],[0],,
1226,yogurt,[1227],['TAO'],,,,[1],[0],,
1227,yoke_(animal_equipment),[1228],['TAO'],,,,[1],[0],,
1228,zebra,[1229],['TAO'],,,,[0],[0],,


In [58]:
print('>> Make sure everything is correct: \n1.Drop NaN if exist, but make sure the index is the same for the two! \n2.Put classtomix_l and classtomix_l_catid as a list of lists\n3. Change name of classes if you want\n')
fixempty=True
if fixempty:
    # Drop the empty markers (NaN / []) left by unfilled Excel rows.
    classtomix_l=[x for x in classtomix_l if str(x) != 'nan' and str(x) !='[]']
    classtomix_l_catid=[x for x in classtomix_l_catid if str(x) != 'nan' and str(x) !='[]']
#classtomix_l[1]=['non_aggressive'] #uncomment and change name
print(len(classtomix_l))
print(len(classtomix_l_catid))
# The two lists are filtered independently, so a row with only one column filled
# would shift one list against the other. Later cells pair them by position,
# therefore stop here instead of silently mixing the wrong classes.
if len(classtomix_l)!=len(classtomix_l_catid):
    raise RuntimeError('classtomix_l ({}) and classtomix_l_catid ({}) have different lengths: check the excel rows'.format(len(classtomix_l), len(classtomix_l_catid)))

>> Make sure everything is correct: 
1.Drop NaN if exist, but make sure the index is the same for the two! 
2.Put classtomix_l and classtomix_l_catid as a list of lists
3. Change name of classes if you want

2
2


In [59]:
# Show the last group: its name(s) first, then its category ids.
for last_entry in (classtomix_l[-1], classtomix_l_catid[-1]):
    print(last_entry)

['non_aggressive']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 22

## 4. Mix classes

In [60]:
# slow # newjson=copy.deepcopy(molajson) #do deepcopy to compare
# fast
# Work on a copy of the two keys that get rewritten. The category dicts are
# copied one level deep: a plain list copy would share them with molajson, so
# renaming / re-numbering a category in newjson would also change the original
# and make the later "compare with molajson" checks meaningless.
newjson={'categories':[],'annotations':[] }
newjson['categories']=[dict(cat) for cat in molajson['categories']]
newjson['annotations']=copy.copy(molajson['annotations'])

In [61]:
# For each group, translate its category ids into positions inside cat_l_id.
classtomix_l_catidx=[
    [cat_l_id.index(cat_id) for cat_id in group_ids]
    for group_ids in classtomix_l_catid
]
#print(classtomix_l_catidx) # they should be less one, because it is ordered
print(len(classtomix_l_catidx))

2


#### Change molajson['categories']: [{name: , id: }]  
=> 1. use the first category id of each group; 2. change its name; 3. remove the other categories (!!!without reordering the category ids yet!!!)

In [62]:
# CHANGE NAME  & GET REMOVE List
# For every group the FIRST category is kept and renamed to the group name;
# every category that is not the first of some group is scheduled for removal.
keepidx_l=[]
keepid_l=[]
firstidx=0 # get first category id
for i,id_l in enumerate(tqdm(classtomix_l_catid)): #for each classtomix
    firstcatid=id_l[firstidx] # #category id 
    firstcatidx=classtomix_l_catidx[i][firstidx]# get cat index of first catid
    # Check index/id agreement BEFORE renaming, so a mismatch never leaves a
    # wrongly renamed category behind.
    assert newjson['categories'][firstcatidx]['id']==firstcatid #assert id - it should be the same
    if isinstance(classtomix_l[i], list): newjson['categories'][firstcatidx]['name']=classtomix_l[i][firstidx] #change name of first id 
    else: newjson['categories'][firstcatidx]['name']=classtomix_l[i]
    keepidx_l.append(firstcatidx) #catidx to keep
    keepid_l.append(firstcatid) #catid to keep
keepidx_l=list(dict.fromkeys(keepidx_l)) # remove duplicates in the keep list
allidx_l=list(range(len(molajson['categories']))) # allidx in categories
keepidx_set=set(keepidx_l) # O(1) membership tests
removeidx_l=[idx for idx in allidx_l if idx not in keepidx_set] # remove idx 
removeitem_l=[newjson['categories'][removeidx] for removeidx in removeidx_l] #remove items #WARNING NECESSARY BECAUSE THE INDEX WILL CHANGE
print(len(allidx_l))
print(len(removeidx_l))
print(len(allidx_l)-len(removeidx_l))
# Sanity check: no kept index may be scheduled for removal (expected: False).
# (`keepidx_l in removeidx_l` only tested whether the list itself was an element.)
print(any(idx in keepidx_set for idx in removeidx_l))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]

1230
1228
2
False





REMOVE CLASSES

In [63]:
# REMOVE - newjson will be changed
# Drop the scheduled categories in one pass, matching on category id. Unlike
# repeated list.remove() calls this is linear and can be re-run safely
# (a second run simply finds nothing left to drop).
removecatid_set={removeitem['id'] for removeitem in removeitem_l}
newjson['categories'][:]=[cat for cat in newjson['categories'] if cat['id'] not in removecatid_set]

In [64]:
# Last category after the removal (newjson) next to the original one (molajson).
for source_json in (newjson, molajson):
    print(source_json['categories'][-1])

{'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'image_count': 70, 'instance_count': 248, 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'aggressive'}
{'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'image_count': 1, 'instance_count': 7, 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}


REORDER IDs

In [65]:
# GET NEW IDs - REORDER IDs - #WARNING after remove
# Ids of the categories that survived the removal, in their current order.
ct_l_id=[c['id'] for c in tqdm(newjson['categories'])]
# Position of every kept id inside the surviving list, following the Excel order.
newidx_l=[ct_l_id.index(keep_id) for keep_id in keepid_l] # make sure same sequence of keepid_l #SAME ORDER AS EXCEL
# New consecutive ids 1..N, one per kept id.
newid_l=list(range(1, len(keepid_l)+1)) #reorder keepid_l
for id_view in (keepid_l, newidx_l, newid_l):
    print(id_view)

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2002.53it/s]

[59, 1]
[1, 0]
[1, 2]





In [66]:
# SORT IDS - Reorder based on Excel order - newjson will be changed
# categories_l is a separate list holding the same dicts, so it keeps the old
# order while newjson['categories'] is rewritten slot by slot.
categories_l=copy.copy(newjson['categories'])
for new_pos, old_pos in enumerate(newidx_l):
    category=categories_l[old_pos]
    category['id']=newid_l[new_pos] # assign the new consecutive id
    newjson['categories'][new_pos]=category #TODO sort the id in the correct sequence

In [67]:
#TEST
# Sizes of the three category lists, then the last category of the mixed and
# of the original json.
for category_list in (categories_l, newjson['categories'], molajson['categories']):
    print(len(category_list))
for source_json in (newjson, molajson):
    print(source_json['categories'][-1])

2
2
1230
{'frequency': 'r', 'id': 2, 'synset': 'acorn.n.01', 'image_count': 0, 'instance_count': 0, 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'non_aggressive'}
{'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'image_count': 1, 'instance_count': 7, 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}


### QUESTION: REMOVE HYPERPARAMETERS? Maintain only id and NAME? OR irrelevant?

#### change molajson['annotations']: [{category_id: , }] 
=> 1.get annotation idx from catid; 2.update annotations id ; 3. update newjson['annotations']

In [68]:
# 1.get annotation idx from classtomix_l_catid
ann_catid_np=np.array(ann_catid)
# group -> category id -> positions of the annotations carrying that category id
classtomix_l_ann_catidx=[]
for group_ids in classtomix_l_catid:
    classtomix_l_ann_catidx.append([np.where(ann_catid_np==cat_id)[0].tolist() for cat_id in group_ids])
for summary in (classtomix_l, classtomix_l_catid, keepid_l, newid_l):
    print(summary)
print(len(classtomix_l_ann_catidx[0]))

[['aggressive'], ['non_aggressive']]
[[59, 137, 177, 263, 312, 360, 407, 460, 476, 534, 541, 552, 574, 596, 626, 628, 672, 731, 826, 829, 839, 841, 861, 889, 900, 943, 986, 1027, 1067, 1176, 1191], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 185

In [69]:
#2.update annotations ids & 3. update newjson['annotations'] with only the annotations from classtomix_l_catid
# Rebuild newjson['annotations'] from the original annotations. Each selected
# annotation is copied before its category_id is rewritten: editing the shared
# dict in place would also rewrite molajson['annotations'], so the original
# could no longer be compared against the mixed result.
newjson['annotations']=[] #clear
for i, ann_catidx_l in enumerate(classtomix_l_ann_catidx): #only append annotations that belong to a group
    for catidx_l in ann_catidx_l:
        for catidx in catidx_l:
            ann=dict(molajson['annotations'][catidx]) # one-level copy of the annotation
            ann['category_id']=newid_l[i] # update catid
            newjson['annotations'].append(ann) #update newjson (the original annotation order is lost)
print(len(molajson['annotations']))
print(len(newjson['annotations']))

83875
83875


In [70]:
#TEST
# Last annotation of the original json, then of the mixed json.
for source_json in (molajson, newjson):
    print(source_json['annotations'][-1])

{'segmentation': [[946, 388, 1121, 388, 1121, 438, 946, 438]], 'bbox': [946, 388, 175, 50], 'area': 8750, 'iscrowd': 0, 'id': 107417, 'image_id': 24033, 'category_id': 2, 'track_id': 3880, '_scale_uuid': '2b951a18-d685-4b44-a43e-7b834eaaade6', 'scale_category': 'moving object', 'video_id': 612}
{'segmentation': [[621, 234, 813, 234, 813, 379, 621, 379]], 'bbox': [621, 234, 192, 145], 'area': 27840, 'iscrowd': 0, 'id': 331425, 'image_id': 89466, 'category_id': 2, 'track_id': 14355, '_scale_uuid': 'e9194b27-7a6c-48b0-baee-4e1a3988b270', 'scale_category': 'pet', 'video_id': 2362}


### 4. Save mixed json

In [71]:
# fast
# Put the mixed categories and annotations back into molajson (list copies),
# so the remaining keys of the original json are saved unchanged.
for mixed_key in ('categories', 'annotations'):
    molajson[mixed_key]=copy.copy(newjson[mixed_key])

In [72]:
# save
# Write the mixed json next to the input file, with a '_mix' suffix.
print('\n >> SAVING...')
jsonfile='{}annotations/{}_mix.json'.format(rdir, injsonfile)
with open(jsonfile, 'w') as f:
    json.dump(molajson, f)
print("JSON SAVED : {} \n".format(jsonfile))
# Summary of what was saved: entries per key and the final category names.
for k in molajson:
    print(k, len(molajson[k]))
cat_l=[c['name'] for c in molajson['categories']]
print(cat_l)


 >> SAVING...
JSON SAVED : D:/external_datasets/MOLA/annotations/split_tao_original/train_mix.json 

info 6
images 54649
annotations 83875
categories 2
licenses 1
videos 1488
tracks 8132
['aggressive', 'non_aggressive']


### 5. TEST MIX ANNOTATIONS DUPLICATES

In [95]:
# Load the mixed json to be checked for duplicated annotation ids.
# The context manager closes the file handle once parsing is done.
with open(rdir+'annotations/split_mola_fix_equal_reorder_nomissings/mix_aggressive_addremainclasses/test.json') as f:
    molajson = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/external_datasets/MOLA/annotations/split_mola_fix_equal_reorder_nomissings/mix_aggressive_addremainclasses/test.json'

In [None]:
# Number of entries under each top-level key of the loaded json.
for key, entries in molajson.items():
    print(key, len(entries))

In [None]:
# annotation ids (the 'id' field of every annotation)
ann_ids=[an['id'] for an in tqdm(molajson['annotations'])]
print(len(ann_ids))

#TEST duplicates v3 -faster
# Count how often each id occurs; ids seen more than once are duplicates.
unique_ids, id_counts = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= unique_ids[id_counts > 1].tolist()
print(len(duplicates_l))

In [None]:
# category names of the loaded json, in file order
cat_l=[category['name'] for category in molajson['categories']]

In [None]:
# How many categories there are, then their names.
for cat_summary in (len(cat_l), cat_l):
    print(cat_summary)
