# Clean Classes=Categories with missing annotations or images
version: 1

info: 
- clean_cats: Can be used to remove classes with missing annotations or images:
    0. If not mola.json : Manually add dataset descriptors when importing id, e.g. COCO(see below)
    1. method="save_images" (if you didn't do it already)
    2. don't alter the excel
- reorder_ids: It also reorders category id

author: nuno costa

In [78]:
from annotate_v5 import *
import platform 
import numpy as np
import pandas as pd
from IPython.display import Image, display
import copy
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from matplotlib.image import imread
from matplotlib.patches import Rectangle
import random

In [79]:
#Define root dir dependent on OS
rdir_dsets='D:/external_datasets/' #WARNING: DATASETS ROOT is OK?
rdir='D:/external_datasets/MOLA/'  
if str(platform.platform()).find('linux')>-1:
    dirdir_dsets=rdir_dsets.replace('D:/','/mnt/d/')
    rdir=rdir.replace('D:/','/mnt/d/')
print('OS: {}'.format(platform.platform()))
print('root datasets dir: {}'.format(rdir_dsets))
print('root dir: {}'.format(rdir))

OS: Windows-10-10.0.21292-SP0
root datasets dir: D:/external_datasets/
root dir: D:/external_datasets/MOLA/


In [92]:
#jsonfile
injsonfile="coco2017" #"split_mola_fix_equal/test"
molajson =  json.load(open(rdir+'annotations/'+injsonfile+'.json'))
for k in molajson:
    print(k, len(molajson[k]))

info 6
licenses 8
images 123287
annotations 1170251
categories 80


## 1. Import ids
#### #NOTE: work with ids and index so you can use numpy for faster operations

In [93]:
# datasets name and id
dset_l=[]
dset_l_id=[]
try:
    for d in molajson['datasets']:
        dset_l.append(d['name'])
        dset_l_id.append(d['id'])
except: #manually add for example for only COCO
    dset_l=['COCO']
    dset_l_id=[1]
print(dset_l, dset_l_id)

['COCO'] [1]


In [94]:
# categories name and id
cat_l=[]
cat_l_id=[]
cat_l_dset=[]
for c in molajson['categories']:
    cat_l.append(c['name'])
    cat_l_id.append(c['id'])
    try:
        cat_l_dset.append(dset_l[c['dataset']-1]) # dset_l index is same as id-1
    except:
        cat_l_dset.append(dset_l[0])
#print(cat_l_id)

In [95]:
# images filepath and id
img_l=[]
img_l_id=[]
for c in molajson['images']:
    img_l.append(c['file_name'])
    img_l_id.append(c['id'])

In [96]:
# annotations category_id, image_id, bbox, and dataset
ann_catid=[]
ann_imgid=[]
ann_bbox=[]
ann_dset=[]
for an in tqdm(molajson['annotations']):
    ann_catid.append(an['category_id'])
    ann_imgid.append(an['image_id'])
    ann_bbox.append(an['bbox'])
    try:
        ann_dset.append(an['dataset'])
    except:
        ann_dset.append(dset_l_id[0])

100%|██████████████████████████████████████████████████████████████████| 1170251/1170251 [00:01<00:00, 785834.27it/s]


## 2. Find cleaners cat_ids
cleaners example
categories= [{name:cow, id:1, dataset:1},...,{name:cow, id:200, dataset:2},...,{name:cow, id:101, dataset:3}]}

In [97]:
#cleaners #TODO: SORT alphabetically
cleaners_l=[]
cleaners_l_catid=[]
cleaners_l_catdset=[]
cleaner_method="all_cats"
if cleaner_method=="all_cats": #Do for all category names, even with equal 
    cleaners_l=cat_l
    cleaners_l_catid=[[id] for id in cat_l_id]
    cleaners_l_catdset=[[dset] for dset in cat_l_dset]
    

print(cleaners_l[0:5])
print(cleaners_l_catid[0:5])
print(cleaners_l_catdset[0:5])
print(len(cleaners_l))
print(len(cleaners_l_catid))
print(len(cleaners_l_catdset))

['person', 'bicycle', 'car', 'motorcycle', 'airplane']
[[1], [2], [3], [4], [5]]
[['COCO'], ['COCO'], ['COCO'], ['COCO'], ['COCO']]
80
80
80


In [98]:
# get annotations cleaners
ann_catid_np=np.array(ann_catid)
ann_imgid_np=np.array(ann_imgid)
ann_bbox_np=np.array(ann_bbox)
ann_dset_np=np.array(ann_dset)
cleaners_l_imgid=[]
cleaners_l_bbox=[]
cleaners_l_dset=[]
for catids in tqdm(cleaners_l_catid):
    l_imgid=[]
    l_bbox=[]
    l_dset=[]
    for catid in catids:
        ann_idx = np.where(ann_catid_np==catid)[0].tolist() #annotation index of ids
        l_imgid.append(ann_imgid_np[ann_idx].tolist())
        l_bbox.append(ann_bbox_np[ann_idx].tolist())
        l_dset.append(ann_dset_np[ann_idx].tolist())
    cleaners_l_imgid.append(l_imgid)
    cleaners_l_bbox.append(l_bbox)
    cleaners_l_dset.append(l_dset)

100%|████████████████████████████████████████████████████████████████████████████████| 80/80 [00:02<00:00, 35.73it/s]


## 3. Classes|categories to clean w/ EXCEL report

In [102]:
#INIT VARS
classtoclean_l=[]
classtoclean_l_catid=[]
method="save_images" #"save_images": to save new images and excel for manual inspection; "": already saved don't need to repeat and the excel are
datadir="cleaners/"+injsonfile+"/" #root folder to save cleaner method . #WARNING cleaners/original json that was used to save images and excel
folder=cleaner_method+'/' #folder to save images and exel 
showimage=False #show images
startidx=0 # start index of image to save from each dataset
imgnr=5 # total number of images to save from each dataset
imgstep='random' # step between images: int | 'random' - int steps between images; 'rand' gets random list
#paths
path=os.path.join(rdir,datadir,folder) #path to folder
assure_path_exists(path)
excelpath=path+cleaner_method+"_v1.xlsx"#path+cleaner_method+"_classtoclean_report.xlsx"#path to excel

In [100]:
#METHODS
if method=="save_images": # save images and excel to folder for manual edit
    df=pd.DataFrame({'cleaners_l': cleaners_l,'cleaners_l_catid': cleaners_l_catid, 'cleaners_l_catdset': cleaners_l_catdset, 'classtoclean_l': np.nan, 'classtoclean_l_catid':np.nan, 'rules':np.nan })
    df.loc[0, 'rules']="To fix classes: 1) You need to fill the column classtoclean_l and/or classtoclean_l_catid with the information from the respective cleaner columns; 2) When copy/pasting or changing, make sure the same structure maintains:  ['car', 'carrot'], [3, 52], beware of the spaces ['car', '  and always maintain the first class in the list;  3) You have 3 possibilities of filling the columns : 1-the 2 columns empty, meaning the row will not be used for classtoclean; 2-only one column empty, e.g. fill the classtotix_l row with the class labels from cleaners_l, then during the importing the classtoclean_l_catid is filled, and vice-versa; 3-If you want to change the name of the first class in the list,e.g ['car', 'carrot'] for ['automobile', 'carrot'] you need to provide the ids to classtoclean_l_catid."
    df['annotations_missing'] = np.empty((len(df), 0)).tolist()
    df['images_missing'] = np.empty((len(df), 0)).tolist()
    #save image for each cleaner
    for i, cleaner in enumerate(tqdm(cleaners_l)): #run for each cleaner category
        firstclass=cleaner
        if isinstance(firstclass, list): firstclass=firstclass[0] #first class
        print('\n>> '+firstclass+'...') #class
        classpath=os.path.join(path, firstclass) # path to folder for images of  firstclass
        classpath=parse_path(classpath)+'/' #make it a folder
        assure_path_exists(classpath)
        df=save_imgs(df, rdir_dsets, classpath, i, dset_l, cleaners_l, cleaners_l_catid, cleaners_l_bbox, cleaners_l_dset,
              cleaners_l_imgid, img_l, img_l_id, startidx=startidx, imgnr=imgnr, imgstep=imgstep, showimage=showimage)    
    df.to_excel(excelpath, index=False)   

  0%|                                                                                         | 0/80 [00:00<?, ?it/s]


>> person...
COCO


  1%|█                                                                                | 1/80 [00:00<01:07,  1.17it/s]


>> bicycle...
COCO



>>> Finding COCO bicycle...:   0%|                                                          | 0/7429 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000076416.jpg'



  2%|██                                                                               | 2/80 [00:01<00:59,  1.31it/s]


>> car...
COCO


  4%|███                                                                              | 3/80 [00:01<00:54,  1.41it/s]


>> motorcycle...
COCO


  5%|████                                                                             | 4/80 [00:02<00:49,  1.52it/s]


>> airplane...
COCO



>>> Finding COCO airplane...:   0%|                                                         | 0/5278 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000052412.jpg'



  6%|█████                                                                            | 5/80 [00:03<00:46,  1.60it/s]


>> bus...
COCO


  8%|██████                                                                           | 6/80 [00:03<00:43,  1.69it/s]


>> train...
COCO


  9%|███████                                                                          | 7/80 [00:04<00:41,  1.77it/s]


>> truck...
COCO



>>> Finding COCO truck...:   0%|                                                           | 0/10388 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000033759.jpg'


>>> Finding COCO truck...:   0%|                                                           | 0/10388 [00:00<?, ?it/s]
 10%|████████                                                                         | 8/80 [00:04<00:45,  1.57it/s]


>> boat...
COCO


 11%|█████████                                                                        | 9/80 [00:05<00:42,  1.67it/s]


>> traffic light...
COCO


 12%|██████████                                                                      | 10/80 [00:05<00:39,  1.79it/s]


>> fire hydrant...
COCO



>>> Finding COCO fire hydrant...:   0%|                                                     | 0/1966 [00:00<?, ?it/s][A
 14%|███████████                                                                     | 11/80 [00:06<00:36,  1.87it/s]

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000296657.jpg'

>> stop sign...
COCO


 15%|████████████                                                                    | 12/80 [00:06<00:34,  1.95it/s]


>> parking meter...
COCO


 16%|█████████████                                                                   | 13/80 [00:07<00:35,  1.89it/s]


>> bench...
COCO


 18%|██████████████                                                                  | 14/80 [00:07<00:34,  1.94it/s]


>> bird...
COCO


 19%|███████████████                                                                 | 15/80 [00:08<00:32,  1.98it/s]


>> cat...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000240940.jpg'



>>> Finding COCO cat...:   0%|                                                              | 0/4970 [00:00<?, ?it/s][A
 20%|████████████████                                                                | 16/80 [00:08<00:32,  1.97it/s]


>> dog...
COCO


 21%|█████████████████                                                               | 17/80 [00:09<00:33,  1.87it/s]


>> horse...
COCO


 22%|██████████████████                                                              | 18/80 [00:10<00:33,  1.86it/s]


>> sheep...
COCO


 24%|███████████████████                                                             | 19/80 [00:10<00:32,  1.85it/s]
>>> Finding COCO cow...:   0%|                                                              | 0/8527 [00:00<?, ?it/s][A



>> cow...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000347664.jpg'


 25%|████████████████████                                                            | 20/80 [00:11<00:31,  1.88it/s]


>> elephant...
COCO


 26%|█████████████████████                                                           | 21/80 [00:11<00:32,  1.81it/s]


>> bear...
COCO


 28%|██████████████████████                                                          | 22/80 [00:12<00:31,  1.87it/s]


>> zebra...
COCO


 29%|███████████████████████                                                         | 23/80 [00:12<00:30,  1.87it/s]


>> giraffe...
COCO


 30%|████████████████████████                                                        | 24/80 [00:13<00:28,  1.94it/s]


>> backpack...
COCO



>>> Finding COCO backpack...:   0%|                                                         | 0/9091 [00:00<?, ?it/s][A
 31%|█████████████████████████                                                       | 25/80 [00:13<00:27,  1.97it/s]

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000492284.jpg'

>> umbrella...
COCO


 32%|██████████████████████████                                                      | 26/80 [00:14<00:27,  1.95it/s]


>> handbag...
COCO


 34%|███████████████████████████                                                     | 27/80 [00:14<00:26,  1.99it/s]


>> tie...
COCO



>>> Finding COCO tie...:   0%|                                                              | 0/6750 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000456559.jpg'



 35%|████████████████████████████                                                    | 28/80 [00:15<00:27,  1.93it/s]


>> suitcase...
COCO


 36%|█████████████████████████████                                                   | 29/80 [00:15<00:25,  1.98it/s]


>> frisbee...
COCO


 38%|██████████████████████████████                                                  | 30/80 [00:16<00:26,  1.92it/s]


>> skis...
COCO


 39%|███████████████████████████████                                                 | 31/80 [00:16<00:25,  1.90it/s]


>> snowboard...
COCO


 40%|████████████████████████████████                                                | 32/80 [00:17<00:24,  1.93it/s]


>> sports ball...
COCO


 41%|█████████████████████████████████                                               | 33/80 [00:17<00:23,  2.00it/s]


>> kite...
COCO


 42%|██████████████████████████████████                                              | 34/80 [00:18<00:22,  2.02it/s]


>> baseball bat...
COCO


 44%|███████████████████████████████████                                             | 35/80 [00:18<00:22,  2.03it/s]


>> baseball glove...
COCO


 45%|████████████████████████████████████                                            | 36/80 [00:19<00:21,  2.01it/s]


>> skateboard...
COCO


 46%|█████████████████████████████████████                                           | 37/80 [00:19<00:22,  1.89it/s]


>> surfboard...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000027972.jpg'



>>> Finding COCO surfboard...:   0%|                                                        | 0/6395 [00:00<?, ?it/s][A
 48%|██████████████████████████████████████                                          | 38/80 [00:20<00:22,  1.88it/s]


>> tennis racket...
COCO


 49%|███████████████████████████████████████                                         | 39/80 [00:20<00:21,  1.89it/s]


>> bottle...
COCO


 50%|████████████████████████████████████████                                        | 40/80 [00:21<00:21,  1.87it/s]


>> wine glass...
COCO


 51%|█████████████████████████████████████████                                       | 41/80 [00:21<00:20,  1.89it/s]


>> cup...
COCO


 52%|██████████████████████████████████████████                                      | 42/80 [00:22<00:19,  1.90it/s]


>> fork...
COCO


 54%|███████████████████████████████████████████                                     | 43/80 [00:22<00:19,  1.90it/s]


>> knife...
COCO


 55%|████████████████████████████████████████████                                    | 44/80 [00:23<00:19,  1.89it/s]


>> spoon...
COCO


 56%|█████████████████████████████████████████████                                   | 45/80 [00:24<00:18,  1.90it/s]


>> bowl...
COCO


 57%|██████████████████████████████████████████████                                  | 46/80 [00:24<00:17,  1.98it/s]


>> banana...
COCO



>>> Finding COCO banana...:   0%|                                                           | 0/9837 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000581781.jpg'




>>> Finding COCO banana...:   0%|                                                   | 1/9837 [00:00<09:38, 17.00it/s][A
 59%|███████████████████████████████████████████████                                 | 47/80 [00:25<00:17,  1.93it/s]

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000066706.jpg'

>> apple...
COCO


 60%|████████████████████████████████████████████████                                | 48/80 [00:25<00:16,  1.98it/s]


>> sandwich...
COCO


 61%|█████████████████████████████████████████████████                               | 49/80 [00:25<00:15,  2.05it/s]


>> orange...
COCO


 62%|██████████████████████████████████████████████████                              | 50/80 [00:26<00:15,  1.93it/s]


>> broccoli...
COCO


 64%|███████████████████████████████████████████████████                             | 51/80 [00:27<00:14,  1.99it/s]


>> carrot...
COCO



>>> Finding COCO carrot...:   0%|                                                           | 0/8223 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000456143.jpg'



 65%|████████████████████████████████████████████████████                            | 52/80 [00:27<00:14,  1.98it/s]


>> hot dog...
COCO


 66%|█████████████████████████████████████████████████████                           | 53/80 [00:28<00:14,  1.92it/s]


>> pizza...
COCO


 68%|██████████████████████████████████████████████████████                          | 54/80 [00:28<00:13,  1.87it/s]


>> donut...
COCO


 69%|███████████████████████████████████████████████████████                         | 55/80 [00:29<00:13,  1.92it/s]


>> cake...
COCO


 70%|████████████████████████████████████████████████████████                        | 56/80 [00:29<00:12,  1.93it/s]


>> chair...
COCO



>>> Finding COCO chair...:   0%|                                                           | 0/40282 [00:00<?, ?it/s][A

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000112378.jpg'



 71%|█████████████████████████████████████████████████████████                       | 57/80 [00:30<00:11,  1.92it/s]


>> couch...
COCO


 72%|██████████████████████████████████████████████████████████                      | 58/80 [00:30<00:11,  1.90it/s]


>> potted plant...
COCO


 74%|███████████████████████████████████████████████████████████                     | 59/80 [00:31<00:10,  1.98it/s]


>> bed...
COCO


 75%|████████████████████████████████████████████████████████████                    | 60/80 [00:31<00:10,  1.98it/s]
>>> Finding COCO dining table...:   0%|                                                    | 0/16411 [00:00<?, ?it/s][A



>> dining table...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000216497.jpg'


 76%|█████████████████████████████████████████████████████████████                   | 61/80 [00:32<00:09,  1.95it/s]


>> toilet...
COCO


 78%|██████████████████████████████████████████████████████████████                  | 62/80 [00:32<00:09,  1.99it/s]


>> tv...
COCO


 79%|███████████████████████████████████████████████████████████████                 | 63/80 [00:33<00:08,  2.03it/s]
>>> Finding COCO laptop...:   0%|                                                           | 0/5201 [00:00<?, ?it/s][A



>> laptop...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000306139.jpg'


 80%|████████████████████████████████████████████████████████████████                | 64/80 [00:33<00:08,  1.81it/s]


>> mouse...
COCO


 81%|█████████████████████████████████████████████████████████████████               | 65/80 [00:34<00:10,  1.42it/s]


>> remote...
COCO


 82%|██████████████████████████████████████████████████████████████████              | 66/80 [00:37<00:18,  1.30s/it]


>> keyboard...
COCO


 84%|███████████████████████████████████████████████████████████████████             | 67/80 [00:38<00:13,  1.06s/it]


>> cell phone...
COCO


 85%|████████████████████████████████████████████████████████████████████            | 68/80 [00:38<00:10,  1.13it/s]


>> microwave...
COCO


 86%|█████████████████████████████████████████████████████████████████████           | 69/80 [00:39<00:08,  1.32it/s]


>> oven...
COCO


 88%|██████████████████████████████████████████████████████████████████████          | 70/80 [00:39<00:06,  1.46it/s]


>> toaster...
COCO


 89%|███████████████████████████████████████████████████████████████████████         | 71/80 [00:39<00:05,  1.64it/s]


>> sink...
COCO



>>> Finding COCO sink...:   0%|                                                             | 0/5835 [00:00<?, ?it/s][A
 90%|████████████████████████████████████████████████████████████████████████        | 72/80 [00:40<00:04,  1.74it/s]

[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000520910.jpg'

>> refrigerator...
COCO
[Errno 2] No such file or directory: 'D:/external_datasets/COCO/2017/images/val2017/000000221708.jpg'



>>> Finding COCO refrigerator...:   0%|                                                     | 0/2763 [00:00<?, ?it/s][A
 91%|█████████████████████████████████████████████████████████████████████████       | 73/80 [00:40<00:03,  1.85it/s]


>> book...
COCO


 92%|██████████████████████████████████████████████████████████████████████████      | 74/80 [00:41<00:03,  1.84it/s]


>> clock...
COCO


 94%|███████████████████████████████████████████████████████████████████████████     | 75/80 [00:41<00:02,  1.90it/s]


>> vase...
COCO


 95%|████████████████████████████████████████████████████████████████████████████    | 76/80 [00:42<00:02,  1.95it/s]


>> scissors...
COCO


 96%|█████████████████████████████████████████████████████████████████████████████   | 77/80 [00:42<00:01,  2.01it/s]


>> teddy bear...
COCO


 98%|██████████████████████████████████████████████████████████████████████████████  | 78/80 [00:43<00:00,  2.04it/s]


>> hair drier...
COCO


 99%|███████████████████████████████████████████████████████████████████████████████ | 79/80 [00:43<00:00,  1.96it/s]


>> toothbrush...
COCO


100%|████████████████████████████████████████████████████████████████████████████████| 80/80 [00:44<00:00,  1.80it/s]


In [113]:
#IMPORT EXCEL MANUAL EDIT #WARNING: CHECK EXCEL FIRST (#NOTE: donte use classes with missing annotations and images)
df=pd.read_excel(excelpath)
classtoclean_df=df.loc[:,'classtoclean_l']
classtoclean_df_catid=df.loc[:,'classtoclean_l_catid']
cleaners_df_catid=df.loc[:,'cleaners_l_catid']
annotations_missing_df=df.loc[:,'annotations_missing']
images_missing_df=df.loc[:,'images_missing']
new_cat_l=copy.deepcopy(cat_l)
new_cat_l_id=copy.deepcopy(cat_l_id)
display(df)

# PARSE COLUMNS TO FIX
classtoclean_l=[]
classtoclean_l_catid=[]
cleaners_l_catid=cleaners_df_catid.tolist()
annotations_missing=annotations_missing_df.tolist()
images_missing=images_missing_df.tolist()
#convert strings to lists
for row, mx in enumerate(cleaners_l_catid):
    if isinstance(cleaners_l_catid[row], str): cleaners_l_catid[row]=convert_unicode(cleaners_l_catid[row], method='itemnum')
    if isinstance(annotations_missing[row], str): annotations_missing[row]=convert_unicode(annotations_missing[row], method='itemnum')
    if isinstance(images_missing[row], str): images_missing[row]=convert_unicode(images_missing[row], method='itemnum')


#remove classes with missing annotations and images 
removeidx_l=[]
remove_counter=0
remove_missings=True #remove classes with missing annotations or images
if remove_missings:
    for ir,catid in enumerate(new_cat_l_id):
        clean_idx = cleaners_l_catid.index(catid) #index of cat in excel cleaners_l
        if annotations_missing[clean_idx]==1 or images_missing[clean_idx]==1:
            #print(">> removing {},{}: annotations_missing={}, images_missing={} ".format(new_cat_l[ir],new_cat_l_id[ir],annotations_missing[clean_idx],images_missing[clean_idx] ))
            remove_counter+=1
            removeidx_l.append(ir)
            #NOTE: we assume (classtoclean_l_catid) are classes with no missing annotations and images
    removeitem_l=[new_cat_l[removeidx] for removeidx in removeidx_l] ##remove items instead of index #WARNING NECESSARY BECAUSE THE INDEX WILL CHANGE
    for removeitem in removeitem_l: new_cat_l.remove(removeitem) 
    removeitem_l=[new_cat_l_id[removeidx] for removeidx in removeidx_l] ##remove items instead of index #WARNING NECESSARY BECAUSE THE INDEX WILL CHANGE
    for removeitem in removeitem_l: new_cat_l_id.remove(removeitem)
print(">> remove_counter: ", remove_counter )
print(">> len(new_cat_l): ", len(new_cat_l) )
print(">> len(new_cat_l_id): ", len(new_cat_l_id) )


classtoclean_l=new_cat_l
classtoclean_l_catid= [[id] for id in new_cat_l_id] #needs to be a list


        

Unnamed: 0,mixers_l,mixers_l_catid,mixers_l_catdset,classtomix_l,classtomix_l_catid,rules,annotations_missing,images_missing
0,person,[1],['COCO'],,,To fix classes: 1) You need to fill the column...,[0],[0]
1,bicycle,[2],['COCO'],,,,[0],[0]
2,car,[3],['COCO'],,,,[0],[0]
3,motorcycle,[4],['COCO'],,,,[0],[0]
4,airplane,[5],['COCO'],,,,[0],[0]
...,...,...,...,...,...,...,...,...
75,vase,[86],['COCO'],,,,[0],[0]
76,scissors,[87],['COCO'],,,,[0],[0]
77,teddy bear,[88],['COCO'],,,,[0],[0]
78,hair drier,[89],['COCO'],,,,[0],[0]


>> remove_counter:  0
>> len(new_cat_l):  80
>> len(new_cat_l_id):  80


In [114]:
print('>> Make sure everything is correct: 1.Drop NaN if exist, but make sure the index is the same for the two! \n')
fixempty=True
if fixempty:
    classtoclean_l=[x for x in classtoclean_l if str(x) != 'nan' and str(x) !='[]']
    classtoclean_l_catid=[x for x in classtoclean_l_catid if str(x) != 'nan' and str(x) !='[]']
print(len(classtoclean_l))
print(len(classtoclean_l_catid))

>> Make sure everything is correct: 1.Drop NaN if exist, but make sure the index is the same for the two! 

80
80


In [115]:
print(classtoclean_l[-1])
print(classtoclean_l_catid[-1])

toothbrush
[90]


## 4. clean classes

In [116]:
# slow # newjson=copy.deepcopy(molajson) #do deepcopy to compare
# fast
newjson={'categories':[],'annotations':[] }
newjson['categories']=copy.copy(molajson['categories'])
newjson['annotations']=copy.copy(molajson['annotations'])

In [117]:
classtoclean_l_catidx=[[cat_l_id.index(id) for id in id_l] for id_l in classtoclean_l_catid]
#print(classtoclean_l_catidx) # they should be less one, becacuse it is ordered
print(len(classtoclean_l_catidx))

80


#### Change molajson['categories']: [{name: , id: }]  
=>  1. use first index cat id; 2. change name and change id;  remove the other categories (!!!Without ordering again the category id!!!)

In [118]:
# CHANGE NAME  & GET REMOVE List
keepidx_l=[]
keepid_l=[]
firstidx=0 # get first category id
for i,id_l in enumerate(tqdm(classtoclean_l_catid)): #for each classtoclean
    firstcatid=id_l[firstidx] # #category id 
    firstcatidx=classtoclean_l_catidx[i][firstidx]# get cat index of first catid
    if isinstance(classtoclean_l[i], list): newjson['categories'][firstcatidx]['name']=classtoclean_l[i][firstidx] #change name of first id 
    else: newjson['categories'][firstcatidx]['name']=classtoclean_l[i]
    assert newjson['categories'][firstcatidx]['id']==firstcatid #assert id - it should be the same
    keepidx_l.append(firstcatidx) #catidx to keep
    keepid_l.append(firstcatid) #catid to keep
keepidx_l=list(dict.fromkeys(keepidx_l)) # remove duplicates in the keep list
allidx_l=[index for index, value in enumerate(molajson['categories'])] # allidx in categories
removeidx_l=[idx for idx in allidx_l if idx not in keepidx_l] # remove idx 
removeitem_l=[newjson['categories'][removeidx] for removeidx in removeidx_l] #remove items #WARNING NECESSARY BECAUSE THE INDEX WILL CHANGE
print(len(allidx_l))
print(len(removeidx_l))
print(len(allidx_l)-len(removeidx_l))
print(keepidx_l in removeidx_l)

100%|████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<?, ?it/s]

80
0
80
False





REMOVE CLASSES

In [119]:
# REMOVE - newjson will be changed
for removeitem in removeitem_l: newjson['categories'].remove(removeitem)

In [120]:
print(newjson['categories'][-1])
print(molajson['categories'][-1])

{'supercategory': 'indoor', 'id': 90, 'name': 'toothbrush'}
{'supercategory': 'indoor', 'id': 90, 'name': 'toothbrush'}


REORDER IDs

In [121]:
# GET NEW IDs - REORDER IDs - #WARNING after remove
ct_l_id=[]
for i,c in enumerate(tqdm(newjson['categories'])):
    ct_l_id.append(c['id'])
newidx_l=[ct_l_id.index(id) for id in keepid_l] # make sure same sequence of keepid_l #SAME ORDER AS EXCEL
newid_l=[i+1 for i in range(len(keepid_l))] #reorder keepid_l
print(keepid_l)
print(newidx_l)
print(newid_l)

100%|█████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 79758.57it/s]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]





In [122]:
# SORT IDS - Reorder based on Excel order - newjson will be changed
categories_l=copy.copy(newjson['categories'])
for i,idx in enumerate(newidx_l):
    categories_l[idx]['id']=newid_l[i]
for i,idx in enumerate(newidx_l):
    newjson['categories'][i]=categories_l[idx] #TODO sort the id in the correct sequence

In [123]:
#TEST
print(len(categories_l))
print(len(newjson['categories']))
print(len(molajson['categories']))
print(newjson['categories'][-1])
print(molajson['categories'][-1])

80
80
80
{'supercategory': 'indoor', 'id': 80, 'name': 'toothbrush'}
{'supercategory': 'indoor', 'id': 80, 'name': 'toothbrush'}


### QUESTION: REMOVE HYPERPARAMETERS? Mantain only id and NAME? OR irrelevant?

#### change molajson['annotations']: [{category_id: , }] 
=> 1.get annotation idx from catid; 2.update annotations id ; 3. update newjson['annotations']

In [124]:
# 1.get annotation idx from classtoclean_l_catid
ann_catid_np=np.array(ann_catid)
classtoclean_l_ann_catidx=[[np.where(ann_catid_np==id)[0].tolist()  for id in id_l] for id_l in classtoclean_l_catid]
print(classtoclean_l)
print(classtoclean_l_catid)
print(keepid_l)
print(newid_l)
print(len(classtoclean_l_ann_catidx[0]))

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
[[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [2

In [125]:
#2.update annotations ids & 3. update newjson['annotations'] with only the annotations frow classtoclean_l_catid
newjson['annotations']=copy.copy(molajson['annotations']) #reset annotations
copy_ann_l=copy.copy(newjson['annotations'])
newjson['annotations']=[] #clear
for i, ann_catidx_l in enumerate(classtoclean_l_ann_catidx): #only append annotations that 
    for catidx_l in ann_catidx_l:
        for catidx in catidx_l:
            copy_ann_l[catidx]['category_id']=newid_l[i] # update catid
            newjson['annotations'].append(copy_ann_l[catidx]) #update newjson with only the  (annotations sequence id will be lost
print(len(molajson['annotations']))
print(len(newjson['annotations']))

1170251
1170251


In [126]:
#TEST
print(molajson['annotations'][-1])
print(newjson['annotations'][-1])

{'segmentation': {'counts': [179, 27, 392, 41, 380, 51, 371, 59, 363, 67, 356, 73, 350, 79, 129, 9, 207, 82, 124, 10, 208, 84, 121, 10, 209, 85, 121, 8, 209, 88, 336, 89, 154, 1, 181, 90, 154, 2, 178, 73, 2, 16, 155, 3, 176, 62, 15, 15, 155, 4, 173, 63, 18, 12, 156, 6, 169, 64, 21, 10, 156, 6, 168, 64, 24, 7, 156, 8, 166, 64, 27, 5, 156, 9, 163, 65, 189, 10, 161, 65, 189, 12, 159, 65, 190, 12, 157, 66, 191, 13, 130, 24, 1, 67, 191, 13, 126, 95, 192, 14, 122, 98, 192, 15, 119, 99, 193, 15, 117, 100, 194, 15, 114, 103, 194, 16, 109, 106, 195, 16, 106, 109, 195, 16, 104, 111, 195, 17, 101, 113, 195, 17, 99, 115, 195, 17, 95, 119, 195, 17, 92, 122, 195, 17, 90, 124, 192, 20, 88, 126, 190, 22, 86, 128, 187, 25, 85, 129, 185, 27, 84, 130, 157, 55, 83, 132, 148, 63, 81, 135, 144, 67, 80, 136, 32, 1, 110, 67, 79, 138, 30, 3, 92, 9, 8, 67, 78, 140, 28, 4, 88, 15, 6, 67, 77, 143, 25, 6, 86, 18, 4, 67, 76, 146, 21, 8, 86, 21, 1, 67, 76, 149, 15, 12, 84, 90, 75, 177, 83, 91, 75, 178, 81, 92, 74, 1

### 4. Save cleaned json

In [128]:
# fast
molajson['categories']=copy.copy(newjson['categories'])
molajson['annotations']=copy.copy(newjson['annotations'])

In [129]:
# save
print('\n >> SAVING...')
jsonfile=rdir+injsonfile+'_clean_aggressive.json'
with open(jsonfile, 'w') as f:
    json.dump(molajson, f)
print("JSON SAVED : {} \n".format(jsonfile))
for k in molajson:
    print(k, len(molajson[k]))


 >> SAVING...
JSON SAVED : D:/external_datasets/MOLA/coco2017_mix_aggressive.json 

info 6
licenses 8
images 123287
annotations 1170251
categories 80


### 5. TEST clean ANNOTATIONS DUPLICATES

In [11]:
molajson = json.load(open(rdir+'annotations/split_mola_fix_equal_reorder_nomissings/clean_aggressive_addremainclasses/test.json'))

In [12]:
for k in molajson:
    print(k, len(molajson[k]))

info 5
licenses 9
categories 353
videos 1488
images 177936
tracks 8132
segment_info 0
annotations 133228
datasets 2


In [13]:
# annotations category_id
ann_ids=[]
for an in tqdm(molajson['annotations']):
    ann_ids.append(an['id'])
print(len(ann_ids))

#TEST duplicates v3 -faster
u, c = np.unique(np.array(ann_ids), return_counts=True)
duplicates_l= u[c > 1].tolist()
print(len(duplicates_l))

100%|███████████████████████████████████████████████████████████████████| 133228/133228 [00:00<00:00, 1017127.63it/s]


133228
0


In [14]:
# categories name and id
cat_l=[]
for c in molajson['categories']:
    cat_l.append(c['name'])

In [15]:
print(len(cat_l))
print(cat_l)


353
['aggressive', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'aerosol_can', 'apricot', 'armchair', 'atomizer', 'ax', 'baby_buggy', 'bagpipe', 'ball', 'balloon', 'barbell', 'baseball_bat', 'baseball_glove', 'basketball_hoop'