# Filter Train and Validation Dataset

In [1]:
import json
from sahi.utils.coco import Coco, CocoAnnotation, CocoCategory, CocoImage
from sahi.utils.file import load_json, save_json
from sahi.utils.coco import create_coco_dict
import copy
import os

def update_categories(coco, desired_name2id, update_image_filenames=False):
    """
    Rearranges category mapping of given COCO object based on given desired_name2id.
    Can also be used to filter some of the categories: annotations whose category
    name is not a key of desired_name2id are not carried over.

    The update happens in place: a fresh Coco object is built and its class and
    attribute dict are then assigned onto ``coco``. Nothing is returned.

    Args:
        coco: Coco
            Dataset to update (modified in place).
        desired_name2id: dict
            {"big_vehicle": 1, "car": 2, "human": 3}
        update_image_filenames: bool
            If True, updates coco image file_names with absolute file paths
            (relative file names are joined onto the absolute coco.image_dir).

    Raises:
        ValueError: if update_image_filenames is True, an image has a relative
            file name, and coco.image_dir is not set.
        KeyError: if an annotation references a category id that is not listed
            in coco.categories.
    """
    # current category id -> desired category id; None marks a category that
    # is not included in desired_name2id and must be dropped
    currentid2desiredid_mapping = {
        coco_category.id: desired_name2id.get(coco_category.name)
        for coco_category in coco.categories
    }

    updated_coco = Coco(
        name=coco.name,
        image_dir=coco.image_dir,
        remapping_dict=coco.remapping_dict,
        ignore_negative_samples=coco.ignore_negative_samples,
    )

    # add updated categories
    for name, category_id in desired_name2id.items():
        updated_coco.add_category(CocoCategory(id=category_id, name=name, supercategory=name))

    # resolve the image directory once; it is only needed for relative file names
    abs_image_dir = None
    if update_image_filenames and coco.image_dir is not None:
        abs_image_dir = os.path.abspath(coco.image_dir)

    # add updated images & annotations
    # (deepcopy so that re-labelling annotations below does not touch the
    # original image objects before the final overwrite)
    for coco_image in copy.deepcopy(coco.images):
        updated_coco_image = CocoImage.from_coco_image_dict(coco_image.json)

        # update filename to abspath
        file_name_is_abspath = os.path.abspath(coco_image.file_name) == coco_image.file_name
        if update_image_filenames and not file_name_is_abspath:
            if abs_image_dir is None:
                raise ValueError(
                    "update_image_filenames=True requires coco.image_dir to be set "
                    "when image file names are relative"
                )
            updated_coco_image.file_name = os.path.join(abs_image_dir, coco_image.file_name)

        # update annotations
        for coco_annotation in coco_image.annotations:
            desired_category_id = currentid2desiredid_mapping[coco_annotation.category_id]
            # keep only annotations whose category is present in desired_name2id
            if desired_category_id is not None:
                coco_annotation.category_id = desired_category_id
                updated_coco_image.add_annotation(coco_annotation)
        updated_coco.add_image(updated_coco_image)

    # overwrite instance so the caller's reference now holds the updated dataset
    coco.__class__ = updated_coco.__class__
    coco.__dict__ = updated_coco.__dict__
        
## Final filter function
def filter_and_save_dataset(coco_json_path,
                            desired_name2id=None,
                            name_of_new_coco_json=None):
    """
    Load a COCO json, keep only the desired categories, and save the result.

    The dataset is loaded with sahi, its categories are remapped/filtered with
    the module-level ``update_categories`` helper, the result is written to
    ``name_of_new_coco_json``, and the written file is loaded back so its
    statistics can be printed.

    Args:
        coco_json_path: str
            Path of the COCO annotation json to filter.
        desired_name2id: dict
            Category names to keep mapped to their new ids,
            e.g. {"Fixed-wing Aircraft": 1, "Cargo Plane": 2}. Required.
        name_of_new_coco_json: str
            Path the filtered COCO json is written to. Required.

    Raises:
        ValueError: if desired_name2id or name_of_new_coco_json is not given.
    """
    # fail fast, before the (slow) dataset load, instead of crashing later on
    # None.keys() / open(None)
    if desired_name2id is None:
        raise ValueError("desired_name2id must be provided, e.g. {'car': 1}")
    if name_of_new_coco_json is None:
        raise ValueError("name_of_new_coco_json must be provided")

    print("Loading {}...".format(coco_json_path))
    coco = Coco.from_coco_dict_or_path(coco_json_path)
    print("Done!")

    # NOTE (Andrew 7.13.2023): the module-level update_categories helper is used
    # here rather than the coco.update_categories method; the original note
    # explaining why was left unfinished.
    update_categories(coco, desired_name2id=desired_name2id)
    coco.calculate_stats()

    # ignore_negative_samples=True is passed through to sahi; presumably it
    # leaves out images that have no annotations after filtering -- confirm
    # against the sahi documentation.
    di = create_coco_dict(coco.images, coco.categories, ignore_negative_samples=True)
    # store the categories in their json form in the output dict
    di['categories'] = coco.json_categories

    print("Saving {}...".format(name_of_new_coco_json))
    # the context manager guarantees the file is flushed and closed before it
    # is read back below
    with open(name_of_new_coco_json, 'w') as json_file:
        json.dump(di, json_file)

    # reload the written file to verify it parses and to report its statistics
    coco_u = Coco.from_coco_dict_or_path(name_of_new_coco_json)
    print("Saved!")
    print("Coco Stats:")
    print(coco_u.stats)


    
    

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Filter the training split down to the two aircraft categories.
# Input annotation file (absolute path).
coco_json_path='/run/determined/workdir/shared_fs/01 - Users/andrew.mendez/e2e_blogposts/ngc_blog/xview_dataset/train_images_rgb_no_neg/train_640_02.json'
# Category names to keep, mapped to the ids they get in the filtered file.
desired_name2id={'Fixed-wing Aircraft':1,'Cargo Plane':2}
# Output file; this path is relative, so it resolves against the current working directory.
name_of_new_coco_json = 'xview_dataset/train_images_rgb_no_neg/train_640_02_filtered.json'
filter_and_save_dataset(coco_json_path=coco_json_path,
                        desired_name2id=desired_name2id,
                        name_of_new_coco_json=name_of_new_coco_json)

In [2]:
# Filter the validation split down to the two aircraft categories.
# Input annotation file (absolute path).
coco_json_path='/run/determined/workdir/shared_fs/01 - Users/andrew.mendez/e2e_blogposts/ngc_blog/xview_dataset/val_images_rgb_no_neg/val_640_02.json'
# Category names to keep, mapped to the ids they get in the filtered file.
desired_name2id={'Fixed-wing Aircraft':1,'Cargo Plane':2}
# Output file; this path is relative, so it resolves against the current working directory.
name_of_new_coco_json = 'xview_dataset/val_images_rgb_no_neg/val_640_02_filtered.json'
filter_and_save_dataset(coco_json_path=coco_json_path,
                        desired_name2id=desired_name2id,
                        name_of_new_coco_json=name_of_new_coco_json)

Loading /run/determined/workdir/shared_fs/01 - Users/andrew.mendez/e2e_blogposts/ngc_blog/xview_dataset/val_images_rgb_no_neg/val_640_02.json...
indexing coco dataset annotations...


Loading coco annotations: 100%|██████████| 4419/4419 [00:14<00:00, 299.21it/s]


Done!
Saving xview_dataset/val_images_rgb_no_neg/val_640_02_filtered.json...
indexing coco dataset annotations...


Loading coco annotations: 100%|██████████| 185/185 [00:00<00:00, 6106.69it/s]

Saved!
Coco Stats:
{'num_images': 185, 'num_annotations': 456, 'num_categories': 2, 'num_negative_images': 0, 'num_images_per_category': {'Cargo Plane': 175, 'Fixed-wing Aircraft': 13}, 'num_annotations_per_category': {'Fixed-wing Aircraft': 61, 'Cargo Plane': 395}, 'min_num_annotations_in_image': 1, 'max_num_annotations_in_image': 19, 'avg_num_annotations_in_image': 2.464864864864865, 'min_annotation_area': 493, 'max_annotation_area': 53460, 'avg_annotation_area': 9073.552631578947, 'min_annotation_area_per_category': {'Fixed-wing Aircraft': 493, 'Cargo Plane': 550}, 'max_annotation_area_per_category': {'Fixed-wing Aircraft': 27354, 'Cargo Plane': 53460}}



