In [1]:
import json 

import pandas as pd

from shapely.geometry import Point
from shapely.geometry import Polygon

from CCAgT_utils.converters import CCAgT
from CCAgT_utils.Categories import Helper as Categories_Helper

# Verify and Preprocess the dataset

Create some synthetic samples in which some data lies outside the image size, some have `normal` labels, and some overlap each other.

In [2]:
# Synthetic annotations, one (image_name, geometry, category_id) tuple per row.
# The rows are expanded into a list of dicts before building the DataFrame,
# so the column order stays image_name, geometry, category_id.
sample_columns = ('image_name', 'geometry', 'category_id')
sample_rows = [
    # Category 3 points, plus two category 1 polygons on image A_yyy.
    ('A_xxx', Point(1, 1), 3),
    ('A_yyy', Point(10, 10), 3),
    ('A_yyy', Point(20, 20), 3),
    ('A_yyy', Point(30, 30), 3),
    ('A_yyy', Point(40, 40), 3),
    ('A_yyy', Point(50, 50), 3),
    ('A_yyy', Polygon([(40, 40), (50, 50), (50, 40)]), 1),
    ('A_yyy', Polygon([(40, 40), (240, 240), (240, 40)]), 1),
    # The second B_yyy point sits at (2000, 2000), far from the other samples.
    ('B_yyy', Point(10, 10), 3),
    ('B_yyy', Point(2000, 2000), 3),
    # Category 5 polygons on image C_xyz.
    ('C_xyz', Polygon([(40, 40), (240, 240), (240, 40)]), 5),
    ('C_xyz', Polygon([(30, 30), (230, 230), (230, 30)]), 5),
    ('C_xyz', Polygon([(200, 200), (350, 350), (350, 200)]), 5),
    ('C_xyz', Polygon([(340, 340), (340, 350), (350, 360)]), 5),
    ('C_xyz', Polygon([(380, 360), (380, 355), (395, 360)]), 5),
    ('C_xyz', Polygon([(400, 400), (500, 500), (500, 400)]), 5),
]
data = [dict(zip(sample_columns, row)) for row in sample_rows]

df = pd.DataFrame(data)
df

Unnamed: 0,image_name,geometry,category_id
0,A_xxx,POINT (1 1),3
1,A_yyy,POINT (10 10),3
2,A_yyy,POINT (20 20),3
3,A_yyy,POINT (30 30),3
4,A_yyy,POINT (40 40),3
5,A_yyy,POINT (50 50),3
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1
8,B_yyy,POINT (10 10),3
9,B_yyy,POINT (2000 2000),3


## Init the class of annotations

In [3]:
# Wrap the sample DataFrame in the annotations class; the following cells call its methods.
ccagt_ann = CCAgT.CCAgT_Annotations(df)

## Find overlapped annotations for some specific category

In the real data, some overlapped nuclei have been annotated as separate instances, but we need just one object for each `group` of overlapped objects.

In [4]:
# Search for overlapping annotations only within category 5.
# The displayed result maps an image name to a list of sets of row indexes.
overlapping_annotations = ccagt_ann.find_overlapping_annotations(categories_id={5})
overlapping_annotations

{'C_xyz': [{10, 11, 12, 13}]}

In [5]:
# Merge the overlapping groups reported by the previous cell; `df` is rebound to the returned frame.
# NOTE(review): presumably each group is replaced by the union of its geometries — confirm in CCAgT_utils.
df = ccagt_ann.union_geometries(overlapping_annotations)
df

Unnamed: 0,image_name,geometry,category_id
0,A_xxx,POINT (1 1),3
1,A_yyy,POINT (10 10),3
2,A_yyy,POINT (20 20),3
3,A_yyy,POINT (30 30),3
4,A_yyy,POINT (40 40),3
5,A_yyy,POINT (50 50),3
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1
8,B_yyy,POINT (10 10),3
9,B_yyy,POINT (2000 2000),3


## Verify if a category intersects with others

This verifies whether each base geometry (selected by `base_categories_id`) intersects with any target geometry (selected by `target_categories_id`).

In [6]:
# Base geometries are category 1; target geometries are categories 2 and 3.
# The displayed frame holds the base rows plus a boolean `has_intersecting` column.
df_base_intersects_target = ccagt_ann.verify_if_intersects(base_categories_id={1}, target_categories_id={2, 3})
df_base_intersects_target

Unnamed: 0,image_name,geometry,category_id,has_intersecting
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,True
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,True


In [7]:
# Sanity check: category 1 is compared against itself, so every value in
# `has_intersecting` needs to be True.
ccagt_ann.verify_if_intersects(base_categories_id={1}, target_categories_id={1})

Unnamed: 0,image_name,geometry,category_id,has_intersecting
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,True
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,True


In [8]:
# Collect the indexes of the base annotations flagged as not intersecting any
# target, then remove those rows from `df` in place.
no_intersection = df_base_intersects_target['has_intersecting'].eq(False)
index_to_drop = df_base_intersects_target.loc[no_intersection].index.to_numpy()
df.drop(index_to_drop, inplace=True)
index_to_drop

array([], dtype=int64)

## Define the geometry type for each annotation

In [9]:
# Store the geometry type name of each annotation (e.g. 'Point', 'Polygon') in a new column.
df['geo_type'] = ccagt_ann.geometries_type()
df

Unnamed: 0,image_name,geometry,category_id,geo_type
0,A_xxx,POINT (1 1),3,Point
1,A_yyy,POINT (10 10),3,Point
2,A_yyy,POINT (20 20),3,Point
3,A_yyy,POINT (30 30),3,Point
4,A_yyy,POINT (40 40),3,Point
5,A_yyy,POINT (50 50),3,Point
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon
8,B_yyy,POINT (10 10),3,Point
9,B_yyy,POINT (2000 2000),3,Point


## Transform the Satellite point annotations to Polygons

In [10]:
# Select the category 3 annotations that are still stored as points.
is_satellite_point = (df['category_id'] == 3) & (df['geo_type'] == 'Point')
sat_series = df.loc[is_satellite_point, 'geometry']

# Replace those points with the polygons returned by the annotations helper,
# then refresh `geo_type` so it reflects the new geometries.
df.loc[is_satellite_point, 'geometry'] = ccagt_ann.satellite_point_to_polygon(sat_series)
df['geo_type'] = ccagt_ann.geometries_type()

df

Unnamed: 0,image_name,geometry,category_id,geo_type
0,A_xxx,"POLYGON ((6.352372348458314 1, 5.9449472631200...",3,Polygon
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon
9,B_yyy,"POLYGON ((2005.352372348458 2000, 2004.9449472...",3,Polygon


## Verify and fit the geometries to the image boundaries

In [11]:
# Replace the geometries with the versions fitted to the image boundary.
# In the recorded output the geometry of row 9 (built around point 2000, 2000) comes back empty.
# NOTE(review): the image size used for the boundary is not visible in this notebook — confirm the default.
df['geometry'] = ccagt_ann.fit_geometries_to_image_boundary()
df

Unnamed: 0,image_name,geometry,category_id,geo_type
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon
9,B_yyy,,3,Polygon


Annotations that do not fit the image boundary will have the "geometry" turned into nan. To remove these annotations just do:

In [12]:
# Drop, in place, every row whose `geometry` is missing after the boundary fit.
df.dropna(axis=0, subset=['geometry'], inplace=True)


## Compute the area of the annotations (geometries)

In [13]:
# Add an `area` column holding the area of each annotation geometry.
df['area'] = ccagt_ann.geometries_area()
df

Unnamed: 0,image_name,geometry,category_id,geo_type,area
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon,50.0
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582
10,C_xyz,"POLYGON ((380 360, 380 355, 395 360, 380 360))",5,Polygon,37.5


## Set/create the Image IDs based on the `image_name`

In [14]:
# Derive a numeric `image_id` from `image_name`; in the displayed output,
# rows that share an image name share the same id.
df['image_id'] = ccagt_ann.generate_ids(df['image_name'])
df

Unnamed: 0,image_name,geometry,category_id,geo_type,area,image_id
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978,1
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,2
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582,2
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582,2
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582,2
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582,2
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon,50.0,2
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0,2
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,3
10,C_xyz,"POLYGON ((380 360, 380 355, 395 360, 380 360))",5,Polygon,37.5,4


## Verify the slide IDs based on the `image_name`

In [15]:
# Add the slide identifier of each annotation.
# NOTE(review): in the displayed output it matches the part of `image_name`
# before the underscore — confirm the exact parsing rule in CCAgT_utils.
df['slide_id'] = ccagt_ann.get_slide_id()
df

Unnamed: 0,image_name,geometry,category_id,geo_type,area,image_id,slide_id
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978,1,A
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,2,A
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582,2,A
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582,2,A
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582,2,A
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582,2,A
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon,50.0,2,A
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0,2,A
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,3,B
10,C_xyz,"POLYGON ((380 360, 380 355, 395 360, 380 360))",5,Polygon,37.5,4,C


## Verify that `df` still references `ccagt_ann.df`

In [16]:
# Display the frame held by the annotations object so it can be compared with `df` shown above.
ccagt_ann.df

Unnamed: 0,image_name,geometry,category_id,geo_type,area,image_id,slide_id
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978,1,A
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,2,A
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582,2,A
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582,2,A
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582,2,A
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582,2,A
6,A_yyy,"POLYGON ((40 40, 50 50, 50 40, 40 40))",1,Polygon,50.0,2,A
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0,2,A
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,3,B
10,C_xyz,"POLYGON ((380 360, 380 355, 395 360, 380 360))",5,Polygon,37.5,4,C


## Init the `Categories_Helper`

In [17]:
# Load the dataset metadata file that describes the categories.
helper_path = '../../data/samples/CCAgT_dataset_metadata.json'
# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(helper_path, 'r', encoding='utf-8') as hf:
    dataset_helper = json.load(hf)

# Keep only the list of category descriptions; show the first entry and the
# number of categories.
categories_helpper = dataset_helper['categories']
categories_helpper[0], len(categories_helpper)

({'color': [21, 62, 125],
  'name': 'Nucleus',
  'id': 1,
  'labelbox_schemaId': '<Unique ID for category Nucleus>',
  'minimal_area': 500,
  'supercategory': ''},
 7)

In [18]:
# Build the categories helper from the list of category dicts loaded above.
ccagt_helper = Categories_Helper(categories_helpper)

In [19]:
# Display the mapping from category id to category name.
ccagt_helper.name_by_category_id

{1: 'Nucleus',
 2: 'Cluster',
 3: 'Satellite',
 4: 'Nucleus_out_of_focus',
 5: 'Overlapped_Nuclei',
 6: 'non-viable_nucleus',
 7: 'Leukocyte_Nucleus',
 0: 'background'}

## Remove annotations based on the `minimal_area` from the `Categories_Helper` file

In [20]:
# Per-category minimal area, used by the next cell as the removal threshold.
# NOTE(review): presumably built from each category's `minimal_area` field — confirm in Categories.Helper.
min_area = ccagt_helper.min_area_by_category_id

In [21]:
# Remove annotations based on the per-category minimal area, skipping category 3
# (named 'Satellite' in the helper output); `df` is rebound to the returned frame.
df = ccagt_ann.delete_by_area(min_area, ignore_categories={3})
df

ATTENTION | 1 items has been removed from category with id 1
ATTENTION | 1 items has been removed from category with id 5


Unnamed: 0,image_name,geometry,category_id,geo_type,area,image_id,slide_id
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978,1,A
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,2,A
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582,2,A
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582,2,A
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582,2,A
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582,2,A
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,3,B
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0,2,A
11,C_xyz,"POLYGON ((400 400, 500 500, 500 400, 400 400))",5,Polygon,5000.0,4,C
12,C_xyz,"POLYGON ((29.80490967798386 29.01921471959677,...",5,Polygon,33537.828282,4,C


## save and load to a parquet file

In [22]:
# Destination of the serialized annotations.
# NOTE(review): it is not visible here whether `to_parquet` creates the `out`
# directory when it is missing — confirm before running on a fresh checkout.
filename = '../../data/samples/out/CCAgT_example.parquet.gzip'

ccagt_ann.to_parquet(filename)

In [23]:
# Read the file written by the previous cell and display the loaded frame,
# to compare it with the frame that was saved.
test_ccagt = CCAgT.read_parquet(filename)

test_ccagt.df

Unnamed: 0,image_name,geometry,category_id,geo_type,area,image_id,slide_id
0,A_xxx,"POLYGON ((6.153459981078656 0, 0 0, 0 6.153459...",3,Polygon,33.431978,1,A
1,A_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,2,A
2,A_yyy,"POLYGON ((25.35237234845831 20, 24.94494726312...",3,Polygon,87.704582,2,A
3,A_yyy,"POLYGON ((35.35237234845832 30, 34.94494726312...",3,Polygon,87.704582,2,A
4,A_yyy,"POLYGON ((45.35237234845832 40, 44.94494726312...",3,Polygon,87.704582,2,A
5,A_yyy,"POLYGON ((55.35237234845832 50, 54.94494726312...",3,Polygon,87.704582,2,A
8,B_yyy,"POLYGON ((15.35237234845831 10, 14.94494726312...",3,Polygon,87.704582,3,B
7,A_yyy,"POLYGON ((40 40, 240 240, 240 40, 40 40))",1,Polygon,20000.0,2,A
11,C_xyz,"POLYGON ((400 400, 500 500, 500 400, 400 400))",5,Polygon,5000.0,4,C
12,C_xyz,"POLYGON ((29.80490967798386 29.01921471959677,...",5,Polygon,33537.828282,4,C


## Annotations to Object Detection COCO format

### Running single core

Just reset the index to ensure that the annotations will have unique and sequential ids

In [24]:
# Discard the current (non-sequential) index and shift the fresh one by one,
# so the index used for the COCO conversion starts at 1 instead of 0.
df_to_coco = df.reset_index(drop=True)
df_to_coco.index += 1

In [25]:
# Convert the re-indexed frame with the single-core converter, passing decimals=2.
# Show the number of annotations produced and the first one.
coco_ann_v1 = CCAgT.single_core_to_OD_COCO(df_to_coco, decimals=2)
len(coco_ann_v1), coco_ann_v1[0]

(10,
 {'id': 1,
  'image_id': 1,
  'category_id': 3,
  'bbox': [0.0, 0.0, 6.35, 6.35],
  'segmentation': [[6.15,
    0.0,
    0.0,
    0.0,
    0.0,
    6.15,
    1.0,
    6.35,
    3.05,
    5.94,
    4.78,
    4.78,
    5.94,
    3.05,
    6.35,
    1.0,
    6.15,
    0.0]],
  'area': 33.43,
  'iscrowd': 0})

### Running in one process per core (faster, because it runs in parallel)

In [26]:
# Same conversion through the annotations object; per the recorded output it
# reports the number of cores and the annotations handled per core.
coco_ann_v2 = ccagt_ann.to_OD_COCO(decimals=2)

len(coco_ann_v2), coco_ann_v2[0]

Number of cores: 12, annotations per core: 1


(10,
 {'id': 1,
  'image_id': 1,
  'category_id': 3,
  'bbox': [0.0, 0.0, 6.35, 6.35],
  'segmentation': [[6.15,
    0.0,
    0.0,
    0.0,
    0.0,
    6.15,
    1.0,
    6.35,
    3.05,
    5.94,
    4.78,
    4.78,
    5.94,
    3.05,
    6.35,
    1.0,
    6.15,
    0.0]],
  'area': 33.43,
  'iscrowd': 0})

In [27]:
# Check that the single-core and the parallel conversions produced equal results.
coco_ann_v1 == coco_ann_v2

True