In [1]:
import json
import os

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from pycocotools import mask as maskUtils
from tqdm.notebook import tqdm

In [2]:
# CSV with the rows converted below. The code in this notebook reads the
# columns: id, annotation, width, height, cell_type and fold.
DATA_PATH = "../inputs/train_val_gkfold_split.csv"

In [3]:
def rle2mask(rle, img_w, img_h):
    """Decode a run-length-encoded string into a binary mask.

    The string holds whitespace-separated ``start length`` pairs. Starts are
    1-based offsets into the row-major flattened image
    (e.g. ``"3 1 10 2"`` marks flat pixels 2, 9 and 10).

    Args:
        rle: RLE string. An empty or whitespace-only string yields an
            all-zero mask.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        A Fortran-ordered ``np.uint8`` array of shape ``(img_h, img_w)`` with
        1 on the encoded pixels and 0 elsewhere.

    Raises:
        ValueError: If the string holds an odd number of values or a token
            that is not an integer.
        IndexError: If a run starts before the first pixel, has a negative
            length, or extends past the last pixel.
    """
    width, height = int(img_w), int(img_h)
    n_pixels = width * height
    msk_img = np.zeros(n_pixels, dtype = np.uint8)

    tokens = rle.split()
    if len(tokens) % 2:
        raise ValueError("RLE string must contain an even number of values (start/length pairs)")

    if tokens:
        ## transforming the string into an array of shape (N, 2)
        pairs = np.fromiter(map(int, tokens), dtype = np.int64, count = len(tokens)).reshape((-1, 2))
        starts = pairs[:, 0] - 1  # 1-based -> 0-based
        lengths = pairs[:, 1]
        stops = starts + lengths

        # Slice assignment clips silently, so reject bad runs explicitly.
        if (starts < 0).any() or (lengths < 0).any() or (stops > n_pixels).any():
            raise IndexError("RLE run falls outside the image bounds")

        ## Building the binary mask: write each run directly, no index expansion
        for start, stop in zip(starts.tolist(), stops.tolist()):
            msk_img[start:stop] = 1

    msk_img = msk_img.reshape((height, width))
    msk_img = np.asfortranarray(msk_img) ## This is important so pycocotools can handle this object

    return msk_img

In [4]:
def annotate(idx, row, cat_ids):
    """Turn one annotation row into a COCO annotation dict.

    Args:
        idx: Value stored as the annotation's ``id``.
        row: Mapping-like record providing ``annotation`` (RLE string),
            ``width``, ``height``, ``id`` and ``cell_type``.
        cat_ids: Mapping from cell-type name to category id.

    Returns:
        A dict with the keys ``segmentation``, ``bbox``, ``area``,
        ``image_id``, ``category_id``, ``iscrowd`` and ``id``.
    """
    # Decode the source RLE into a binary mask, then re-encode it with pycocotools.
    binary_mask = rle2mask(row['annotation'], row['width'], row['height'])
    coco_rle = maskUtils.encode(binary_mask)

    # The encoder returns 'counts' as bytes; decode it to text in place.
    coco_rle['counts'] = coco_rle['counts'].decode('utf-8')

    return {
        'segmentation': coco_rle,
        'bbox': maskUtils.toBbox(coco_rle).astype(int).tolist(),
        'area': maskUtils.area(coco_rle).item(),
        'image_id': row['id'],
        'category_id': cat_ids[row['cell_type']],
        'iscrowd': 0,
        'id': idx,
    }

In [5]:
def coco_structure(df, workers = 4, cat_ids = None):
    """Build a COCO-style dataset dict from an annotation table.

    Args:
        df: DataFrame with one row per annotated instance and the columns
            ``id`` (image id), ``width``, ``height``, ``cell_type`` and
            ``annotation`` (RLE string).
        workers: Number of joblib workers used to build the annotations.
        cat_ids: Optional mapping from cell-type name to category id. When
            omitted, ids 1..K are assigned to the alphabetically sorted
            cell-type names found in ``df``, so every subset that contains
            the same cell types gets the same mapping regardless of row order.

    Returns:
        A dict with the keys ``categories``, ``images`` and ``annotations``.
    """
    ## Building the header
    if cat_ids is None:
        # Sorting removes the dependence on which cell type appears first
        # in this particular subset of rows.
        cat_ids = {name: cat_id for cat_id, name in enumerate(sorted(df.cell_type.unique()), start = 1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    # Only the image dimensions are needed per image; skip the annotation strings.
    image_dims = df.groupby('id')[['width', 'height']].first()
    images = [
        {
            'id': rec.Index,
            # Plain ints keep the header JSON-serializable.
            'width': int(rec.width),
            'height': int(rec.height),
            'file_name': f'train/{rec.Index}.png',
        }
        for rec in image_dims.itertuples()
    ]

    ## Building the annotations
    # Annotation ids are positional and start at 1: pycocotools' COCOeval
    # stores the matched annotation id and tests it for truthiness, so an
    # id of 0 would be read as "unmatched".
    rows = (row for _, row in df.iterrows())
    annotations = Parallel(n_jobs=workers)(
        delayed(annotate)(ann_id, row, cat_ids)
        for ann_id, row in enumerate(tqdm(rows, total = len(df)), start = 1)
    )

    return {'categories':cats, 'images':images, 'annotations':annotations}

In [6]:
def create_coco_json(df, w_fold, save_dir):
    """Write the train and validation COCO annotation files for one fold.

    Rows whose ``fold`` equals ``w_fold`` form the validation set; all other
    rows form the training set. The results are written to
    ``annotations_train_f{w_fold}.json`` and ``annotations_valid_f{w_fold}.json``
    inside ``save_dir``.

    Args:
        df: Annotation table with a ``fold`` column, plus the columns
            ``coco_structure`` reads.
        w_fold: Fold value used as the validation split.
        save_dir: Output directory; it is created if it does not exist.
    """
    # Create the directory first so a missing path fails (or is fixed) before
    # the expensive conversion rather than after it.
    os.makedirs(save_dir, exist_ok=True)

    train_coco_json = coco_structure(df[df["fold"] != w_fold])
    valid_coco_json = coco_structure(df[df["fold"] == w_fold])

    outputs = (('train', train_coco_json), ('valid', valid_coco_json))
    for split_name, coco_json in outputs:
        out_path = os.path.join(save_dir, f'annotations_{split_name}_f{w_fold}.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(coco_json, f, ensure_ascii=True, indent=4)

In [7]:
# Load the annotation table from DATA_PATH.
df = pd.read_csv(DATA_PATH)

In [8]:
# Write the COCO annotation files into ../inputs, with fold 0 as the
# validation split and every other fold as training data.
create_coco_json(df, w_fold=0, save_dir="../inputs")

  0%|          | 0/58869 [00:00<?, ?it/s]

  0%|          | 0/14716 [00:00<?, ?it/s]