# Data exploration of LIVECell

In [70]:
import os,inspect
import pathlib2
import pandas as pd
import numpy as np
import json

In [71]:
## Helper functions

def check_noneffective_ids(df):
    """
    Report the rows of a dataframe that contain at least one missing value.

    A one-line summary is printed in both cases.

    :param df: input dataframe
    :return: a dataframe with only the rows holding at least one NaN, or None
        when no row has a missing value
    """

    # One boolean per row: True when any column of that row is missing.
    nan_row_mask = df.isna().any(axis=1)
    n_bad_rows = int(nan_row_mask.sum())

    if n_bad_rows == 0:
        print("the rows available are all effective, i.e., without NaNs.")
        return None

    print(f"there are {n_bad_rows} non effective rows.")
    return df[nan_row_mask]

def check_meta_images_ds(df):
    """
    Placeholder, not implemented yet.

    Judging by the name it is presumably meant to validate the image metadata
    dataframe (TODO confirm); for now the argument is ignored.

    :param df: input dataframe (currently unused)
    :return: None
    """

    return None

def check_meta_annots_ds(df):
    """
    Placeholder, not implemented yet.

    Judging by the name it is presumably meant to validate the annotation
    metadata dataframe (TODO confirm); for now the argument is ignored.

    :param df: input dataframe (currently unused)
    :return: None
    """

    return None

## Define relevant paths

In [2]:
# All paths are derived from the working directory of the kernel; the project
# root is taken to be its parent directory.
current_dir = pathlib2.Path.cwd()
project_dir = current_dir.parent

print("currentdir: ", current_dir)
print("project dir: ", project_dir)

currentdir:  F:\Kaggle\sartorius_cell_instance_segmentation\code\rkx_cell_is\analytics
parentdir:  F:\Kaggle\sartorius_cell_instance_segmentation\code\rkx_cell_is


In [11]:
# Root of all datasets and of the LIVECell dataset in particular.
ds_path = project_dir / 'dataset'
print("dataset dir: ", ds_path)
livecell_ds_path = ds_path / 'LIVECell_dataset_2021'
print("livecell dataset dir: ", livecell_ds_path)

# iterdir() yields entries in arbitrary order, so the sub-directories are sorted
# to make the (annotations, images) unpacking deterministic across platforms.
# NOTE(review): assumes the images directory name sorts after the annotations
# directory name -- confirm against the dataset layout on disk.
livecell_subdirs = sorted(x for x in livecell_ds_path.iterdir() if x.is_dir())
if len(livecell_subdirs) != 2:
    raise ValueError(
        f"expected exactly two sub-directories (annotations, images) in {livecell_ds_path}, "
        f"found {len(livecell_subdirs)}: {[x.name for x in livecell_subdirs]}"
    )
livecell_ds_annot_path, livecell_ds_imgs_path = livecell_subdirs

# Metadata json files, one per split.
livecell_train_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_train.json'
livecell_val_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_val.json'
livecell_test_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_test.json'

# Image folders; training and validation images share one folder.
livecell_train_val_img_path = livecell_ds_imgs_path / 'livecell_train_val_images'
livecell_test_img_path = livecell_ds_imgs_path / 'livecell_test_images'

dataset dir:  F:\Kaggle\sartorius_cell_instance_segmentation\code\rkx_cell_is\dataset
livecell dataset dir:  F:\Kaggle\sartorius_cell_instance_segmentation\code\rkx_cell_is\dataset\LIVECell_dataset_2021
the json file of training data meta info is: F:\Kaggle\sartorius_cell_instance_segmentation\code\rkx_cell_is\dataset\LIVECell_dataset_2021\annotations\LIVECell\livecell_coco_train.json
dict_keys(['images', 'annotations', 'categories', 'info', 'licenses'])


## Training metadata exploration

In [35]:
# Read the json file with the training metadata into a dict.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default of open().
with open(livecell_train_meta_path, encoding='utf-8') as json_train_data:
    train_data = json.load(json_train_data)


print(train_data.keys(), '\n\n')
print('the type of images container: ', type(train_data['images']))
print('the first element of <list> images: \n', train_data['images'][0], '\n\n')
print('the type of annotations container: ', type(train_data['annotations']))
print('the first ten keys of <dict> annotations: \n', list(train_data['annotations'].keys())[:10])
# Fetch the first (key, value) pair once, without materialising every item of
# the annotation dict as a list.
first_annot_key, first_annot_value = next(iter(train_data['annotations'].items()))
print(f"the first item has the key: \n {first_annot_key} \nand the value: \n"
      f"{first_annot_value} \n\n")
print('the type of info container: ', type(train_data['info']))
print('the keys of <dict> info: \n', train_data['info'].keys(), '\n\n')
print('the type of licenses container: ', type(train_data['licenses']))
print('the first element of <list> licenses: \n', train_data['licenses'][0])

dict_keys(['images', 'annotations', 'categories', 'info', 'licenses']) 


the type of images container:  <class 'list'>
the first element of <list> images: 
 {'id': 1, 'width': 704, 'height': 520, 'file_name': 'BT474_Phase_A3_2_00d04h00m_3.tif', 'original_filename': 'BT474_Phase_A3_2_00d04h00m_3.png', 'url': 'https://darwin.v7labs.com/api/images/870028/original'} 


the type of annotations container:  <class 'dict'>
the first ten keys of <dict> annotations: 
 ['2', '3', '4', '5', '6', '7', '8', '9', '10', '12']
the first item has the key: 
 2 
and the value: 
{'id': 2, 'image_id': 1, 'category_id': 1, 'segmentation': [[288.02, 305.63, 286.01, 298.87, 286.01, 295.4, 288.02, 290.1, 293.86, 287.91, 297.51, 287.73, 300.44, 289.01, 304.27, 292.48, 304.64, 295.04, 305.18, 297.77, 305.18, 300.7, 303.91, 302.52, 301.17, 305.26, 297.33, 307.45, 294.59, 307.45, 290.58, 308.0]], 'area': 307.4786000000313, 'bbox': [286.01, 287.73, 19.170000000000016, 20.269999999999982], 'iscrowd': 0} 


the type 

In [40]:
# More about the annotation dict: check that every key matches the 'id' stored
# in its value.

print(f"There are {len(train_data['annotations'])} keys in the annotation dict.")
keys_arr = np.array(list(train_data['annotations'].keys())).astype(int)
annot_values = list(train_data['annotations'].values())
id_list = [x['id'] for x in annot_values]
id_arr = np.array(id_list)
# Exact comparison on purpose: np.allclose applies a relative tolerance
# (rtol=1e-05 by default), so for large integer IDs it would accept values
# that differ by several units.
print(np.array_equal(keys_arr, id_arr), ': The keys_arr out of keys of the annotation dict and id_arr out of the IDs in the '
                                        'annotation dict is elementwise equal.\n')
print('The first 100 IDs in the annotation dict:')
print(id_arr[:100])

There are 1018576 keys in the annotation dict.
True : The keys_arr out of keys of the annotation dict and id_arr out of the IDs in the annotation dict is elementwise equal.
The first 100 IDs in the annotation dict:
[  2   3   4   5   6   7   8   9  10  12  13  14  15  16  17  18  19  20
  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56
  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74
  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102]


### Metadata: images

In [54]:
# One record per image; each record becomes one row of the dataframe.
train_image_records = train_data['images']
df_livecell_train_imgs_meta = pd.DataFrame.from_dict(train_image_records)
df_livecell_train_imgs_meta.head()

Unnamed: 0,id,width,height,file_name,original_filename,url
0,1,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,BT474_Phase_A3_2_00d04h00m_3.png,https://darwin.v7labs.com/api/images/870028/or...
1,133,704,520,BT474_Phase_C3_2_02d12h00m_4.tif,BT474_Phase_C3_2_02d12h00m_4.png,https://darwin.v7labs.com/api/images/37512/ori...
2,323,704,520,BT474_Phase_C3_1_01d16h00m_3.tif,BT474_Phase_C3_1_01d16h00m_3.png,https://darwin.v7labs.com/api/images/108155/or...
3,498,704,520,BT474_Phase_C3_1_02d16h00m_4.tif,BT474_Phase_C3_1_02d16h00m_4.png,https://darwin.v7labs.com/api/images/108172/or...
4,741,704,520,BT474_Phase_C3_1_04d00h00m_3.tif,BT474_Phase_C3_1_04d00h00m_3.png,https://darwin.v7labs.com/api/images/921711/or...


In [58]:
# Look for rows with missing values in the image metadata; the helper prints a
# summary and returns the offending rows (or None when there are none).
check_noneffective_ids(df_livecell_train_imgs_meta)

the rows available are all effective, i.e., without NaNs.


In [64]:
# Compare the number of rows with the number of distinct image ids.
ids_img_meta_arr = df_livecell_train_imgs_meta['id'].unique()
print(f"there are {len(df_livecell_train_imgs_meta)} items in the image meta dataframe")
print(f"there are {len(ids_img_meta_arr)} unique images in the training dataset")

there are 3253 items in the image meta dataframe
there are 3253 unique images in the training dataset


According to the output of the following code block, the `id` column is already sorted in ascending order.

In [69]:
# Sort a copy of the image ids and compare it with the original order.
ids_img_meta_arr_copy = ids_img_meta_arr.copy()
ids_img_meta_arr_copy.sort()
print(ids_img_meta_arr_copy)
# Exact comparison on purpose: np.allclose applies a relative tolerance
# (rtol=1e-05 by default), so for large integer ids it could report an
# unsorted column as sorted when neighbouring ids differ only slightly.
print(np.array_equal(ids_img_meta_arr, ids_img_meta_arr_copy))

[      1     133     323 ... 1587591 1588143 1588375]
True


### Metadata: annotations

In [61]:
## dataframe of annotations dict
# orient='index' turns every key of the annotation dict into a row label and
# the dict stored under that key into the columns of that row.

df_livecell_train_annots_meta = pd.DataFrame.from_dict(train_data['annotations'], orient='index')
df_livecell_train_annots_meta.head()

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd
2,2,1,1,"[[288.02, 305.63, 286.01, 298.87, 286.01, 295....",307.4786,"[286.01, 287.73, 19.170000000000016, 20.269999...",0
3,3,1,1,"[[271.22, 323.34, 267.93, 322.61, 266.29, 320....",247.47555,"[263.0, 304.9, 20.44999999999999, 18.439999999...",0
4,4,1,1,"[[284.91, 279.88, 289.85, 281.52, 293.31, 281....",245.22945,"[275.42, 277.14, 23.91999999999996, 17.1600000...",0
5,5,1,1,"[[260.86, 327.64, 258.19, 325.63, 255.25, 324....",574.21305,"[246.96, 280.72, 20.98999999999998, 54.0]",0
6,6,1,1,"[[241.75, 324.69, 239.61, 326.97, 236.27, 331....",296.3114,"[229.45, 302.91, 22.590000000000003, 32.75]",0


In [59]:
# Look for rows with missing values in the annotation metadata; the helper
# prints a summary and returns the offending rows (or None when there are none).
check_noneffective_ids(df_livecell_train_annots_meta)

the rows available are all effective, i.e., without NaNs.


In [63]:
# Distinct values of the id-like columns of the annotation dataframe.
ids_arr = df_livecell_train_annots_meta['id'].unique()
img_ids_arr = df_livecell_train_annots_meta['image_id'].unique()
category_ids_arr = df_livecell_train_annots_meta['category_id'].unique()
iscrowd_arr = df_livecell_train_annots_meta['iscrowd'].unique()

# Report how many distinct values each column holds.
print(f"there are {len(ids_arr)} unique ids in the training dataset")
print(f"there are {len(img_ids_arr)} unique images in the training dataset")
print(f"there are {len(category_ids_arr)} unique categories in the training dataset")
print(f"there are {len(iscrowd_arr)} unique values of iscrowd in the training dataset")

there are 1018576 unique ids in the training dataset
there are 3253 unique images in the training dataset
there are 1 unique categories in the training dataset
there are 1 unique values of iscrowd in the training dataset


### Metadata: info

In [43]:
# Show the 'info' entry of the training metadata.
train_data['info']

{'year': '2020',
 'version': '1.0',
 'description': 'LIVECell 2021 Dataset',
 'contributor': 'Sartorius',
 'url': 'https://osf.io/6kang/?view_only=da0516e9189b4dbdbf018475113ed343',
 'date_created': '2021/01/19'}

### Metadata: licenses

In [44]:
# Show the 'licenses' entry of the training metadata.
train_data['licenses']

[{'id': 1,
  'name': 'Attribution-NonCommercial 4.0 International License',
  'url': 'https://creativecommons.org/licenses/by-nc/4.0/'}]

## Validation metadata exploration

## Testing metadata exploration