# Data exploration of LIVECell

In [1]:
import os, inspect
import pathlib2
import pandas as pd
import numpy as np
import json
import re

In [33]:
# Helper functions

def learn_metadata(metadata_path):
    """
    load a COCO-style metadata json file and print a short overview of its top-level containers
    :param metadata_path: path (str or path object) of the metadata json file
    :return: dict metadata as parsed from the json file
    :raises KeyError: if one of the keys 'images', 'annotations', 'categories', 'info' or 'licenses' is missing
    :raises IndexError: if one of the inspected containers is empty
    """

    # read the json file metadata as a dict; JSON text is UTF-8, so do not rely on the platform default encoding
    with open(metadata_path, encoding='utf-8') as json_data:
        metadata = json.load(json_data)

    images = metadata['images']
    annotations = metadata['annotations']

    # learn basic knowledge of metadata
    print(metadata.keys(), '\n\n')
    print('the type of images container: ', type(images))
    print('the first element of <list> images: \n', images[0], '\n\n')
    print('the type of annotations container: ', type(annotations))
    print('the first ten keys of <dict> annotations: \n', list(annotations.keys())[:10])
    # materialise the (potentially very large) item list only once
    first_key, first_value = list(annotations.items())[0]
    print(f"the first item of <dict> annotations has the key: \n {first_key} \nand the "
          f"value: \n{first_value} \n\n")
    print('the type of categories container: ', type(metadata['categories']))
    print('the only element of <list> categories: \n', metadata['categories'][0], '\n\n')
    print('the type of info container: ', type(metadata['info']))
    print('the keys of <dict> info: \n', metadata['info'].keys(), '\n\n')
    print('the type of licenses container: ', type(metadata['licenses']))
    print('the first element of <list> licenses: \n', metadata['licenses'][0])

    return metadata

def check_noneffective_ids(df):
    """
    look for rows of the given data frame that contain at least one NaN
    :param df: pd.DataFrame input dataframe
    :return: pd.DataFrame holding only the rows with a NaN, or None when no such row exists
    """

    # a row is non effective as soon as any of its cells is null
    nan_rows = df[df.isnull().any(axis=1)]
    nan_row_count = nan_rows.shape[0]

    if nan_row_count == 0:
        print("the rows available are all effective, i.e., without NaNs.")
        return None

    print(f"there are {nan_row_count} non effective rows.")
    return nan_rows

def check_meta_images_df(df_img):
    """
    check the validity of metadata of images and print the findings
    :param df_img: pd.DataFrame dataframe that contains image metadata; the columns 'id', 'width', 'height' and
                   'file_name' are read
    :return: tuple (int number of rows in df_img, int number of unique image ids) if every file_name is unique, or
             tuple (int number of rows in df_img, int number of unique image ids, pd.DataFrame rows of df_img that
             keep only the first occurrence of every file_name) if some file names are duplicated
    """

    # check the number of available items and unique ids of the dataframe
    len_df_img = df_img.shape[0]
    print(f"there are {len_df_img} items in the image metadata frame.")
    img_ids_array = df_img['id'].unique()
    len_img_ids_array = img_ids_array.shape[0]
    print(f"there are {len_img_ids_array} unique ids (not images) in the image metadata frame.")

    # check if the ids are sorted by size: unique() keeps the order of first appearance, so the ids are sorted
    # exactly when they equal their sorted copy (array_equal is an exact comparison and also works for non-numeric ids)
    print(np.array_equal(img_ids_array, np.sort(img_ids_array)), ': the ids are sorted by size.')

    # check if all images are of the same size given the width and height
    uniq_width_arr = df_img['width'].unique()
    uniq_height_arr = df_img['height'].unique()
    if (uniq_width_arr.size == 1) and (uniq_height_arr.size == 1):
        print((uniq_width_arr[0] == 704) and (uniq_height_arr[0] == 520), ': all the images are of size (704, 520).')
    elif (uniq_width_arr.size != 1) and (uniq_height_arr.size == 1):
        print("some images have different width.")
    elif (uniq_width_arr.size == 1) and (uniq_height_arr.size != 1):
        print("some images have different height.")
    elif (uniq_width_arr.size != 1) and (uniq_height_arr.size != 1):
        print("some images have different size.")

    # check if all the file names have the same extension .tif
    # (a suffix test; counting the pattern '.tif' would treat '.' as a regex wildcard and count every occurrence)
    has_tif_suffix = df_img['file_name'].str.endswith('.tif', na=False)
    if has_tif_suffix.all():
        print('all files are extended by .tif.')
    else:
        print('there are files extended by different file format.')

    # check if all the file names are unique
    uniq_file_names_arr = df_img['file_name'].unique()
    len_uniq_file_names_arr = uniq_file_names_arr.shape[0]
    print(f"there are {len_uniq_file_names_arr} unique file names in the image metadata frame.")
    if len_uniq_file_names_arr < len_df_img:
        # keep the first row of every file_name and drop its later repetitions
        has_duplicate = df_img['file_name'].duplicated()
        non_duplicates_filename = df_img[~has_duplicate]
        return len_df_img, len_img_ids_array, non_duplicates_filename
    else:
        return len_df_img, len_img_ids_array

def check_filename_correspondence(series, image_path):
    """
    check the correspondence between the file_name column and the image file names in the corresponding directory
    :param series: pd.Series series representing file_name column extracted from the metadata of the image
    :param image_path: path object (pathlib style) representing the directory whose sub-directories hold the images
    :return: None, the findings are only printed
    """

    # collect the names of the valid (non-empty) .tif images found one directory level below image_path
    valid_image_names_list = list()
    for cell_type_image_path in image_path.iterdir():
        if not cell_type_image_path.is_dir():
            continue  # only sub-directories are expected to contain images
        for cell_image_path in cell_type_image_path.glob('*.tif'):
            # st_size is always an int, so the check has to be on the size itself: an empty file is not a valid image
            if cell_image_path.stat().st_size > 0:
                valid_image_names_list.append(cell_image_path.name)
    print(f"the number of valid images in dir {image_path.name} is: {len(valid_image_names_list)}.")

    # check the correspondence between file_name column and valid images in the image_path
    valid_image_names_arr = np.asarray(valid_image_names_list)
    # intersect1d returns unique values only, so the file names must be de-duplicated before comparing the sizes
    file_names_arr = series.drop_duplicates().to_numpy()
    print(f"the number of unique filenames in column file_name is: {file_names_arr.shape[0]}.")
    intersect_arr = np.intersect1d(file_names_arr, valid_image_names_arr)
    if intersect_arr.shape[0] == file_names_arr.shape[0]:
        print("all unique filenames in column file_name have the corresponding images.")
    else:
        print("there are missing images for the given unique filenames in column file_name.")

    return None

def check_meta_annots_df(df_img, df_annot):
    """
    check the validity of metadata of annotations and print the findings
    :param df_img: pd.DataFrame dataframe that contains image metadata; the column 'id' is read
    :param df_annot: pd.DataFrame dataframe that contains annotation metadata; its index must be convertible to
                     int64 and the columns 'id', 'image_id', 'category_id' and 'iscrowd' are read
    :return: None, the findings are only printed
    """

    # check the equality between the index and column 'id' of the input dataframe
    index_arr = df_annot.index.to_numpy()
    index_arr = index_arr.astype('int64')
    ids_annot_arr = df_annot['id'].to_numpy()
    print(np.allclose(index_arr, ids_annot_arr), ': The index values and values of the id column of annotations metadata in form'
                                     ' of data frame is elementwise equal.')

    # check the number of unique values of some existing columns
    # (count the unique ids, not the rows, so that repeated annotation ids are not reported as unique)
    uniq_ids_annot_arr = df_annot['id'].unique()
    print(f"there are {uniq_ids_annot_arr.shape[0]} unique ids in the metadata for annotations.")
    img_ids_annot_arr = df_annot['image_id'].unique()
    print(f"there are {img_ids_annot_arr.shape[0]} unique image ids in the metadata for annotations.")
    category_ids_annot_arr = df_annot['category_id'].unique()
    print(f"there are {category_ids_annot_arr.shape[0]} unique categories in the metadata for annotations.")
    iscrowd_annot_arr = df_annot['iscrowd'].unique()
    print(f"there are {iscrowd_annot_arr.shape[0]} unique values of iscrowd in the metadata for annotations.")

    # check the pair relationship between image ids of metadata for images and annotations:
    # compare the sorted unique ids of both sides, so the result neither depends on the row order of df_img nor
    # fails with a broadcasting error when one side has more ids than the other
    img_ids_img_arr = np.unique(df_img['id'].to_numpy())
    print(np.array_equal(img_ids_img_arr, np.sort(img_ids_annot_arr)), ': all the image ids of metadata for images and '
                                                                       'annotations are paired with each other.')

    # TODO: check the segmentation, area and bbox column
    return None

## Define relevant paths

In [3]:
# Take the current working directory as the notebook location and its parent directory as the project root.
current_dir = pathlib2.Path.cwd()
# alternative kept for reference: derive the location from the executing frame instead of the working directory
# current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
print("currentdir: ", current_dir.as_posix())
project_dir = current_dir.parent
print("project dir: ", project_dir.as_posix())

currentdir:  F:/Kaggle/sartorius_cell_instance_segmentation/code/rkx_cell_is/analytics
project dir:  F:/Kaggle/sartorius_cell_instance_segmentation/code/rkx_cell_is


In [4]:
# Build all dataset paths relative to the project root.
ds_path = project_dir / 'dataset'
print("dataset dir: ", ds_path.as_posix())
livecell_ds_path = ds_path / 'LIVECell_dataset_2021'
print("livecell dataset dir: ", livecell_ds_path.as_posix())
# iterdir() yields the entries in arbitrary order, so sort them to make the unpacking deterministic.
# NOTE(review): assumes the dataset root has exactly two sub-directories and that the annotation directory sorts
# before the image directory (e.g. 'annotations' < 'images') - confirm against the dataset layout.
livecell_ds_annot_path, livecell_ds_imgs_path = sorted(x for x in livecell_ds_path.iterdir() if x.is_dir())

# COCO-style metadata files of the three splits
livecell_train_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_train.json'
livecell_val_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_val.json'
livecell_test_meta_path = livecell_ds_annot_path / 'LIVECell' / 'livecell_coco_test.json'

# image directories: training and validation images share one directory, testing images have their own
livecell_train_val_img_path = livecell_ds_imgs_path / 'livecell_train_val_images'
livecell_test_img_path = livecell_ds_imgs_path / 'livecell_test_images'

dataset dir:  F:/Kaggle/sartorius_cell_instance_segmentation/code/rkx_cell_is/dataset
livecell dataset dir:  F:/Kaggle/sartorius_cell_instance_segmentation/code/rkx_cell_is/dataset/LIVECell_dataset_2021


## Training metadata exploration

In [36]:
# Load the training metadata json and print an overview of its top-level containers.
train_data = learn_metadata(livecell_train_meta_path)

dict_keys(['images', 'annotations', 'categories', 'info', 'licenses']) 


the type of images container:  <class 'list'>
the first element of <list> images: 
 {'id': 1, 'width': 704, 'height': 520, 'file_name': 'BT474_Phase_A3_2_00d04h00m_3.tif', 'original_filename': 'BT474_Phase_A3_2_00d04h00m_3.png', 'url': 'https://darwin.v7labs.com/api/images/870028/original'} 


the type of annotations container:  <class 'dict'>
the first ten keys of <dict> annotations: 
 ['2', '3', '4', '5', '6', '7', '8', '9', '10', '12']
the first item of <dict> annotations has the key: 
 2 
and the value: 
{'id': 2, 'image_id': 1, 'category_id': 1, 'segmentation': [[288.02, 305.63, 286.01, 298.87, 286.01, 295.4, 288.02, 290.1, 293.86, 287.91, 297.51, 287.73, 300.44, 289.01, 304.27, 292.48, 304.64, 295.04, 305.18, 297.77, 305.18, 300.7, 303.91, 302.52, 301.17, 305.26, 297.33, 307.45, 294.59, 307.45, 290.58, 308.0]], 'area': 307.4786000000313, 'bbox': [286.01, 287.73, 19.170000000000016, 20.269999999999982], 'isc

### Metadata: images

In [37]:
# Turn the 'images' list of the training metadata into a dataframe (one row per image entry) and preview it.
df_livecell_train_imgs_meta = pd.DataFrame.from_dict(train_data['images'])
df_livecell_train_imgs_meta.head()

Unnamed: 0,id,width,height,file_name,original_filename,url
0,1,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,BT474_Phase_A3_2_00d04h00m_3.png,https://darwin.v7labs.com/api/images/870028/or...
1,133,704,520,BT474_Phase_C3_2_02d12h00m_4.tif,BT474_Phase_C3_2_02d12h00m_4.png,https://darwin.v7labs.com/api/images/37512/ori...
2,323,704,520,BT474_Phase_C3_1_01d16h00m_3.tif,BT474_Phase_C3_1_01d16h00m_3.png,https://darwin.v7labs.com/api/images/108155/or...
3,498,704,520,BT474_Phase_C3_1_02d16h00m_4.tif,BT474_Phase_C3_1_02d16h00m_4.png,https://darwin.v7labs.com/api/images/108172/or...
4,741,704,520,BT474_Phase_C3_1_04d00h00m_3.tif,BT474_Phase_C3_1_04d00h00m_3.png,https://darwin.v7labs.com/api/images/921711/or...


In [7]:
# check the validity of the training image metadata
len_df_train_img = 0
len_train_img_ids = 0
non_duplicated_train_filenames = pd.DataFrame()
if check_noneffective_ids(df_livecell_train_imgs_meta) is None:
    train_check_results = check_meta_images_df(df_livecell_train_imgs_meta)
    # the first two results are always the number of rows and the number of unique ids
    len_df_train_img, len_train_img_ids = train_check_results[:2]
    if len(train_check_results) == 3:
        # a third result is only returned when duplicated file names were found
        non_duplicated_train_filenames = train_check_results[2]
    else:
        # no duplicated file names: every row is already unique, so keep the whole frame; leaving the empty
        # placeholder here would make the later access to its 'file_name' column fail
        non_duplicated_train_filenames = df_livecell_train_imgs_meta
else:
    print('There are non-valid metadata in terms of image. Please handle it!')

the rows available are all effective, i.e., without NaNs.
there are 3253 items in the image metadata frame.
there are 3253 unique ids (not images) in the image metadata frame.
True : the ids are sorted by size.
True : all the images are of size (704, 520).
all files are extended by .tff.
there are 3188 unique file names in the image metadata frame.


In [8]:
# Report how many training rows remain after keeping only the first occurrence of every file_name.
print(f"the meta data of image has {non_duplicated_train_filenames.shape[0]} non-duplicated rows in terms of file_name.")

the meta data of image has 3188 non-duplicated rows in terms of file_name.


In [9]:
# One example of a file_name that appears in more than one row of the training image metadata.
# regex=False makes the lookup a literal substring match, so the '.' in the name is not treated as a wildcard.

df_livecell_train_imgs_meta[df_livecell_train_imgs_meta['file_name'].str.contains('Huh7_Phase_A10_2_00d16h00m_4.tif',
                                                                                  regex=False)]

Unnamed: 0,id,width,height,file_name,original_filename,url
1500,742297,704,520,Huh7_Phase_A10_2_00d16h00m_4.tif,Huh7_Phase_A10_2_00d16h00m_4.png,https://darwin.v7labs.com/api/images/37624/ori...
1726,1012330,704,520,Huh7_Phase_A10_2_00d16h00m_4.tif,Huh7_Phase_A10_2_00d16h00m_4.png,https://darwin.v7labs.com/api/images/47430/ori...


### Metadata: annotations

In [17]:
# Dataframe of the annotations dict: orient='index' turns every dict key into a row label and every
# annotation record into one row; preview the first rows.

df_livecell_train_annots_meta = pd.DataFrame.from_dict(train_data['annotations'], orient='index')
df_livecell_train_annots_meta.head()

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd
2,2,1,1,"[[288.02, 305.63, 286.01, 298.87, 286.01, 295....",307.4786,"[286.01, 287.73, 19.170000000000016, 20.269999...",0
3,3,1,1,"[[271.22, 323.34, 267.93, 322.61, 266.29, 320....",247.47555,"[263.0, 304.9, 20.44999999999999, 18.439999999...",0
4,4,1,1,"[[284.91, 279.88, 289.85, 281.52, 293.31, 281....",245.22945,"[275.42, 277.14, 23.91999999999996, 17.1600000...",0
5,5,1,1,"[[260.86, 327.64, 258.19, 325.63, 255.25, 324....",574.21305,"[246.96, 280.72, 20.98999999999998, 54.0]",0
6,6,1,1,"[[241.75, 324.69, 239.61, 326.97, 236.27, 331....",296.3114,"[229.45, 302.91, 22.590000000000003, 32.75]",0


In [59]:
# Check the training annotation dataframe for rows that contain NaNs.
check_noneffective_ids(df_livecell_train_annots_meta)

the rows available are all effective, i.e., without NaNs.


In [38]:
# Check the training annotation metadata against the training image metadata (ids, unique counts, image id pairing).
check_meta_annots_df(df_livecell_train_imgs_meta, df_livecell_train_annots_meta)

True : The index values and values of the id column of annotations metadata in form of data frame is elementwise equal.
there are 1018576 unique ids in the metadata for annotations.
there are 3253 unique image ids in the metadata for annotations.
there are 1 unique categories in the metadata for annotations.
there are 1 unique values of iscrowd in the metadata for annotations.
True : all the image ids of metadata for images and annotations are paired with each other.


### Partial conclusions of training metadata - images and annotations

- The *image_id* column of the annotation metadata and the *id* column of the image metadata contain the same set of
unique image ids. That means every image id on one side can be paired with an image id on the other side.

### Metadata: categories

In [14]:
# Show the 'categories' entry of the training metadata.
train_data['categories']

[{'supercategory': 'cell', 'id': 1, 'name': 'cell'}]

### Metadata: info

In [43]:
# Show the 'info' entry of the training metadata.
train_data['info']

{'year': '2020',
 'version': '1.0',
 'description': 'LIVECell 2021 Dataset',
 'contributor': 'Sartorius',
 'url': 'https://osf.io/6kang/?view_only=da0516e9189b4dbdbf018475113ed343',
 'date_created': '2021/01/19'}

### Metadata: licenses

In [44]:
# Show the 'licenses' entry of the training metadata.
train_data['licenses']

[{'id': 1,
  'name': 'Attribution-NonCommercial 4.0 International License',
  'url': 'https://creativecommons.org/licenses/by-nc/4.0/'}]

## Validation metadata exploration

In [10]:
# Load the validation metadata json and print an overview of its top-level containers.
val_data = learn_metadata(livecell_val_meta_path)

dict_keys(['images', 'annotations', 'categories', 'info', 'licenses']) 


the type of images container:  <class 'list'>
the first element of <list> images: 
 {'id': 86187, 'width': 704, 'height': 520, 'file_name': 'BT474_Phase_B3_1_00d12h00m_1.tif', 'original_filename': 'BT474_Phase_B3_1_00d12h00m_1.png', 'url': 'https://darwin.v7labs.com/api/images/31634/original'} 


the type of annotations container:  <class 'dict'>
the first ten keys of <dict> annotations: 
 ['86188', '86189', '86190', '86191', '86192', '86193', '86194', '86195', '86196', '86197']
the first item of <dict> annotations has the key: 
 86188 
and the value: 
{'id': 86188, 'image_id': 86187, 'category_id': 1, 'segmentation': [[704.0, 97.82, 696.67, 94.77, 691.43, 89.32, 690.68, 86.64, 691.43, 82.96, 694.0, 79.53, 699.08, 78.52, 703.89, 78.15]], 'area': 193.82679999998072, 'bbox': [690.68, 78.15, 13.32000000000005, 19.669999999999987], 'iscrowd': 0} 


the type of categories container:  <class 'list'>
the only element of

### Metadata: images

In [11]:
# Turn the 'images' list of the validation metadata into a dataframe (one row per image entry) and preview it.
df_livecell_val_imgs_meta = pd.DataFrame.from_dict(val_data['images'])
df_livecell_val_imgs_meta.head()

Unnamed: 0,id,width,height,file_name,original_filename,url
0,86187,704,520,BT474_Phase_B3_1_00d12h00m_1.tif,BT474_Phase_B3_1_00d12h00m_1.png,https://darwin.v7labs.com/api/images/31634/ori...
1,86330,704,520,BT474_Phase_C3_2_02d16h00m_4.tif,BT474_Phase_C3_2_02d16h00m_4.png,https://darwin.v7labs.com/api/images/37516/ori...
2,86516,704,520,BT474_Phase_C3_2_02d12h00m_3.tif,BT474_Phase_C3_2_02d12h00m_3.png,https://darwin.v7labs.com/api/images/37511/ori...
3,86708,704,520,BT474_Phase_B3_1_05d00h00m_3.tif,BT474_Phase_B3_1_05d00h00m_3.png,https://darwin.v7labs.com/api/images/921840/or...
4,87048,704,520,BT474_Phase_B3_2_04d04h00m_4.tif,BT474_Phase_B3_2_04d04h00m_4.png,https://darwin.v7labs.com/api/images/47298/ori...


In [12]:
# check the validity of the validation image metadata
len_df_val_img = 0
len_val_img_ids = 0
non_duplicated_val_filenames = pd.DataFrame()
if check_noneffective_ids(df_livecell_val_imgs_meta) is None:
    val_check_results = check_meta_images_df(df_livecell_val_imgs_meta)
    # the first two results are always the number of rows and the number of unique ids
    len_df_val_img, len_val_img_ids = val_check_results[:2]
    if len(val_check_results) == 3:
        # a third result is only returned when duplicated file names were found
        non_duplicated_val_filenames = val_check_results[2]
    else:
        # no duplicated file names: every row is already unique, so keep the whole frame; leaving the empty
        # placeholder here would make the later access to its 'file_name' column fail
        non_duplicated_val_filenames = df_livecell_val_imgs_meta
else:
    print('There are non-valid metadata in terms of image. Please handle it!')

the rows available are all effective, i.e., without NaNs.
there are 570 items in the image metadata frame.
there are 570 unique ids (not images) in the image metadata frame.
True : the ids are sorted by size.
True : all the images are of size (704, 520).
all files are extended by .tff.
there are 569 unique file names in the image metadata frame.


In [13]:
# Report how many validation rows remain after keeping only the first occurrence of every file_name.
print(f"the meta data of image has {non_duplicated_val_filenames.shape[0]} non-duplicated rows in terms of file_name.")

the meta data of image has 569 non-duplicated rows in terms of file_name.


In [14]:
# The rows of the validation image metadata that share the file_name below.
# regex=False makes the lookup a literal substring match, so the '.' in the name is not treated as a wildcard.

df_livecell_val_imgs_meta[df_livecell_val_imgs_meta['file_name'].str.contains('Huh7_Phase_A10_2_00d04h00m_2.tif',
                                                                                  regex=False)]

Unnamed: 0,id,width,height,file_name,original_filename,url
229,876543,704,520,Huh7_Phase_A10_2_00d04h00m_2.tif,Huh7_Phase_A10_2_00d04h00m_2.png,https://darwin.v7labs.com/api/images/37610/ori...
354,1037056,704,520,Huh7_Phase_A10_2_00d04h00m_2.tif,Huh7_Phase_A10_2_00d04h00m_2.png,https://darwin.v7labs.com/api/images/47416/ori...


In [37]:
# Merge the de-duplicated training and validation file names, drop the names shared by both splits and
# verify that every remaining name has an image in the common training/validation image directory.

concat_series = pd.concat(
    [non_duplicated_train_filenames['file_name'], non_duplicated_val_filenames['file_name']],
    ignore_index=True,
    sort=False,
)
_unique_train_val_names = concat_series.unique()
if _unique_train_val_names.shape[0] == concat_series.shape[0]:
    print('there is no image file name from columns training file_name and validation file_name overlapped.')
else:
    # unique() can only shrink the series, so reaching this branch means both splits share file names
    print('there are image file names overlapped! So we need to remove those duplicated.')
    concat_series = pd.Series(_unique_train_val_names, name='file_name')
check_filename_correspondence(concat_series, livecell_train_val_img_path)

there are image file names overlapped! So we need to remove those duplicated.
the number of valid images in dir livecell_train_val_images is: 4184.
the number of unique filenames in column file_name is: 3727.
all unique filenames in column file_name have the corresponding images.


### Partial conclusions of training and validation metadata - images
According to the result of the blocks above, it can be derived that

- The metadata for training images contains **3253** image file names in total, of which **3188** are unique (each
duplicated file_name is counted once, via its first occurrence).
- The metadata for validation images contains **570** image file names in total, of which **569** are unique (each
duplicated file_name is counted once, via its first occurrence).
- Even after this de-duplication, the unique image file names of the training metadata and of the validation metadata
still overlap with each other in **30** file names. Thus, the training and validation metadata together contain
**3727 (= 3188 + 569 - 30)** unique image file names. **However, this alone is not sufficient to conclude that the
images behind the overlapping file names should be dropped from either the training or the validation images. The
annotation metadata and the LIVECell repo need further investigation before the relevant decision can be
made.**
- All the unique image file names of metadata for training images and metadata for validation images have the corresponding
images that can be found.

## Testing metadata exploration

In [5]:
# Load the testing metadata json and print an overview of its top-level containers.
test_data = learn_metadata(livecell_test_meta_path)

dict_keys(['images', 'annotations', 'categories', 'info', 'licenses']) 


the type of images container:  <class 'list'>
the first element of <list> images: 
 {'id': 101438, 'width': 704, 'height': 520, 'file_name': 'BT474_Phase_D3_1_04d04h00m_3.tif', 'original_filename': 'BT474_Phase_D3_1_04d04h00m_3.png', 'url': 'https://darwin.v7labs.com/api/images/870104/original'} 


the type of annotations container:  <class 'dict'>
the first ten keys of <dict> annotations: 
 ['101439', '101440', '101441', '101442', '101443', '101444', '101445', '101446', '101447', '101448']
the first item of <dict> annotations has the key: 
 101439 
and the value: 
{'id': 101439, 'image_id': 101438, 'category_id': 1, 'segmentation': [[281.6, 234.28, 292.24, 236.48, 298.84, 239.04, 305.08, 239.41, 308.38, 239.41, 310.21, 236.11, 310.21, 230.61, 308.75, 226.21, 304.34, 220.71, 302.88, 218.14, 300.68, 214.47, 296.64, 216.3, 288.94, 216.3, 287.11, 219.61, 284.17, 226.94]], 'area': 492.789049999963, 'bbox': [281.6, 21

### Metadata: images

In [7]:
# Turn the 'images' list of the testing metadata into a dataframe (one row per image entry) and preview it.
df_livecell_test_imgs_meta = pd.DataFrame.from_dict(test_data['images'])
df_livecell_test_imgs_meta.head()

Unnamed: 0,id,width,height,file_name,original_filename,url
0,101438,704,520,BT474_Phase_D3_1_04d04h00m_3.tif,BT474_Phase_D3_1_04d04h00m_3.png,https://darwin.v7labs.com/api/images/870104/or...
1,101798,704,520,BT474_Phase_D3_2_03d04h00m_2.tif,BT474_Phase_D3_2_03d04h00m_2.png,https://darwin.v7labs.com/api/images/61733/ori...
2,101943,704,520,BT474_Phase_D3_1_01d12h00m_4.tif,BT474_Phase_D3_1_01d12h00m_4.png,https://darwin.v7labs.com/api/images/921720/or...
3,102143,704,520,BT474_Phase_D3_2_04d16h00m_1.tif,BT474_Phase_D3_2_04d16h00m_1.png,https://darwin.v7labs.com/api/images/37521/ori...
4,102490,704,520,BT474_Phase_D3_2_00d12h00m_3.tif,BT474_Phase_D3_2_00d12h00m_3.png,https://darwin.v7labs.com/api/images/921792/or...


In [8]:
# check the validity of the testing image metadata
len_df_test_img = 0
len_test_img_ids = 0
non_duplicated_test_filenames = pd.DataFrame()
if check_noneffective_ids(df_livecell_test_imgs_meta) is None:
    test_check_results = check_meta_images_df(df_livecell_test_imgs_meta)
    # the first two results are always the number of rows and the number of unique ids
    len_df_test_img, len_test_img_ids = test_check_results[:2]
    if len(test_check_results) == 3:
        # a third result is only returned when duplicated file names were found
        non_duplicated_test_filenames = test_check_results[2]
    else:
        # no duplicated file names: every row is already unique, so keep the whole frame; leaving the empty
        # placeholder here would make the later access to its 'file_name' column fail
        non_duplicated_test_filenames = df_livecell_test_imgs_meta
else:
    print('There are non-valid metadata in terms of image. Please handle it!')

the rows available are all effective, i.e., without NaNs.
there are 1564 items in the image metadata frame.
there are 1564 unique ids (not images) in the image metadata frame.
True : the ids are sorted by size.
True : all the images are of size (704, 520).
all files are extended by .tff.
there are 1512 unique file names in the image metadata frame.


In [10]:
# Report how many testing rows remain after keeping only the first occurrence of every file_name.
print(f"the meta data of image has {non_duplicated_test_filenames.shape[0]} non-duplicated rows in terms of file_name.")

the meta data of image has 1512 non-duplicated rows in terms of file_name.


In [11]:
# One example of a file_name that appears in more than one row of the testing image metadata
# (not the only one: the check above reports 1564 rows but only 1512 unique file names).
# regex=False makes the lookup a literal substring match, so the '.' in the name is not treated as a wildcard.

df_livecell_test_imgs_meta[df_livecell_test_imgs_meta['file_name'].str.contains('Huh7_Phase_A12_1_03d16h00m_2.tif',
                                                                                  regex=False)]

Unnamed: 0,id,width,height,file_name,original_filename,url
504,918641,704,520,Huh7_Phase_A12_1_03d16h00m_2.tif,Huh7_Phase_A12_1_03d16h00m_2.png,https://darwin.v7labs.com/api/images/37666/ori...
690,1038567,704,520,Huh7_Phase_A12_1_03d16h00m_2.tif,Huh7_Phase_A12_1_03d16h00m_2.png,https://darwin.v7labs.com/api/images/47472/ori...


In [12]:
# Check that every de-duplicated file_name of the testing metadata has a corresponding image
# in the testing image directory.

test_series = non_duplicated_test_filenames['file_name']
check_filename_correspondence(test_series, livecell_test_img_path)

the number of valid images in dir livecell_test_images is: 1664.
the number of unique filenames in column file_name is: 1512.
all unique filenames in column file_name have the corresponding images.


### Partial conclusions of testing metadata - images
According to the result of the blocks above, it can be derived that

- The metadata for testing images contains **1564** image file names in total, of which **1512** are unique (each
duplicated file_name is counted once, via its first occurrence).
- All the unique image file names of metadata for testing images have the corresponding images that can be found.