In [1]:
from os.path import join, splitext, split
import glob
import json

import pandas as pd
import h5py

# Inputs

In [2]:
# Root of the slide database; joined with site, diagnosis and slide file name
# to build each slide path.
BASE_DIR = "PathologySearchComparison/DATA/DATABASE/"
# Directory searched for the patch (.h5) files.
DATA_DIR = "."

In [3]:
# Load the metadata table and key it by slide file name so that rows can be
# looked up directly with ``metadata.loc[file_name, ...]``.
metadata = pd.read_csv("sampled_metadata.csv").set_index('file_name')

# Functions

In [4]:
def read_h5_patch(h5_patch):
    """Read patch coordinates and patching parameters from an HDF5 file.

    Parameters
    ----------
    h5_patch
        Path of an HDF5 file that contains a ``coords`` dataset.

    Returns
    -------
    tuple
        ``(coords, patch_level, patch_size)``: the full contents of the
        ``coords`` dataset, followed by the values of its ``patch_level``
        and ``patch_size`` attributes.
    """
    with h5py.File(h5_patch, 'r') as h5_file:
        coords_dataset = h5_file['coords']
        # Materialise everything before the file is closed on leaving the
        # ``with`` block.
        coords = coords_dataset[:]
        attributes = coords_dataset.attrs
        return coords, attributes["patch_level"], attributes["patch_size"]

# DataFrame Generation

In [5]:
# Patch files are searched two directory levels below DATA_DIR, inside a
# "patches" folder: <DATA_DIR>/*/*/patches/*.h5
patch_pattern = join(DATA_DIR, "*", "*", "patches", "*.h5")
# glob returns matches in a filesystem-dependent order; sorting makes the row
# order of everything derived from this list reproducible across runs.
patches_list = sorted(glob.glob(patch_pattern))

In [6]:
# One list per output column. Declaring the columns up front keeps the table
# schema (and the CSV header) stable even when no patch file or no coordinate
# is found; the insertion order below is the column order of the output.
patch_dict = {
    "file_id": [],
    "file_name": [],
    "slide_path": [],
    "patch_level": [],
    "patch_size": [],
    "coord1": [],
    "coord2": [],
}

for h5_patch in patches_list:
    coords, patch_level, patch_size = read_h5_patch(h5_patch)

    # Decompose <...>/<site>/<diagnosis>/<patch folder>/<slide>.h5 from the
    # right-hand side of the path.
    patches_dir, h5_name = split(h5_patch)
    file_name = splitext(h5_name)[0] + ".svs"
    diagnosis_dir = split(patches_dir)[0]
    site_dir, diagnosis = split(diagnosis_dir)
    site = split(site_dir)[1]

    # Raises KeyError when the slide is not listed in the metadata table.
    file_id = metadata.loc[file_name, "id"]
    slide_path = join(BASE_DIR, site, diagnosis, file_name)

    # Slide-level values are repeated once per patch coordinate.
    n_patches = len(coords)
    patch_dict["file_id"].extend([file_id] * n_patches)
    patch_dict["file_name"].extend([file_name] * n_patches)
    patch_dict["slide_path"].extend([slide_path] * n_patches)
    patch_dict["patch_level"].extend([patch_level] * n_patches)
    patch_dict["patch_size"].extend([patch_size] * n_patches)
    # Each row of ``coords`` contributes its first two components.
    patch_dict["coord1"].extend(coord[0] for coord in coords)
    patch_dict["coord2"].extend(coord[1] for coord in coords)

In [7]:
# Turn the per-column lists into a table and write it out without the
# positional row index.
patch_dataframe = pd.DataFrame(data=patch_dict)
patch_dataframe.to_csv(path_or_buf="patch_dataframe.csv", index=False)