## Moving ground-truth data to n5 containers

This notebook transfers ground-truth data from h5 files to the centralized n5 container for each dataset, adding relevant metadata (resolution and offset in nm). The ground-truth data is added to the main n5 containers under the following path:  
`volumes/groundtruth/{version}/{crop_name}`  
where `version` starts at `0003` as per Larissa's instruction. Versioning the ground-truth data will enable corresponding versioning of networks.


Additionally, this notebook performs a one-time reorganization of the n5 containers:

`volumes/mask/{background_data}`  
will become  
`volumes/masks/groundtruth/{mask_data}`  
`volumes/masks/foreground/{mask_data}`.    

In [1]:
import os
from glob import glob

from fst.io import read

def digest_annotated_data(fname, resolution_raw=4):
    """Load a ground-truth crop from an h5 file and derive its metadata from the filename.

    The filename must contain:

    * a crop name of the form ``Crop<digits>``,
    * the padded crop dimensions as ``<int>x<int>x<int>``,
    * the padded crop offset as three sign-prefixed integers, e.g. ``+744+67+1294``.

    Parameters
    ----------
    fname : str
        Path to the h5 file. Labels are read from the ``/volumes/labels/gt`` dataset,
        whose ``offset`` attribute is used to compute the padding width.
    resolution_raw : int or float, optional
        Resolution of the raw data; used to scale the offset and to derive the
        resolution of the label data. Defaults to 4.

    Returns
    -------
    data
        The result of ``read(f'{fname}:/volumes/labels/gt')[:]``.
    attrs : dict
        ``name`` (crop name), ``offset`` (list of 3 numbers) and ``pixelResolution``
        (dict with ``unit='nm'`` and ``dimensions``, a list of 3 numbers).

    Raises
    ------
    ValueError
        If the crop name, dimensions or offset cannot be found in ``fname``.
    """
    import re
    import numpy as np
    from fst.io import read

    # Raw strings throughout: the previous non-raw '[\+,-]' relied on an invalid
    # escape sequence, and its ',' made a literal comma count as a sign character.
    sign_pattern = r'[+-]'
    crop_name_pattern = r'Crop\d+'
    dims_pattern = r'\d+x\d+x\d+'
    offset_pattern = (sign_pattern + r'\d+') * 3
    label_dset_name = '/volumes/labels/gt'

    def _search(pattern, description):
        # Return the first match of `pattern` in the filename, or fail with a readable error.
        match = re.search(pattern, fname)
        if match is None:
            raise ValueError(f'Could not find {description} in filename {fname!r}')
        return match.group(0)

    crop_name = _search(crop_name_pattern, 'a crop name like "Crop33"')
    dims = np.array(_search(dims_pattern, 'dimensions like "400x400x400"').split('x')).astype('int')
    # Drop the leading sign, then split on the remaining signs. The signs themselves are discarded.
    offset_text = _search(offset_pattern, 'an offset like "+744+67+1294"')
    padded_offset = np.array(re.split(sign_pattern, offset_text[1:])).astype('int')

    # NOTE(review): the divisor 4 is hard-coded independently of `resolution_raw`;
    # confirm whether the two are meant to be tied together.
    stored_offset = np.asarray(read(fname)[label_dset_name].attrs['offset'])
    pad_width = ((stored_offset + 1) / 4).astype('int')
    offset_native_res = np.abs(padded_offset - pad_width)
    data = read(f'{fname}:{label_dset_name}')[:]

    # NOTE(review): the offset is divided (not multiplied) by `resolution_raw`;
    # confirm the units of the offsets encoded in the filename.
    offset_nm = offset_native_res / resolution_raw

    # Ratio between the label array shape and the unpadded crop size gives the
    # upsampling factor of the labels relative to the raw data.
    resolution_nm = resolution_raw / (np.array(data.shape) / (dims - 2 * pad_width))

    attrs = {
        'name': crop_name,
        'offset': offset_nm.tolist(),
        'pixelResolution': dict(unit='nm', dimensions=resolution_nm.tolist()),
    }

    return data, attrs

def append_dataset(container_path, dataset_path, data, attrs=None):
    """Write `data` as a new array inside an existing n5 container.

    Chunking and the gzip compression level are copied from the container's
    ``volumes/raw`` array so the new array matches the existing layout.

    Parameters
    ----------
    container_path : str
        Path to the n5 container. It must already contain ``volumes/raw``.
    dataset_path : str
        Path of the new array inside the container. It must contain at least two
        '/'-separated components (e.g. ``volumes/groundtruth/0003/Crop33``);
        the second component selects the directory whose permissions are fixed up.
    data : array-like with ``shape`` and ``dtype``
        The values to store.
    attrs : dict, optional
        Attributes to attach to the new array. ``None`` (the default) or an
        empty dict stores no attributes.

    Returns
    -------
    The zarr array that was created.
    """
    import numcodecs
    import zarr
    from fst.io import read, chmodr

    # inherit the chunking scheme and compressor
    template_path = 'volumes/raw'
    template = read(container_path)[template_path]
    compressor = numcodecs.GZip(level=template.compressor.compressor_config['level'])

    # mode='w' creates the array, replacing anything already stored at `dataset_path`.
    array = zarr.open(store=zarr.N5Store(container_path),
                      path=dataset_path,
                      chunks=template.chunks,
                      compressor=compressor,
                      shape=data.shape,
                      mode='w',
                      dtype=data.dtype)

    array[:] = data
    # `attrs` defaults to None, and `update(**None)` would raise a TypeError.
    if attrs:
        array.attrs.update(**attrs)
    # this is ugly but saved time: presumably resets permissions on the whole
    # top-level group under `volumes/` (confirm against fst.io.chmodr).
    chmodr(container_path + '/volumes/' + dataset_path.split('/')[1], mode='umask')
    return array

def move_dataset(container_path, src, dest):
    """Relocate a dataset within a container using plain directory operations.

    Parameters
    ----------
    container_path : str
        Path to the container directory.
    src : str
        Current path of the dataset, relative to the container.
    dest : str
        New path of the dataset, relative to the container. Missing parent
        directories are created.

    Returns
    -------
    str
        The destination path, as returned by ``shutil.move``.

    Raises
    ------
    FileNotFoundError
        If ``src`` does not exist. Checked before any directory is created, so
        a failed call leaves no empty destination parents behind.
    FileExistsError
        If ``dest`` already exists. Without this check ``shutil.move`` would
        silently nest ``src`` inside an existing ``dest`` directory.
    """
    import os
    import shutil
    from pathlib import Path

    container = Path(container_path)
    src_path = container / src
    dest_path = container / dest

    if not src_path.exists():
        raise FileNotFoundError(f'Source dataset does not exist: {src_path}')
    if dest_path.exists():
        raise FileExistsError(f'Destination already exists: {dest_path}')

    os.makedirs(dest_path.parent, exist_ok=True)
    return shutil.move(str(src_path), str(dest_path))
    

In [11]:
# Transfer every annotated HeLa_Cell3 crop into the HeLa_Cell3 n5 container.
# glob() does not expand '~', so the home directory has to be expanded explicitly;
# without it the pattern matches nothing and the loop below silently does no work.
fnames = glob(os.path.expanduser('~/dm11_cosem/annotations/bigcat/HeLa_Cell3*/*HeLa_Cell3*.h5'))
if not fnames:
    raise FileNotFoundError('No HeLa_Cell3 annotation files were found; check the glob pattern')
crop_version = '0003'
# NOTE(review): the container path is relative, so this cell only works when the
# working directory is the one that contains `nrs_cosem/` — confirm.
for fn in fnames:
    data, attrs = digest_annotated_data(fn)
    crop_name = attrs['name']
    arr = append_dataset('nrs_cosem/davis/HeLa_Cell3_4x4x4nm.n5', 
                         f'volumes/groundtruth/{crop_version}/{crop_name}', 
                         data=data, 
                         attrs=attrs)

In [38]:
# these results look sane
arr = read('/home/bennettd/nrs_cosem/davis/HeLa_Cell3_4x4x4nm.n5:/volumes/groundtruth/0003/Crop33')
print(arr)
print(list(arr.attrs.items()))

<zarr.core.Array '/volumes/groundtruth/0003/Crop33' (400, 400, 400) uint64 read-only>
[('name', 'Crop33'), ('offset', [734.5, 57.5, 1284.5]), ('pixelResolution', {'dimensions': [2.0, 2.0, 2.0], 'unit': 'nm'})]


In [39]:
## todo: Create ground-truth masks

In [None]:
def make_gt_mask(container_path, dest_path):
    """Placeholder for ground-truth mask creation: not implemented yet, does nothing and returns None."""
    return None

### Test moving datasets around within a container


In [44]:
# Before the move: display the array at its original location in the test container.
read('/home/bennettd/nrs_cosem/davis/test.n5/volumes/raw/subraw/ch1/')

<zarr.core.Array '/volumes/raw/subraw/ch1' (2784, 1000, 8750) uint8 read-only>

In [46]:
# Relocate the array from `volumes/raw/subraw/ch1` to `volumes/raw/ch1`; the cell displays the returned destination path.
move_dataset('/home/bennettd/nrs_cosem/davis/test.n5', 'volumes/raw/subraw/ch1', 'volumes/raw/ch1')

'/home/bennettd/nrs_cosem/davis/test.n5/volumes/raw/ch1'

In [48]:
# After the move: display the array at its new location to confirm it is still readable.
read('/home/bennettd/nrs_cosem/davis/test.n5/volumes/raw/ch1')

<zarr.core.Array '/volumes/raw/ch1' (2784, 1000, 8750) uint8 read-only>

In [None]:
# Move extant mask datasets from `volumes/mask` to `volumes/masks/foreground` 

In [41]:
# Collect every path matching `data/*/*nm.n5` under dm11_cosem; these are the
# containers the mask move below iterates over. The bare name displays the list.
containers = glob('/home/bennettd/dm11_cosem/data/*/*nm.n5')
containers

['/home/bennettd/dm11_cosem/data/Jurkat_Cell1_4x4x4nm/Jurkat_Cell1_FS96-Area1_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/Pancreas_Islets_4x4x4m/Pancreas_G36-2_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/U2OS_Cell4_8x8x8nm/Cryo_LoadID252_Cell4_8x8x8nm.n5',
 '/home/bennettd/dm11_cosem/data/HeLa_Cell2_4x4x4nm/HeLa_Cell2_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/U2OS_Cell6_8x8x8nm/Cryo_LoadID253_Cell6_8x8x8nm.n5',
 '/home/bennettd/dm11_cosem/data/Chlamydomonas_4x4x4nm/Chlamydomonas_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/HeLa_Cell3_4x4x4nm/HeLa_Cell3_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/TWalther_WT45_Cell2_4x4x4nm/Cryo_20171009_WT45_Cell2_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/HeLa_Cell25_8x8x8nm/HeLa_Cell25_flat_8x8x8nm.n5',
 '/home/bennettd/dm11_cosem/data/Macrophage_FS80_Cell2_4x4x4nm/Cryo_FS80_Cell2_4x4x4nm.n5',
 '/home/bennettd/dm11_cosem/data/HeLa_Cell21_8x8x8nm/HeLa_Cell21_8x8x8nm.n5',
 '/home/bennettd/dm11_cosem/data/HeLa_Cell1_8x8x8nm/HeLa_Cell1_D05-

In [50]:
# Move the mask dataset of every container listed above from `volumes/mask` to
# `volumes/masks/foreground`. A failure for one container is reported together
# with its cause, and the loop carries on with the next container.
for c in containers:
    try:
        move_dataset(c, 'volumes/mask', 'volumes/masks/foreground')
    except Exception as err:
        # Deliberately not a bare `except:` so that Ctrl-C / SystemExit still stop the loop.
        print(f'Did not work for {c}: {err!r}')

Did not work for /home/bennettd/dm11_cosem/data/Jurkat_Cell1_4x4x4nm/Jurkat_Cell1_FS96-Area1_4x4x4nm.n5
Did not work for /home/bennettd/dm11_cosem/data/Pancreas_Islets_4x4x4m/Pancreas_G36-2_4x4x4nm.n5
Did not work for /home/bennettd/dm11_cosem/data/Mouse_NA3-3_4x4x4nm/Mouse_NA3-3_4x4x4nm.n5


In [55]:
# Display the `volumes/masks/foreground` array of the Jurkat container, one of those reported as "Did not work" above.
read('/home/bennettd/dm11_cosem/data/Jurkat_Cell1_4x4x4nm/Jurkat_Cell1_FS96-Area1_4x4x4nm.n5/volumes/masks/foreground')

<zarr.core.Array '/volumes/masks/foreground' (4280, 1500, 5000) uint8 read-only>