## ZARR COMPRESSION IN A MULTIPROCESSING ENVIRONMENT

"Parallel computing and synchronization"<br>
https://zarr.readthedocs.io/en/stable/tutorial.html#parallel-computing-and-synchronization

Query "multi-processing zarr" in Google for useful discussions

In [1]:
# Standard-library multiprocessing combined with zarr: a typical example that writes a zarr array and then reads it back from a multiprocessing pool.
# View discussion and source code here: https://github.com/zarr-developers/zarr-python/issues/199
# View block / hang discussion here: https://github.com/zarr-developers/numcodecs/issues/41
# View using Dask for multi-threading here: https://clouds.eos.ubc.ca/~phil/courses/parallel_python/03_dask_and_zarr.html

import zarr
import numpy as np
from pprint import pprint
import multiprocessing
from zarr import blosc
# Blosc configuration. Threaded mode is switched off on purpose: it must be
# disabled to prevent locking in the blosc context when worker processes are
# used (see the numcodecs issue linked at the top of this cell).
blosc.set_nthreads(20)
blosc.use_threads = False

# File-based, process-level synchronizer shared by everything opened below.
# Synchronized zarr API: https://zarr.readthedocs.io/en/stable/api/sync.html
synchronizer = zarr.ProcessSynchronizer('example.sync')
processed_zarr = zarr.hierarchy.open_group(
    "test.zarr",
    mode='a',
    synchronizer=synchronizer,
)

# Random demo payload, stored (replacing any earlier copy) as "features_arr".
features_arr = np.random.random_sample(size=(10000, 20))
processed_zarr.create_dataset(
    "features_arr",
    data=features_arr,
    shape=features_arr.shape,
    dtype="float64",
    overwrite=True,
)

# Partition the rows of "features_arr" into contiguous half-open ranges
# [min, max), one work item per range.
n_rows = processed_zarr["features_arr"].shape[0]
ixs = np.arange(n_rows)  # kept for any later code that still refers to it

# 100 boundaries -> 99 slices. The last boundary must be n_rows (not
# n_rows - 1): each slice is read as [min:max], which excludes `max`, so
# stopping at n_rows - 1 silently dropped the final row.
slices = np.linspace(0, n_rows, 100, dtype=np.int32)

# Boundaries are already row indices, so no lookup through `ixs` is needed.
# Plain ints keep the pprint output and the payload pickled for each worker
# independent of numpy's scalar types.
sliceIter = [
    {"min": int(lo), "max": int(hi), "slice_num": i}
    for i, (lo, hi) in enumerate(zip(slices[:-1], slices[1:]))
]
pprint(sliceIter)

### `slices` breaks the row indices of processed_zarr["features_arr"] into contiguous ranges whose boundaries come from np.linspace (100 boundaries, i.e. 99 slices)
#
def mem_instantiate(param_dict):
    """Read one row range of ``features_arr`` into memory and report it.

    ``param_dict`` must provide the keys ``"min"``, ``"max"`` and
    ``"slice_num"``. Rows ``[min:max]`` are read from the module-level
    ``processed_zarr`` group; the data is discarded and nothing is returned.
    """
    row_range = slice(param_dict["min"], param_dict["max"])
    tag = param_dict["slice_num"]

    # Original author's note: execution never gets past loading the features.
    loaded_rows = processed_zarr["features_arr"][row_range]
    print(tag, "features loaded")


# Fan the slice descriptors out over five worker processes.
pool = multiprocessing.Pool(processes=5)
try:
    # map() blocks until every slice has been processed (or a worker raises).
    pool.map(mem_instantiate, sliceIter)
finally:
    # Always release the worker processes, even when map() raised; without
    # this an exception would skip close()/join() and leave workers behind.
    pool.close()
    pool.join()

[{'max': 101, 'min': 0, 'slice_num': 0},
 {'max': 202, 'min': 101, 'slice_num': 1},
 {'max': 303, 'min': 202, 'slice_num': 2},
 {'max': 404, 'min': 303, 'slice_num': 3},
 {'max': 505, 'min': 404, 'slice_num': 4},
 {'max': 606, 'min': 505, 'slice_num': 5},
 {'max': 707, 'min': 606, 'slice_num': 6},
 {'max': 808, 'min': 707, 'slice_num': 7},
 {'max': 909, 'min': 808, 'slice_num': 8},
 {'max': 1010, 'min': 909, 'slice_num': 9},
 {'max': 1111, 'min': 1010, 'slice_num': 10},
 {'max': 1212, 'min': 1111, 'slice_num': 11},
 {'max': 1313, 'min': 1212, 'slice_num': 12},
 {'max': 1414, 'min': 1313, 'slice_num': 13},
 {'max': 1515, 'min': 1414, 'slice_num': 14},
 {'max': 1616, 'min': 1515, 'slice_num': 15},
 {'max': 1717, 'min': 1616, 'slice_num': 16},
 {'max': 1818, 'min': 1717, 'slice_num': 17},
 {'max': 1919, 'min': 1818, 'slice_num': 18},
 {'max': 2020, 'min': 1919, 'slice_num': 19},
 {'max': 2121, 'min': 2020, 'slice_num': 20},
 {'max': 2222, 'min': 2121, 'slice_num': 21},
 {'max': 2323, 'min