## Step 1: First trying to load an xarray via kerchunked zarr refs of a fake nwb


### create fake nwb file

In [1]:
import h5py
import os
import numpy as np
import kerchunk
import kerchunk.hdf
import fsspec
import ujson
import xarray as xr
import warnings
import json

In [2]:
probe_id = "12345"
filename = f"probe_{probe_id}_lfp.nwb"
data_group = 'data'
ref_file = "lfp_one_probe_ref_rand.json"
ref_filepath = os.getcwd() + '/' + ref_file

In [None]:
with h5py.File(filename, 'w') as f:
    probe_group = f.create_group(data_group)
    
    data_shape = (100, 10)
    data = np.random.random(data_shape)
    probe_data = probe_group.create_dataset("lfp", data_shape, dtype='f', data=data, chunks=(10, 2))
    probe_data.attrs['unit'] = 'volts'

print(f"File {filename} created.")

### create kerchunk/zarr/json ref file

In [5]:
with fsspec.open(filename) as f:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(f, filename)
    refs = h5chunks.translate()

with open(ref_file, "wb") as f:
    f.write(ujson.dumps(refs).encode())

### access the data into xarray with fsspecc mapping of ref file

In [4]:
fs = fsspec.filesystem("reference", fo=ref_filepath)
m = fs.get_mapper()
ds = xr.open_dataset(
    m,
    group=data_group,
    engine="zarr",
    backend_kwargs={"consolidated": False, "mask_and_scale": False}
)

ds.lfp.values[:5,:5]

array([[0.5514354 , 0.8056615 , 0.8522118 , 0.86789536, 0.1691171 ],
       [0.50029904, 0.5817629 , 0.643184  , 0.24592522, 0.21853325],
       [0.706747  , 0.95863044, 0.65548944, 0.8093027 , 0.5735166 ],
       [0.7286443 , 0.07899158, 0.13377485, 0.60330456, 0.7686506 ],
       [0.7145909 , 0.83703053, 0.9424319 , 0.03695546, 0.562787  ]],
      dtype=float32)

ok that works.. next trying on the real data but using similar approach as above

## Step 2: Now load the real .nwb file as is

- note.. xarray needs an `_ARRAY_DIMENSIONS` attribute for each dataset in order to read it from zarr
- I'm trying to just add this `_ARRAY_DIMENSIONS` attribute right into the lfp dataset reference.
- This is slightly different than Ian's approach of creating a zarr construct with `_ARRAY_DIMENSIONS` attribute and then fitting the lfp refs into that.. 

In [3]:
def set_array_dimensions(refs, data_path, dimensions):
    ref_path = data_path + "/.zarray"
    if ref_path in refs['refs']:
        ref_str = refs['refs'][ref_path]
        ref_dict = json.loads(ref_str)  # Deserialize the JSON string
        print(ref_path)
        ref_dict['_ARRAY_DIMENSIONS'] = dimensions
        ref_str = json.dumps(ref_dict)  # Serialize back to a JSON string
        refs['refs'][ref_path] = ref_str  # Replace the reference in refs
        print(ref_dict)  # Original reference

In [4]:
data_dir='~/data/allen/'
data_dir = os.path.expanduser(data_dir)
probe_id = "810755797"
nwb_filepath = os.path.join(data_dir, f"probe_{probe_id}_lfp.nwb")
nwb_ref_filepath = os.path.join(data_dir, "lfp_one_probe_ref.json")
lfp_group_path = f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data"
lfp_data_path = lfp_group_path + "/data"
electrodes_data_path = lfp_group_path + "/electrodes"
time_data_path = lfp_group_path + "/timestamps"

with fsspec.open(nwb_filepath) as f:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        h5chunks = kerchunk.hdf.SingleHdf5ToZarr(f, nwb_filepath)
        refs = h5chunks.translate()

# Adding _ARRAY_DIMENSIONS
set_array_dimensions(refs, lfp_data_path, ["time", "channel"])
set_array_dimensions(refs, electrodes_data_path, ["channel"])
set_array_dimensions(refs, time_data_path, ["time"])


with open(nwb_ref_filepath, "wb") as f:
    f.write(ujson.dumps(refs).encode())

fs = fsspec.filesystem("reference", fo=nwb_ref_filepath)
ds = xr.open_dataset(
    fs.get_mapper(),
    engine="zarr",
    group=lfp_group_path, # i think setting the group is one aspect that Ian's approach was missing.. 
    backend_kwargs={"consolidated": False, "mask_and_scale": False}
)

data = ds.lfp[:100, :4]
print("==> data", data.compute())
mean = data.mean().compute()
print("==> mean", mean)

acquisition/probe_810755797_lfp/probe_810755797_lfp_data/data/.zarray
{'chunks': [41859, 1], 'compressor': {'id': 'zlib', 'level': 9}, 'dtype': '<f4', 'fill_value': 0.0, 'filters': None, 'order': 'C', 'shape': [10715666, 93], 'zarr_format': 2, '_ARRAY_DIMENSIONS': ['time', 'channel']}
acquisition/probe_810755797_lfp/probe_810755797_lfp_data/electrodes/.zarray
{'chunks': [93], 'compressor': None, 'dtype': '<i8', 'fill_value': 0, 'filters': None, 'order': 'C', 'shape': [93], 'zarr_format': 2, '_ARRAY_DIMENSIONS': ['channel']}
acquisition/probe_810755797_lfp/probe_810755797_lfp_data/timestamps/.zarray
{'chunks': [10465], 'compressor': {'id': 'zlib', 'level': 9}, 'dtype': '<f8', 'fill_value': 0.0, 'filters': None, 'order': 'C', 'shape': [10715666], 'zarr_format': 2, '_ARRAY_DIMENSIONS': ['time']}


ValueError: conflicting sizes for dimension 'phony_dim_0': length 93 on 'electrodes' and length 10715666 on {'phony_dim_0': 'data', 'phony_dim_1': 'data'}

`RuntimeError: error during blosc decompression: -1`.. maybe the underlying data is not blosc..?

In [11]:
with h5py.File(nwb_filepath, 'r') as f:
    dataset = f[lfp_data_path]
    
    print("Compression method:", dataset.compression)
    print("Compression options:", dataset.compression_opts)

Compression method: gzip
Compression options: 9


I'm not sure how to deal with this information. would it go into the kerchunk references? or xarray reading?

Now I'm getting: `ValueError: conflicting sizes for dimension 'phony_dim_0': length 93 on 'electrodes' and length 10715666 on {'phony_dim_0': 'data', 'phony_dim_1': 'data'}`... which I don't undertstand because I'm setting the dims with the `_Array_Dimensions` field

## Step 1: KERCHUNK! Create Zarr Directory with References 
- (create_zarr_lfp_one_probe_ref.py)
- Import the required libraries and set up the configuration for creating a Zarr directory containing references to locally-stored probe NWB files.
- get input filename, sizes
- create a skeleton Zarr mapping of datasets with lfp, time, and channel. populate the time array and sim the channel.
- get references from kerchunked NWB and add the lfp references to the 
- write the references to a JSON file
- Add references to lfp data

In [1]:
import fsspec
import h5py
import kerchunk.hdf
import kerchunk.utils
import math
import numpy as np
import os
import ujson
import warnings
import zarr

# Configuration
data_dir='~/data/allen/'
data_dir = os.path.expanduser(data_dir)
probe_id = "810755797"

input_directory = os.path.join(data_dir)
output_json_file = os.path.join(data_dir, "lfp_one_probe_ref.json")


def get_input_filename(probe_id):
    return os.path.join(input_directory, f"probe_{probe_id}_lfp.nwb")


def get_sizes():
    chunk_size = dict(probe=1)

    f = h5py.File(get_input_filename(probe_id), "r")
    lfp = f[f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data/data"]
    ntime, nchannel = lfp.shape
    print(ntime, nchannel)

    lfp_dtype = lfp.dtype

    time = f[f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data/timestamps"]
    time_dtype = time.dtype

    chunk_size = dict(time=lfp.chunks[0], channel=lfp.chunks[1])

    f.close()

    print("ntime", ntime, "nchannel", nchannel)
    print("Chunk sizes", chunk_size)
    print("Number of time chunks", math.ceil(ntime / chunk_size["time"]))

    return ntime, nchannel, lfp_dtype, time_dtype, chunk_size

In [2]:
ntime, nchannel, lfp_dtype, time_dtype, chunk_size = get_sizes()

10715666 93
ntime 10715666 nchannel 93
Chunk sizes {'time': 41859, 'channel': 1}
Number of time chunks 256


In [4]:
def create_zarr_file(ntime, nchannel, lfp_dtype, time_dtype, chunk_size):
    shape = (ntime, nchannel)
    refs = {}

    root = zarr.open_group(refs, mode="w")
    
    # LFP data
    #### Probably needs to know compression of underlying data?????????
    ## DR: compression of underlying data is gzip level 9.. but I don't know how to use this info
    lfp_data = root.create_dataset(
        name="lfp",
        shape=shape,
        synchronizer=zarr.ThreadSynchronizer(),
        chunks=(chunk_size["time"], chunk_size["channel"]),
        dtype=lfp_dtype,
    )
    lfp_data.attrs["_ARRAY_DIMENSIONS"] = ["time", "channel"]
    
    # Time coordinates, filled in later.
    time_data = root.create_dataset(
        name="time",
        chunks=chunk_size["time"],  # Do I want this chunked or not?
        shape=ntime,
        dtype=time_dtype,
    )
    time_data.attrs["_ARRAY_DIMENSIONS"] = ["time"]

    # Channel coordinates are integers starting at zero.
    channel = np.arange(nchannel, dtype=np.uint32)
    coord = root.create_dataset(
        name="channel",
        shape=nchannel,
        dtype=channel.dtype,
    )
    coord[:] = channel
    coord.attrs["_ARRAY_DIMENSIONS"] = ["channel"]

    return time_data, refs


def get_kerchunk_refs(probe_id, refs):
    filename = get_input_filename(probe_id)
    so = dict(anon=True, default_fill_cache=False, default_cache_type='first')

    # Getting all the chunk references from the NWB file here, then filtering them.
    # Can I just read the ones I am interested in?
    with fsspec.open(filename, **so) as f:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            h5chunks = kerchunk.hdf.SingleHdf5ToZarr(f, filename)
            probe_refs = h5chunks.translate()["refs"]

    # Filter refs to only contain what we want
    # Don't know if need .zarray and .zattrs or not.
    match = f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data/data/"
    for ref in list(probe_refs):
        if not ref.startswith(match):
            probe_refs.pop(ref)

    len_match = len(match)
    for k, v in probe_refs.items():
        suffix = k[len_match:]
        # Don't think .zarray and .zattrs are needed as the create_dataset
        # specifies the dtype, shape, etc
        if suffix[0] == ".":
            continue
        # before, this was refs[f"lfo... typo?
        refs[f"lfp/{suffix}"] = v

        if suffix == "20.2":
            print("==> CHECK k", k)
            print("==>       v", v)
            print("==>       name", f"lfp/{suffix}")


def load_and_store(time2d_data, refs):
    input_filename = get_input_filename(probe_id)
    f = h5py.File(input_filename, "r")

    print("Writing time for probe", probe_id)
    time = f[f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data/timestamps"]
    time2d_data[:] = time[:]

    f.close()

    # LFP data kept in original files, referenced chunkwise from kerchunk-created JSON file
    get_kerchunk_refs(probe_id, refs)

    refs = kerchunk.utils.consolidate(refs)
    return refs

time2d_data, refs = create_zarr_file(ntime, nchannel, lfp_dtype, time_dtype, chunk_size)

refs = load_and_store(time2d_data, refs)

print(f"Writing {output_json_file}")
with open(output_json_file, "w") as f:
    ujson.dump(refs, f)

Writing time for probe 810755797
==> CHECK k acquisition/probe_810755797_lfp/probe_810755797_lfp_data/data/20.2
==>       v ['/Users/droumis/data/allen/probe_810755797_lfp.nwb', 163354739, 84801]
==>       name lfp/20.2
Writing /Users/droumis/data/allen/lfp_one_probe_ref.json


## Step 2: KERCHUNK! Data Access and Calculations
- (test_zarr_lfp_one_probe_ref.py)
- Open the JSON/Zarr file created in the previous step and print the dataset's structure.
- Try some data access and calculations on the LFP data read from the JSON/Zarr file.


I'm stuck at: "RuntimeError: error during blosc decompression: -1".. Maybe we do need to use the compression of underlying data in some way (gzip level 9).. I need Ian's help

In [5]:
import fsspec
import os
import xarray as xr

input_json = output_json_file
# input_json = os.path.join(directory, "lfp_one_probe_ref.json")

# Disable mask_and_scale otherwise dtypes are converted to floats.
fs = fsspec.filesystem("reference", fo=input_json)
ds = xr.open_dataset(
    fs.get_mapper(""),
    engine="zarr",
    group="",
    backend_kwargs={"consolidated": False, "mask_and_scale": False}
)
print(ds)


# Try some data access and calculations
data = ds.lfp[:100, 0]
print("==> data", data.compute())
mean = data.mean().compute()
print("==> mean", mean)


<xarray.Dataset>
Dimensions:  (channel: 93, time: 10715666)
Coordinates:
  * channel  (channel) uint32 0 1 2 3 4 5 6 7 8 9 ... 84 85 86 87 88 89 90 91 92
  * time     (time) float64 28.82 28.82 28.83 ... 9.616e+03 9.616e+03 9.616e+03
Data variables:
    lfp      (time, channel) float32 ...


RuntimeError: error during blosc decompression: -1

# Scratch

In [None]:
# import h5py
# import numpy as np

# probe_id = "12345"
# filename = f"probe_{probe_id}_lfp.nwb"
# # data_group = f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data"
# data_group = 'lfp'

# # Create a new HDF5 file
# with h5py.File(filename, 'w') as f:
#     # Create a group structure
#     probe_group = f.create_group(data_group)
    
#     # Create a dataset with random data
#     data_shape = (100, 10)
#     data = np.random.random(data_shape)
#     probe_data = probe_group.create_dataset("data", data_shape, dtype='f', data=data, chunks=(10, 2))
    
#     # Add attributes (Optional)
#     probe_data.attrs['unit'] = 'volts'

# print(f"File {filename} created.")


# import kerchunk.hdf
# import fsspec
# import ujson

# # so = dict(anon=True, default_fill_cache=False, default_cache_type='first')

# # Open the HDF5 file and translate the chunk information
# # with fsspec.open(filename, **so) as f:
# with fsspec.open(filename) as f:
#     h5chunks = kerchunk.hdf.SingleHdf5ToZarr(f, filename)
#     refs = h5chunks.translate()#["refs"] #translates content of the HDF5 file into the Zarr format

# # Consolidate the references
# # refs = kerchunk.utils.consolidate(refs)

# # Print the references
# # print(refs)

# Save to a JSON file
# output_json_file = "lfp_one_probe_ref_rand.json"
# with open(output_json_file, "wb") as f:
#     # ujson.dump(refs, f)
#     f.write(ujson.dumps(refs).encode())

# with open(output_json_file, "r") as f:
#     content = f.read()
#     print(content)

# with fsspec.open(url, **so) as inf:
#     h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url, inline_threshold=100)
#     h5chunks.translate()
#     with open("single_file_kerchunk.json", "wb") as f:
#         f.write(ujson.dumps(h5chunks.translate()).encode()

# # import os

# # data_dir='~/data/allen/'
# json_filepath = '/Users/droumis/src/neuro/workflows/ephys-viewer/dev/' + output_json_file
# # data_dir = os.path.expanduser(data_dir)
# # output_json_file = os.path.join(data_dir, output_json_file)
# json_filepath

# import fsspec
# import xarray as xr

# # data_group = f"acquisition/probe_{probe_id}_lfp/probe_{probe_id}_lfp_data"
# # ref_data_group = 'refs/refs/' + data_group
# # Disable mask_and_scale otherwise dtypes are converted to floats.
# fs = fsspec.filesystem("reference", fo=json_filepath)
# m = fs.get_mapper()
# ds = xr.open_dataset(
#     m,
#     group='lfp',
#     engine="zarr",
#     backend_kwargs={"consolidated": False, "mask_and_scale": False}
# )

# print(ds)

# # Try some data access and calculations
# data = ds.data[:100, 0]  # Update this line according to the actual structure of your data
# print("==> data", data.compute())
# mean = data.mean().compute()
# print("==> mean", mean)