# Generate Ref Filesystem JSONs for Binary Parsing

In [6]:
"""Generate kerchunk reference files following the reference-file specification in the kerchunk docs.

    goal (example of the reference JSON to produce):
    {
    ".zgroup": "{\n    \"zarr_format\": 2\n}",
    ".zattrs": "{\n    \"Conventions\": \"UGRID-0.9.0\"\n}",
    "x/.zattrs": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"node\"\n ...",
    "x/.zarray": "{\n    \"chunks\": [\n        9228245\n    ],\n    \"compressor\": null,\n    \"dtype\": \"<f8\",\n  ...",
    "x/0": ["s3://bucket/path/file.nc", 294094376, 73825960]
    }

"""

import fsspec
import fsspec.utils
import numpy as np
# import dask.array as da
# import pytest
import xarray as xr
import zarr
import os
import ujson
import s3fs

import kerchunk.combine
from kerchunk.zarr import single_zarr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from pathlib import Path

In [8]:
# Every NetCDF file needs a metadata (reference) file that zarr can read; see
# https://medium.com/pangeo/cloud-performant-netcdf4-hdf5-with-zarr-fsspec-and-intake-3d3a3e7cb935

# The references are generated in code from files listed on S3.
# Anonymous S3 filesystem used to manage the ERA5 files.
fs = fsspec.filesystem('s3', anon=True)
# Keep only the first two matches of the glob.
flist = fs.glob('s3://era5-pds/2020/*/data/air_pressure_at_mean_sea_level.nc')[:2]

# Local filesystem that the final JSONs are saved to.
fs2 = fsspec.filesystem('')

# Keyword arguments forwarded to fs.open().
# default_fill_cache=False avoids caching data in between file chunks,
# which lowers memory usage.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

def gen_json(file_url):
    """Write a kerchunk reference JSON for one NetCDF4/HDF5 file.

    The file is opened through the module-level ``fs`` with the ``so`` open
    options, scanned with ``SingleHdf5ToZarr``, and the resulting references
    are saved through ``fs2`` as ``{month}_{variable}.json``.

    Parameters
    ----------
    file_url : str
        Path laid out as ``.../<month>/data/<variable>.nc``, with or without
        a leading protocol such as ``s3://``.
    """
    # Index from the END of the path so the result is the same whether or not
    # the URL carries a protocol prefix (fs.glob() strips it, callers may not).
    parts = file_url.split('/')
    variable = parts[-1].split('.')[0]
    month = parts[-3]
    outf = f'{month}_{variable}.json'  # file name to save json to

    with fs.open(file_url, **so) as infile:
        # inline_threshold is the size below which binary blocks are included
        # directly in the output; a higher threshold can result in a larger
        # json file but faster loading time.
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)
        # Build the references before touching the output file, so a failed
        # scan does not leave an empty or truncated JSON behind.
        refs = h5chunks.translate()

    with fs2.open(outf, 'wb') as f:
        f.write(ujson.dumps(refs).encode())


# Generate one reference JSON per file selected in flist.
for file in flist:
    gen_json(file)

In [None]:
# see generated JSONS!