# Create combined JSON for each year in parallel
Read all the individual ReferenceFileSystem JSON files and create one combined JSON file for each year of the dataset.

In [None]:
import fsspec
import xarray as xr
import hvplot.xarray
import metpy

In [None]:
import ujson   # fast json
from kerchunk.combine import MultiZarrToZarr
import kerchunk

In [None]:
# S3 prefix that holds the individual (per-file) reference JSONs to be combined.
json_dir = 's3://esip-qhub/noaa/nwm/grid1km/json'

For file systems whose files are changing, pass `skip_instance_cache=True`; otherwise you won't see the changed files.

In [None]:
# Authenticated (anon=False) S3 filesystem; skip_instance_cache=True requests a
# fresh filesystem instance instead of a cached one.
fs_json = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)

In [None]:
# Years to process: 1979 through 2020 inclusive (range() excludes the stop value).
year_list = range(1979,2021)

In [None]:
def year_combine(year):
    """Combine one year's reference JSONs into a single combined JSON on S3.

    Globs every ``{json_dir}/{year}*.json`` file through ``fs_json``, merges
    them with ``MultiZarrToZarr`` by concatenating along ``time``, and writes
    the translated references to ``combined_{year}.json``.

    Args:
        year: Year used both as the glob prefix for the input files and in
            the name of the combined output file.

    Returns:
        None. The combined references are written to S3 as a side effect.
    """
    # Prefix each glob result with the s3:// protocol before handing it to kerchunk.
    reference_urls = [
        f's3://{path}' for path in fs_json.glob(f'{json_dir}/{year}*.json')
    ]
    output_url = f's3://esip-qhub/noaa/nwm/grid1km/combined_{year}.json'

    combiner = MultiZarrToZarr(
        reference_urls,
        remote_protocol='s3',
        remote_options={'anon': True},
        concat_dims=['time'],
        identical_dims=['x', 'y', 'crs'],
        # Drop "reference_time" from every input before combining.
        preprocess=kerchunk.combine.drop('reference_time'),
    )
    references = combiner.translate()

    with fs_json.open(output_url, 'wb') as out_file:
        payload = ujson.dumps(references).encode()
        out_file.write(payload)

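Calling `translate()` on the `MultiZarrToZarr` object creates a dict of the combined references, which `year_combine` then writes out as JSON.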

In [None]:
import sys
# Shared library directory; appended to the module search path only once so
# re-running this cell does not add duplicate entries.
libDir = r'/shared/users/lib'
if libDir not in sys.path:
    sys.path.append(libDir)

In [None]:
# Activate logging
# import logging
# logging.basicConfig(level=logging.INFO, force=True)

In [None]:
import ebdpy as ebd
import os
# AWS profile name: exported through the environment and also passed to the
# cluster helper below.
os.environ['AWS_PROFILE'] = 'esip-qhub'
# Start a Dask cluster through the ebdpy helper and keep both returned handles.
# NOTE(review): the keyword meanings are defined by ebdpy, which is not visible
# here; presumably this reuses a running cluster when one exists and scales
# adaptively up to 20 workers. Confirm against ebdpy.
client,cluster = ebd.start_dask_cluster(
    profile=os.environ['AWS_PROFILE'],
    worker_max=20,
    region='us-west-2', 
    use_existing_cluster=True,
    adaptive_scaling=True, 
    wait_for_cluster=False, 
    propagate_env=True)

In [None]:
import dask.bag as db

In [None]:
# One partition per year, so each year is combined (and retried) as its own
# independent task instead of several years sharing a partition.
b = db.from_sequence(year_list, npartitions=len(year_list))

In [None]:
# Build the bag that applies year_combine to every year; the work itself is
# triggered by the compute() call in a later cell.
b1 = b.map(year_combine)

In [None]:
%%time
# Run all the per-year combine tasks and record a Dask performance report to
# "dask-report-whole.html". retries=10 is passed to compute() so failed tasks
# can be retried.
from dask.distributed import performance_report
with performance_report(filename="dask-report-whole.html"):
    b1.compute(retries=10)

#### Examine one of the combined kerchunked datasets

In [None]:
# Pick one year and build the path of its combined reference JSON
# (same naming pattern that year_combine writes).
year = 2005
combined_json = f's3://esip-qhub/noaa/nwm/grid1km/combined_{year}.json'

In [None]:
%%time
# Options used when reading the combined reference JSON itself.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
# Options used when reading the data the references point to (anonymous S3 access).
r_opts = {'anon':True}
# Open the combined JSON as an fsspec "reference" filesystem.
fs = fsspec.filesystem("reference", fo=combined_json, ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
# Expose the root of that filesystem as a mapping for the zarr engine.
m = fs.get_mapper("")
# chunks={} asks xarray for dask-backed arrays; consolidated=False tells the
# zarr backend not to look for consolidated metadata.
ds = xr.open_dataset(m, engine="zarr", chunks={}, backend_kwargs=dict(consolidated=False))

In [None]:
# Display the dataset summary (the notebook renders the cell's last expression).
ds

In [None]:
# Keep only three data variables plus 'crs'.
# NOTE(review): 'crs' is presumably retained so parse_cf() can find the grid mapping; confirm.
ds = ds[['ACCET', 'SNEQV', 'FSNO', 'crs']]

In [None]:
# Let MetPy parse the dataset's CF metadata; the result replaces ds.
ds  = ds.metpy.parse_cf()

In [None]:
# Cartopy CRS for the ACCET variable, taken from MetPy's accessor.
# NOTE(review): `crs` is not used by any other cell visible here.
crs = ds['ACCET'].metpy.cartopy_crs

In [None]:
%%time
# Select the time step at index 500 of ACCET and load it into memory.
da = ds.ACCET.isel(time=500).load()

In [None]:
# Quick static plot of the loaded time step.
da.plot()

In [None]:
# Sanity check: list the per-year combined reference files and show how many
# exist plus the first and last path. Each '?' in the pattern matches exactly
# one character, so 'combined_????.json' matches a four-character year.
json_list = fs_json.glob('s3://esip-qhub/noaa/nwm/grid1km/combined_????.json')
print(len(json_list))
if json_list:  # avoid IndexError when no combined files have been written yet
    print(json_list[0])
    print(json_list[-1])

In [None]:
# Interactive plot of the loaded time step: rasterized, 'turbo' colormap,
# drawn semi-transparent (alpha=0.7) over OSM map tiles.
da.hvplot(x='x', y='y', rasterize=True,  cmap='turbo', tiles='OSM', alpha=0.7)