# Create kerchunk file for each NetCDF
We want to create a kerchunk JSON sidecar file for each of 100,000+ NetCDF files.

In [None]:
import fsspec
import xarray as xr
import hvplot.xarray
import pandas as pd

In [None]:
import ujson   # fast json
import kerchunk
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
from pathlib import Path

#### System Setup

In [None]:
# Make the shared helper library importable, turn on logging, and report
# the CPU count and memory of the machine running this notebook.
import logging
import os
import sys

import psutil

# Include HyTest helpers...
libDir = r'/shared/users/lib'
if libDir not in sys.path:
    sys.path.append(libDir)

# Activate logging; force=True replaces any handlers that are already
# attached to the root logger.
logging.basicConfig(level=logging.INFO, force=True)

print(f"CPUS: {os.cpu_count()}")
svmem = psutil.virtual_memory()
print(f"Total Virtual Memory: {svmem.total/(1024*1024*1024):.2f} Gb")

#### Spin up Dask Cluster
Each file can be kerchunked independently, so this operation can run in parallel.  Spin up a Dask cluster on 
the cloud hardware to schedule the workers.  Note that this cluster
is configured with a specific user **profile** that has permission to write to
our eventual output location. 

In [None]:
# Start (or attach to) the Dask cluster used for the parallel run below.
import ebdpy as ebd
# Select the AWS credentials profile; the same value is handed to the
# cluster helper via `profile=`.
os.environ['AWS_PROFILE'] = 'esip-qhub'
# NOTE(review): the meaning of each keyword is inferred from its name --
# check the ebdpy helper for the exact semantics.
client,cluster = ebd.start_dask_cluster(
    profile=os.environ['AWS_PROFILE'],
    worker_max=60,
    region='us-west-2', 
    use_existing_cluster=True,
    adaptive_scaling=True, 
    wait_for_cluster=False, 
    propagate_env=True)

#### Construct list of files to kerchunk

In [None]:
# Source filesystem for the NetCDF files, accessed anonymously (no credentials).
fs_nc = fsspec.filesystem('s3', anon=True)

In [None]:
# List the top level of the NWM retrospective bucket and display it.
flist = fs_nc.ls('s3://noaa-nwm-retrospective-2-1-pds/')
flist

In [None]:
# List the entries directly under model_output/ and show the first and last.
# Presumably there is one sub-directory per year -- TODO confirm.
flist = fs_nc.glob('noaa-nwm-retrospective-2-1-pds/model_output/*')
print(flist[0])
print(flist[-1])

In [None]:
# Root prefix (bucket/key, without the s3:// scheme) of the input files.
nc_dir = 'noaa-nwm-retrospective-2-1-pds/model_output'

In [None]:
# Glob pattern for the directory level below nc_dir when listing files;
# '*' matches every sub-directory.
year = '*'

Create the file list.  Listing the files takes about 5 minutes, so we first check whether a
previous run already stored the list in a .csv file. 

In [None]:
# Build the list of input files. The bucket is only globbed when no cached
# listing exists; the fresh listing is then saved for later runs.
if not Path('flist.csv').is_file():
    flist = fs_nc.glob(f'{nc_dir}/{year}/*LDAS*')
    df = pd.Series(flist)
    df.to_csv('flist.csv')
else:
    df = pd.read_csv('flist.csv')
    # Column 0 is the index that Series.to_csv wrote; the paths are in column 1.
    flist = list(df.iloc[:, 1])

print(f'{len(flist)} files')
# Show the first and last entries, one per line.
print(flist[0], flist[-1], sep='\n')

#### Define the filesystem for where the json files will be stored.   
For file systems where files are changing, set `skip_instance_cache=True`; otherwise you won't see the changed files.

In [None]:
# Destination filesystem for the JSON sidecars: authenticated access, and a
# fresh filesystem instance rather than a cached one.
fs_json = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)  

In [None]:
# Output location where one JSON sidecar per input file is written.
json_dir = 's3://esip-qhub/noaa/nwm/grid1km/json'

In [None]:
# List the JSON sidecars already present in the output location.
# refresh=True asks for a fresh listing instead of a cached one.
jlist = fs_json.ls(json_dir, refresh=True)
print(len(jlist))
# On a first run the location may hold no sidecars yet, so only show the
# first and last entries when the listing is non-empty.
if jlist:
    print(jlist[0])
    print(jlist[-1])

In [None]:
# Show how Path splits an object key: .stem drops the final suffix,
# .name keeps it. The stem is what is compared between inputs and sidecars.
p = Path(flist[0])
print(p.stem)
print(p.name)

#### Construct list of nc files that haven't already been kerchunked

In [None]:
# File stems (name without directory or final suffix) of all input files.
nc_files = [Path(f).stem for f in flist]

In [None]:
# File stems of the JSON sidecars that already exist in the output location.
json_files = [Path(f).stem for f in jlist]

In [None]:
# Stems of input files that have no sidecar yet. The set difference means
# the resulting order is arbitrary.
nc_process_files = list(set(nc_files) - set(json_files))
print(len(nc_process_files))

In [None]:
# Rebuild the full object path for every file that still needs a sidecar.
# The first four characters of each stem are used as the year directory,
# and the '.comp' extension is re-appended (assumed to be the suffix that
# Path(...).stem stripped -- TODO confirm for all listed files).
# A comprehension is used on purpose: a plain `for` loop assigning to `year`
# would overwrite the notebook-level `year` glob pattern used when listing
# the input files.
nc_process_list = [f'{nc_dir}/{f[0:4]}/{f}.comp' for f in nc_process_files]

#### Kerchunk each file in the list

In [None]:
def gen_json(u):
    """Write a kerchunk reference JSON for a single input file.

    Opens ``u`` on ``fs_nc`` using the notebook-level open options ``so``,
    scans it with ``SingleHdf5ToZarr`` and writes the resulting references
    to ``{json_dir}/<stem of u>.json`` on ``fs_json``.

    Parameters
    ----------
    u : str
        Path of the input file as understood by ``fs_nc``. It is also passed
        to ``SingleHdf5ToZarr`` as the URL of the data file.

    Returns
    -------
    None
        Called for its side effect (the JSON file); the output path is
        printed.
    """
    outf = f'{json_dir}/{Path(u).stem}.json'
    with fs_nc.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        print(outf)
        # Build and serialize the references while the input is still open,
        # and *before* the output object is opened. If translate() fails,
        # nothing is written, so no empty sidecar is left behind to be
        # mistaken for a finished file on the next run.
        refs = ujson.dumps(h5chunks.translate()).encode()
    with fs_json.open(outf, 'wb') as f:
        f.write(refs)

In [None]:
# Open options that gen_json() unpacks into fs_nc.open(); this must be
# defined before gen_json() is called.
so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')

Try kerchunking one file from the list

In [None]:
%%time
# Smoke test: kerchunk a single file before launching the full parallel run.
gen_json(nc_process_list[0])

In [None]:
# Open one sidecar JSON through fsspec's "reference" filesystem and read it
# with xarray's zarr engine, to check that a sidecar is usable.
# NOTE(review): this opens jlist[0], an entry from the listing taken before
# gen_json() ran above -- not necessarily the file that was just written.
# Confirm that is intended.
# s_opts is passed as the options for reading the reference JSON itself;
# r_opts is passed as the options for reading the referenced remote data.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'anon':True}
fs = fsspec.filesystem("reference", fo=f's3://{jlist[0]}', ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", chunks={}, backend_kwargs=dict(consolidated=False))

In [None]:
# Display the dataset opened from the reference file.
ds

In [None]:
# Keep only three of the dataset's variables.
ds = ds[['ACCET', 'SNEQV', 'FSNO']]

In [None]:
# Display the reduced dataset.
ds

#### Parallel creation of individual JSONs for each file using Dask Bag

In [None]:
import dask.bag as db

In [None]:
# Wrap the list of paths still to process in a Dask bag of 120 partitions.
b = db.from_sequence(nc_process_list, npartitions=120)

In [None]:
# Lazily apply gen_json to every path; nothing runs until compute() is called.
b1 = b.map(gen_json)

In [None]:
%%time
from dask.distributed import performance_report
# Run the whole bag on the cluster and save a Dask performance report to
# dask-report-whole.html. retries=10 asks the scheduler to retry failed tasks.
with performance_report(filename="dask-report-whole.html"):
    b1.compute(retries=10)