# Create/update a Zarr Virtual Dataset from a collection of ERA5-Land NetCDF3 files on S3

In [None]:
import os
import fsspec
import ujson   # fast json
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
import zarr
import numpy as np

In [None]:
import warnings
warnings.filterwarnings("ignore")

#### Use AWS environment variables for credentials
Here we set the `AWS_PROFILE` environment variable to select credentials that give write access to the 'esip-qhub' bucket.

In [None]:
import os
os.environ['AWS_PROFILE'] = 'esip-qhub'

In [None]:
fs_read = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)

In [None]:
fs_write = fs_read

In [None]:
# S3 locations used throughout: the source NetCDF glob, the directory of
# per-file reference JSONs, and the combined reference file.
base_dir = 's3://esip-qhub/usgs/era5_land'
nc_files = base_dir + '/*.nc'
json_dir = base_dir + '/jsons/'
s3_ref_file = base_dir + '/archive.json'

#### Process all NetCDF files not found in the list of JSON files.  

In [None]:
nc_list = fs_read.glob(nc_files)
print(len(nc_list))

In [None]:
json_list = fs_read.glob(f'{json_dir}*.json')
print(len(json_list))

In [None]:
# fs_write.rm(json_list)

In [None]:
# Map each reference JSON back to the NetCDF path it was generated from:
# keep the text before '.json' and drop the '/jsons' path component.
nc_processed_list = []
for json_path in json_list:
    nc_path = json_path.split('.json')[0].replace('/jsons', '')
    nc_processed_list.append(nc_path)

In [None]:
nc_process_list = list(set(nc_list) - set(nc_processed_list))

In [None]:
print(len(nc_process_list))

#### Reprocess any NetCDF files that have been updated since we last wrote their JSON

In [None]:
# json_list and nc_processed_list are parallel lists, so walk them together.
# A NetCDF file modified after its JSON was written is queued again.
for json_path, nc_path in zip(json_list, nc_processed_list):
    json_modified = fs_read.info(json_path)['LastModified']
    nc_modified = fs_read.info(nc_path)['LastModified']
    if nc_modified > json_modified:
        nc_process_list.append(nc_path)

In [None]:
print(len(nc_process_list))

In [None]:
# Sorted, protocol-qualified URLs of the files that still need a JSON.
flist = sorted(f's3://{path}' for path in nc_process_list)

In [None]:
# Keyword options that gen_json passes to fs_read.open().
so = {
    'mode': 'rb',
    'anon': False,
    'profile': 'esip-qhub',
    'skip_instance_cache': True,
}

#### Create the individual JSON files directly on S3 

We passed AWS credentials to the Dask workers via environment variables above, and the dask workers don't have the AWS credentials file with profiles defined, so we don't define a profile here, we just set `anon=False` and let the workers find the credentials via the environment variables:

In [None]:
def gen_json(u):
    """Write a Kerchunk reference JSON for one source file.

    The file at ``u`` is opened through ``fs_read`` with the options in
    ``so``, scanned with ``SingleHdf5ToZarr`` (``inline_threshold=300``),
    and the resulting references are written through ``fs_write`` to
    ``{json_dir}{fname}.json``, where ``fname`` is the last path component
    of ``u``. The output path is printed.

    Parameters
    ----------
    u : str
        URL of the source file (e.g. an entry of ``flist``).

    Returns
    -------
    None
    """
    fname = u.split('/')[-1]
    outf = f'{json_dir}{fname}.json'
    with fs_read.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        print(outf)
        # Translate before the output file is opened. If translation fails
        # while a write-mode file is open, closing that file can still leave
        # an (empty) object at outf, which a later run would mistake for an
        # already-processed file.
        refs = h5chunks.translate()
    with fs_write.open(outf, 'wb') as f:
        f.write(ujson.dumps(refs).encode())

In [None]:
gen_json(flist[0])

In [None]:
u = flist[1]
p = u.split('/')
fname = p[-1]
outf = f'{json_dir}{fname}.json'
print(outf)

In [None]:
def configure_cluster(resource):
    """Start a Dask cluster on the named compute resource.

    Parameters
    ----------
    resource : str
        One of ``'denali'``, ``'tallgrass'``, ``'local'`` or
        ``'esip-qhub-gateway-v0.4'``.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the started cluster.

    Raises
    ------
    ValueError
        If ``resource`` is not one of the names listed above.
    """
    if resource == 'denali':
        # Imported here because the file does not import these names itself.
        from dask.distributed import Client, LocalCluster
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif resource == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
                               job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
                               memory='6GB')
        cluster.adapt(maximum_jobs=30)
        client = Client(cluster)

    elif resource == 'local':
        import os
        import warnings
        from dask.distributed import Client, LocalCluster
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count()  # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif resource in ['esip-qhub-gateway-v0.4']:
        import os
        import sys
        # ebdpy is loaded from the shared users' library directory under $HOME.
        sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
        import ebdpy as ebd
        aws_profile = 'esip-qhub'
        ebd.set_credentials(profile=aws_profile)  # sets credentials for notebook
        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 30
        client, cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                                 region=aws_region, use_existing_cluster=True,
                                                 adaptive_scaling=False, wait_for_cluster=False,
                                                 worker_profile='Small Worker', propagate_env=True)

    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            f"Unknown resource {resource!r}; expected one of 'denali', "
            "'tallgrass', 'local', 'esip-qhub-gateway-v0.4'"
        )

    return client, cluster

In [None]:
resource = 'esip-qhub-gateway-v0.4' #denali, tallgrass, local, esip-qhub-gateway-v0.4
client, cluster = configure_cluster(resource)

In [None]:
# _ = dask.compute(*[dask.delayed(gen_json)(f) for f in flist], retries=10);

In [None]:
cluster.scale(30)

In [None]:
%%time
import dask.bag as db

# Split the file list into 30 partitions, apply gen_json to every file and
# compute the bag. gen_json has no return value; the work is the JSON it writes.
b = db.from_sequence(flist, npartitions=30).map(gen_json)
results = b.compute()

In [None]:
# The JSONs were written by Dask workers in other processes, so drop any
# cached listing for json_dir first. Then match only '*.json' (the same
# pattern used to build json_list) so that stray objects under json_dir are
# not passed on to MultiZarrToZarr.
fs_write.invalidate_cache(json_dir)
jsons = sorted(f's3://{f}' for f in fs_write.glob(f'{json_dir}*.json'))
print(len(jsons))

In [None]:
# Combine the per-file references, concatenating along the 'time' dimension.
mzz = MultiZarrToZarr(
    jsons,
    remote_protocol='s3',
    remote_options={'anon': False},
    concat_dims=['time'],
)

In [None]:
%%time
d = mzz.translate()

In [None]:
%%time
fs = fsspec.filesystem("reference", fo=d, ref_storage_args={'skip_instance_cache':True},
                       remote_protocol='s3', remote_options={'anon':False})
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
ds

In [None]:
ds.data_vars

In [None]:
local_consolidated_json = 'era5_land.json'
mzz.translate(local_consolidated_json)

In [None]:
# Derive the destination from base_dir instead of repeating the bucket path,
# so the combined reference file stays next to the data it describes.
s3_json = f'{base_dir}/archive2.json'
_ = fs_write.upload(local_consolidated_json, s3_json)

In [None]:
fs_write.info(s3_json)

In [None]:
%%time
fs_s3 = fsspec.filesystem("reference", fo=s3_json, ref_storage_args={'skip_instance_cache':True},
                       remote_protocol='s3', remote_options={'anon':False})
m = fs_s3.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
ds

In [None]:
ds.sd.hvplot(x='longitude', y='latitude', cmap='turbo', rasterize=True, geo=True, tiles='ESRI')

In [None]:
import intake
url = 'era5_intake.yml'
cat = intake.open_catalog(url)
list(cat)

In [None]:
# Upload the local intake catalog next to the data, using base_dir rather
# than a second hard-coded copy of the bucket path.
fs_write.upload(url, f'{base_dir}/era5_intake.yml')

In [None]:
# Point at the uploaded catalog on S3 (same location, derived from base_dir).
url = f'{base_dir}/era5_intake.yml'

In [None]:
cat = intake.open_catalog(url)
list(cat)

In [None]:
cat['era5-land']

In [None]:
ds = cat['era5-land'].to_dask()

In [None]:
ds