# Explore cloud-optimized CONUS404 dataset
This dataset was created by extracting specified variables from a collection of wrf2d output files, rechunking to better facilitate data extraction for a variety of use cases, and adding CF conventions to allow easier analysis, visualization and data extraction using Xarray and Holoviz.

In [None]:
%xmode minimal
import fsspec
import xarray as xr
import hvplot.xarray
import intake
import os
import metpy
import cartopy.crs as ccrs
from dask.distributed import LocalCluster, Client

#### Open dataset from Intake Catalog
* Automatically select the on-prem dataset from /caldera if running on-prem (Denali/Tallgrass)
* Automatically select the cloud dataset on S3 if not running on-prem

To test whether we are on-prem, we check whether SLURM_CLUSTER_NAME is defined. If SLURM_CLUSTER_NAME is not defined, the user is either not on Denali/Tallgrass, or is on the main node, which they should not be on.

In [None]:
# Location of the HyTEST intake catalog (raw YAML on the `main` branch of the hytest GitHub repo).
url = 'https://raw.githubusercontent.com/hytest-org/hytest/main/dataset_catalog/hytest_intake_catalog.yml'

In [None]:
# Open the intake catalog from the URL and show the names of the entries it contains.
cat = intake.open_catalog(url)
list(cat)

#### Start a Dask client using an appropriate Dask Cluster
This is an optional step, but can speed up data loading significantly, especially when accessing data from the Cloud

In [None]:
def configure_cluster(machine):
    """Start a Dask cluster appropriate for ``machine`` and connect a client.

    Parameters
    ----------
    machine : str
        Name of the compute environment. Supported values are
        ``'denali'``, ``'tallgrass'``, ``'local'`` and
        ``'esip-qhub-gateway-v0.4'``.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the cluster that was started.

    Raises
    ------
    ValueError
        If ``machine`` is not one of the supported names.
    KeyError
        If an environment variable required by the selected branch
        (``SLURM_JOB_ACCOUNT`` on tallgrass, ``HOME`` on the gateway)
        is not set.
    """
    # Imports are kept inside each branch so that only the packages needed
    # for the selected environment have to be installed.
    if machine == 'denali':
        from dask.distributed import LocalCluster, Client
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif machine == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        import os

        # Charge the Dask worker jobs to the same account as the current job.
        project = os.environ['SLURM_JOB_ACCOUNT']

        # NOTE(review): newer dask-jobqueue releases appear to deprecate
        # `project`/`job_extra` in favour of `account`/`job_extra_directives`
        # (and expect a list of directives) -- confirm against the installed
        # version before changing these arguments.
        cluster = SLURMCluster(processes=1, cores=1,
            memory='8GB', interface='ib0',
            project=project, walltime='01:00:00',
            job_extra={'hint': 'multithread'},
            shared_temp_directory='/caldera/hytest_scratch/tmp')
        # Scale between 2 and 30 workers depending on load.
        cluster.adapt(minimum=2, maximum=30)
        client = Client(cluster)

    elif machine == 'local':
        import os
        import warnings
        from dask.distributed import LocalCluster, Client
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count() # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif machine in ['esip-qhub-gateway-v0.4']:
        import sys, os
        # `ebdpy` lives in a shared user library directory under $HOME.
        sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
        import ebdpy as ebd
        aws_profile = 'esip-qhub'
        ebd.set_credentials(profile=aws_profile)

        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 30
        client, cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                              region=aws_region, use_existing_cluster=True,
                                              adaptive_scaling=False, wait_for_cluster=False,
                                              worker_profile='Medium Worker', propagate_env=True)

    else:
        # Fail early with a clear message rather than hitting an
        # UnboundLocalError on the return statement below.
        raise ValueError(
            f"Unknown machine {machine!r}; expected one of 'denali', "
            "'tallgrass', 'local' or 'esip-qhub-gateway-v0.4'"
        )

    return client, cluster

In [None]:
# SLURM_CLUSTER_NAME is used as the marker for an on-prem session: when it is
# absent we use the cloud copy of the data and the gateway cluster, otherwise
# the on-prem copy and the cluster named by the variable.
if 'SLURM_CLUSTER_NAME' not in os.environ:
    dataset = 'conus404-hourly-cloud'
    machine = 'esip-qhub-gateway-v0.4'
else:
    dataset = 'conus404-hourly-onprem'
    machine = os.environ['SLURM_CLUSTER_NAME']

# Start the Dask cluster that matches the selected environment.
client, cluster = configure_cluster(machine)

In [None]:
# Display the catalog entry for the dataset name chosen for this environment.
cat[dataset]

In [None]:
# Open the catalog entry via intake's `to_dask()`; presumably this yields a
# lazily-loaded, Dask-backed xarray Dataset -- no variable data is read yet.
ds = cat[dataset].to_dask()

In [None]:
# Let MetPy parse the CF metadata so projection information is available
# through the `.metpy` accessor on each variable.
ds  = ds.metpy.parse_cf()

In [None]:
# Cartopy CRS of the dataset grid, taken from the T2 variable; used later to
# convert lon/lat into the dataset's x/y coordinates.
crs = ds['T2'].metpy.cartopy_crs

In [None]:
# Inspect the SNOW variable (displays its repr in the notebook).
ds.SNOW

#### Use Case 1:  Load the full domain at a specific time step

In [None]:
%%time
# Select SNOW over the whole domain at a single time step and load it into memory.
da = ds.SNOW.sel(time='2014-03-01 00:00').load()

In [None]:
# Plot the loaded field on its lon/lat coordinates as a rasterized,
# semi-transparent quadmesh over OSM map tiles.
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True, 
                             geo=True, tiles='OSM', alpha=0.7, cmap='turbo')

#### Use Case 2: Load the full time series at a specific grid cell

In [None]:
# Inspect the PREC_ACC_NC variable (displays its repr in the notebook).
ds.PREC_ACC_NC

In [None]:
# Point of interest in geographic coordinates (degrees).
lat,lon = 39.978322,-105.2772194
# Convert lon/lat to the dataset's projected x/y so the point can be matched
# against the dataset's x and y coordinates.
x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree())   # PlateCarree = Lat,Lon
print(x,y)

In [None]:
%%time
# Take the grid cell nearest to (x, y), restrict it to the requested 2013 time
# window, and load that time series into memory.
da = ds.PREC_ACC_NC.sel(x=x, y=y, method='nearest').sel(time=slice('2013-01-01 00:00','2013-12-31 00:00')).load()

In [None]:
# Plot the loaded time series against time, with grid lines.
da.hvplot(x='time', grid=True)

#### Stop cluster

In [None]:
# Shut down the Dask cluster started earlier so its workers are released.
# NOTE(review): the client is not explicitly closed here -- confirm whether
# that is needed for the cluster type in use.
cluster.shutdown()