This notebook is an example of how to combine and virtualize large datasets like PNNL. Kudos to Scott for figuring this one out!

In [6]:
import xarray as xr
import dask
from virtualizarr import open_virtual_dataset
import glob
import cartopy.crs as ccrs

In [2]:
# A convenience for files that have lots of attributes: keep the attrs
# section collapsed when xarray objects are displayed.
xr.set_options(display_expand_attrs=False);

In [3]:
from importlib.metadata import version

# Report the installed versions of the two libraries this notebook leans on,
# in the same order as before (virtualizarr first, then xarray).
for pkg in ('virtualizarr', 'xarray'):
    print(version(pkg))

1.3.1
2025.1.2


In [4]:
def virtualize_single_variable(variable_shortname, scenario='historical', save=False):
    """Create a combined virtual dataset for one variable of one scenario.

    Every NetCDF file under ``/data0/skagit_met/PNNL/{scenario}/`` whose name
    contains ``variable_shortname`` is opened as a virtual dataset, and the
    results are concatenated along ``time`` in sorted-path order.

    Parameters
    ----------
    variable_shortname : str
        Substring of the file name that identifies the variable (e.g. ``'T2'``).
    scenario : str, default 'historical'
        Name of the scenario sub-directory to search.
    save : bool, default False
        When true, also write the combined references to
        ``/data0/skagit_met/PNNL/{scenario}/{variable_shortname}.parquet``
        as a kerchunk parquet store.

    Returns
    -------
    xarray.Dataset
        The combined virtual dataset.

    Raises
    ------
    FileNotFoundError
        If no file matches the search pattern. Without this check an empty
        file list would flow into the combine step and, with ``save=True``,
        could be written out as an empty reference store.
    """
    print(f'Virtualizing {variable_shortname}... for Scenario {scenario}')
    # NOTE: glob is not called with recursive=True, so `**` behaves like `*`
    # and only files exactly one directory below the scenario directory match.
    pattern = f'/data0/skagit_met/PNNL/{scenario}/**/*{variable_shortname}*.nc'
    # Sorting the paths is what puts the files in concatenation order; this
    # relies on the file name structure.
    pnnl_files = sorted(glob.glob(pattern))
    if not pnnl_files:
        raise FileNotFoundError(f'No files found matching {pattern}')

    # x and y are dropped, and time is the only variable listed as loadable
    # (with time decoding switched on).
    vds_list = [
        open_virtual_dataset(
            filepath,
            indexes={},
            drop_variables=['x', 'y'],
            loadable_variables=['time'],
            decode_times=True,
        )
        for filepath in pnnl_files
    ]
    combined_vds = xr.combine_nested(
        vds_list,
        concat_dim=['time'],
        coords="minimal",
        compat="override",
        combine_attrs="drop_conflicts",
    )
    if save:
        combined_vds.virtualize.to_kerchunk(
            f'/data0/skagit_met/PNNL/{scenario}/{variable_shortname}.parquet',
            format='parquet',
        )

    return combined_vds

In [None]:
# Variables to virtualize, and the scenario directories to process.
all_vars = ['GLW', 'PREC_ACC_NC', 'T2', 'Q2', 'SWDOWN', 'U10', 'V10']
all_scenarios = [
    'historical',
    'HadGEM2_ES',
    'CanESM2',
    'CESM1_CAM5',
    'GFDL_ESM2M',
    'MPI_ESM_MR',
]
for scenario in all_scenarios:
    # One combined virtual dataset per variable, merged into a single
    # dataset and written as one kerchunk parquet store per scenario.
    all_vds = [
        virtualize_single_variable(variable, scenario=scenario)
        for variable in all_vars
    ]
    VDS = xr.merge(all_vds)
    VDS.virtualize.to_kerchunk(
        f'/data0/skagit_met/PNNL/{scenario}/PNNL_{scenario}.parquet',
        format='parquet',
    )
    # Drop the references to this scenario's datasets before the next pass.
    all_vds = VDS = None

In [None]:
# Grid for plotting
grid_path = '/data0/skagit_met/PNNL/historical/SERDP6km.geo_em.d01.nc'
ds_grid = xr.open_dataset(grid_path).squeeze() # Drop Time=0 scalar dimension
# Keep only the lon/lat arrays and the land mask (the mask is kept for viz)
dsg = ds_grid[['LANDMASK','CLONG','CLAT']]
# Promote CLONG/CLAT to coordinates and rename the grid dimensions to x and y
# to match the data files. This must be done only once: after the rename,
# south_north/west_east no longer exist, so repeating it raises a ValueError.
dsg = dsg.set_coords(("CLONG", "CLAT")).rename(dict(south_north='y', west_east='x'))

DS = xr.open_dataset('/data0/skagit_met/PNNL/historical/PNNL_historical.parquet', engine='kerchunk', mask_and_scale=False)
# Assign Grid Lon/Lats
DS = DS.assign_coords(dsg.coords) # Better!
# Also bring in land mask
DS.coords['LANDMASK'] = dsg.LANDMASK
DS

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Plot 
ax = plt.axes(projection=ccrs.PlateCarree())
DS.T2.isel(time=10).where(DS.LANDMASK).plot(ax=ax, x="CLONG", y="CLAT")