# Programmatically creating `intake` catalogs

Author: Andrew Laws

This notebook was used to create a catalog of intermediate datasets used in the CONUS404 tutorial found in `evaluation/tutorials/CONUS404` that are saved to an Open Storage Network (OSN) pod. It is based on the [Project Pythia `intake` Cookbook](https://github.com/ProjectPythia/intake-cookbook).

Library imports

**Requires intake-xarray and intake-parquet to be installed in the environment**

In [2]:
import intake
import yaml

OSN URIs:
<ul>
<li>hytest/tutorials/evaluation/conus404/c404_ceres_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_crn_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_crn_drb_point_values.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/c404_drb_zonal_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_hcn_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_hcn_drb_point_values.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_prism_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/ceres_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/ceres_drb_zonal_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/crn_drb.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/hcn_drb.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/prism_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/prism_drb_zonal_stats.parquet</li>
</ul>

## Base datasets that have been spatially/temporally subset

In [3]:
# Common S3 prefix shared by every dataset URI built in this notebook.
base_URI = "s3://hytest/tutorials/evaluation/conus404/"

# Options handed to each intake opener below through `storage_options=`.
storage_options = {
    "endpoint_url": "https://usgs.osn.mghpcc.org/",
    "anon": True,
}

CONUS404 zarr

In [4]:
# Full location of the CONUS404 zarr store: shared prefix + store name.
c404_URI = f"{base_URI}c404_drb.zarr"
print(c404_URI)

s3://hytest/tutorials/evaluation/conus404/c404_drb.zarr


In [5]:
# Keyword arguments for the zarr opener, collected in one place.
c404_open_kwargs = {
    "storage_options": storage_options,
    "chunks": {},
    "consolidated": True,
    "decode_coords": "all",
}
c404_source = intake.open_zarr(c404_URI, **c404_open_kwargs)

# Later cells look the parsed YAML up by this name, so set it before serializing.
c404_source.name = "conus404-drb-OSN"
c404_source.description = "CONUS404 Delaware River Basin subset, 40 years of monthly data for CONUS404 forcings evaluation"
# NOTE(review): the saved output of this cell is a TypeError about the 'chunks'
# keyword -- presumably an environment/driver mismatch; confirm intake-xarray is installed.
print(c404_source.yaml())


TypeError: ZarrArraySource.__init__() got an unexpected keyword argument 'chunks'

In [None]:
# Serialize the source to YAML text, then parse that text back into Python containers.
c404_yaml_text = c404_source.yaml()
source_dict = yaml.load(c404_yaml_text, Loader=yaml.CLoader)
source_dict

In [None]:
# `sources` is the same dict object as source_dict["sources"] (no copy is made).
# The following cells add one entry per dataset to it, and it is later stored
# under the catalog's "sources" key.
sources = source_dict["sources"]
sources

PRISM zarr

In [None]:
# Full location of the PRISM zarr store: shared prefix + store name.
prism_URI = f"{base_URI}prism_drb.zarr"

In [None]:
# Keyword arguments for the zarr opener, collected in one place.
prism_open_kwargs = {
    "storage_options": storage_options,
    "chunks": {},
    "consolidated": True,
    "decode_coords": "all",
}
prism_source = intake.open_zarr(prism_URI, **prism_open_kwargs)

# The next cell looks the parsed YAML up by this name, so set it before serializing.
prism_source.name = "prism-drb-OSN"
prism_source.description = "PRISM Delaware River Basin subset, 40 years of monthly data for CONUS404 forcings evaluation"
print(prism_source.yaml())


In [None]:
# Parse the PRISM source's YAML and copy its single entry into the shared `sources` dict.
prism_parsed = yaml.load(prism_source.yaml(), Loader=yaml.CLoader)
sources[prism_source.name] = prism_parsed["sources"][prism_source.name]
sources

CERES-EBAF zarr

In [None]:
# Full location of the CERES-EBAF zarr store: shared prefix + store name.
ceres_URI = f"{base_URI}ceres_drb.zarr"

In [None]:
# Keyword arguments for the zarr opener, collected in one place.
ceres_open_kwargs = {
    "storage_options": storage_options,
    "chunks": {},
    "consolidated": True,
    "decode_coords": "all",
}
ceres_source = intake.open_zarr(ceres_URI, **ceres_open_kwargs)

# The next cell looks the parsed YAML up by this name, so set it before serializing.
ceres_source.name = "ceres-drb-OSN"
ceres_source.description = "CERES-EBAF Delaware River Basin subset, 40 years of monthly data for CONUS404 forcings evaluation"
print(ceres_source.yaml())


In [None]:
# Parse the CERES-EBAF source's YAML and copy its single entry into the shared `sources` dict.
ceres_parsed = yaml.load(ceres_source.yaml(), Loader=yaml.CLoader)
sources[ceres_source.name] = ceres_parsed["sources"][ceres_source.name]
sources

HCN parquet

In [None]:
# Full location of the HCN parquet file: shared prefix + file name.
hcn_URI = f"{base_URI}hcn_drb.parquet"

In [None]:
# Human-readable summary stored on the source and emitted in its YAML.
hcn_description = "Historical Climate Network subset, 40 years of monthly data for CONUS404 forcings evaluation"

hcn_source = intake.open_parquet(hcn_URI, storage_options=storage_options)
# The next cell looks the parsed YAML up by this name, so set it before serializing.
hcn_source.name = "hcn-drb-OSN"
hcn_source.description = hcn_description
print(hcn_source.yaml())

In [None]:
# Parse the HCN source's YAML and copy its single entry into the shared `sources` dict.
hcn_parsed = yaml.load(hcn_source.yaml(), Loader=yaml.CLoader)
sources[hcn_source.name] = hcn_parsed["sources"][hcn_source.name]
sources

CRN parquet

In [None]:
# Full location of the CRN parquet file: shared prefix + file name.
crn_URI = f"{base_URI}crn_drb.parquet"

In [None]:
# Human-readable summary stored on the source and emitted in its YAML.
crn_description = "Climate Reference Network subset, 40 years of monthly data for CONUS404 forcings evaluation"

crn_source = intake.open_parquet(crn_URI, storage_options=storage_options)
# The next cell looks the parsed YAML up by this name, so set it before serializing.
crn_source.name = "crn-drb-OSN"
crn_source.description = crn_description
print(crn_source.yaml())

In [None]:
# Parse the CRN source's YAML and copy its single entry into the shared `sources` dict.
crn_parsed = yaml.load(crn_source.yaml(), Loader=yaml.CLoader)
sources[crn_source.name] = crn_parsed["sources"][crn_source.name]
sources

## Zonal stats

c404_drb_zonal_stats.parquet

ceres_drb_zonal_stats.parquet

prism_drb_zonal_stats.parquet

In [None]:
# Full location of the CONUS404 zonal-statistics parquet file: shared prefix + file name.
c404_zonal_URI = f"{base_URI}c404_drb_zonal_stats.parquet"

In [2]:
# Open the CONUS404 zonal-statistics parquet file as an intake source.
c404_zonal_source = intake.open_parquet(c404_zonal_URI, storage_options=storage_options)
c404_zonal_source.name = "c404-drb-zonal-OSN"
# Set the description on the zonal source itself. It was previously assigned to
# `hcn_source`, which overwrote the HCN description and left this entry without one.
c404_zonal_source.description = "CONUS404 zonal statistics in Delaware River Basin"

# Parse the source's YAML and copy its single entry into the shared `sources` dict.
c404_zonal_parsed = yaml.load(c404_zonal_source.yaml(), Loader=yaml.CLoader)
sources[c404_zonal_source.name] = c404_zonal_parsed["sources"][c404_zonal_source.name]
sources

AttributeError: Unknown open method 'open_parquet'. Do you need to install a new driver from the plugin directory? https://intake.readthedocs.io/en/latest/plugin-directory.html
Registered opener methods: ['open_netcdf', 'open_opendap', 'open_rasterio', 'open_remote-xarray', 'open_xarray_image', 'open_zarr', 'open_alias', 'open_catalog', 'open_csv', 'open_intake_remote', 'open_json', 'open_jsonl', 'open_ndzarr', 'open_numpy', 'open_textfiles', 'open_tiled', 'open_tiled_cat', 'open_yaml_file_cat', 'open_yaml_files_cat', 'open_zarr_cat']

Create catalog


In [30]:
# Top-level description recorded in the catalog's metadata.
description = "Catalog containing datasets used for the CONUS404 forcings evaluation notebooks."

# Assemble the catalog in one step: metadata plus the per-dataset entries
# accumulated in `sources` by the cells above.
catalog = {
    "metadata": {"version": 1, "description": description},
    "sources": sources,
}

# Write the catalog next to the notebook; the file is re-opened below to test it.
with open("conus404-drb-eval-tutorial-catalog.yml", "w") as f:
    yaml.dump(catalog, f)

Test functionality

In [32]:
# Re-open the catalog file written by the previous cell and list its entry
# names as a quick check that every expected source is present.
cat = intake.open_catalog("conus404-drb-eval-tutorial-catalog.yml")
list(cat)

['ceres-drb-OSN',
 'conus404-drb-OSN',
 'crn-drb-OSN',
 'hcn-drb-OSN',
 'prism-drb-OSN']

In [33]:
# Read the CERES-EBAF entry through the catalog to confirm it loads.
cat["ceres-drb-OSN"].read()

In [35]:
# Read the CONUS404 entry through the catalog to confirm it loads.
cat["conus404-drb-OSN"].read()

In [36]:
# Read the PRISM entry through the catalog to confirm it loads.
cat["prism-drb-OSN"].read()

In [37]:
# Read the HCN entry through the catalog to confirm it loads.
cat["hcn-drb-OSN"].read()

Unnamed: 0,ID,DATE,TK,PREC_ACC_NC,LATITUDE,LONGITUDE
0,USC00072730,1979-10-31,286.89,132.1,39.1467,-75.5056
1,USC00072730,1979-11-30,284.53,84.4,39.1467,-75.5056
2,USC00072730,1979-12-31,278.19,42.9,39.1467,-75.5056
3,USC00072730,1980-01-31,275.14,87.9,39.1467,-75.5056
4,USC00072730,1980-02-29,273.49,18.0,39.1467,-75.5056
...,...,...,...,...,...,...
6319,USW00014737,2022-06-30,293.31,99.5,40.6497,-75.4478
6320,USW00014737,2022-07-31,297.83,60.6,40.6497,-75.4478
6321,USW00014737,2022-08-31,297.67,72.2,40.6497,-75.4478
6322,USW00014737,2022-09-30,291.45,88.3,40.6497,-75.4478


In [38]:
# Read the CRN entry through the catalog to confirm it loads.
cat["crn-drb-OSN"].read()

Unnamed: 0,DATE,TK,PREC_ACC_NC,LATITUDE,LONGITUDE,ID
0,2006-06-30,293.25,206.5,39.86,-75.79,Avondale
1,2006-07-31,296.94,93.4,39.86,-75.79,Avondale
2,2006-08-31,295.91,72.6,39.86,-75.79,Avondale
3,2006-09-30,290.20,159.1,39.86,-75.79,Avondale
4,2006-10-31,284.64,137.6,39.86,-75.79,Avondale
...,...,...,...,...,...,...
192,2022-06-30,,163.5,39.86,-75.79,Avondale
193,2022-07-31,297.15,113.1,39.86,-75.79,Avondale
194,2022-08-31,296.34,82.6,39.86,-75.79,Avondale
195,2022-09-30,291.53,101.5,39.86,-75.79,Avondale
