# Programmatically creating `intake` catalogs

Author: Andrew Laws

This notebook was used to create a catalog of intermediate datasets used in the CONUS404 tutorial found in `evaluation/tutorials/CONUS404` that are saved to an Open Storage Network (OSN) pod. It is based on the [Project Pythia `intake` Cookbook](https://github.com/ProjectPythia/intake-cookbook).

Library imports

**Requires `intake-xarray` and `intake-parquet` to be installed in the environment.**

In [None]:
import intake
import yaml

OSN URIs:
<ul>
<li>hytest/tutorials/evaluation/conus404/c404_ceres_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_crn_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_crn_drb_point_values.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/c404_drb_zonal_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_hcn_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_hcn_drb_point_values.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/c404_prism_drb_descriptive_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/ceres_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/ceres_drb_zonal_stats.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/crn_drb.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/hcn_drb.parquet</li>
 <li>hytest/tutorials/evaluation/conus404/prism_drb.zarr</li>
 <li>hytest/tutorials/evaluation/conus404/prism_drb_zonal_stats.parquet</li>
</ul>

## Base datasets that have been spatially/temporally subset

In [None]:
# Common S3 prefix under which every dataset in this notebook is stored.
base_URI = "s3://hytest/tutorials/evaluation/conus404/"

# Options passed as `storage_options=` to every intake source opened below.
storage_options = {
    "endpoint_url": "https://usgs.osn.mghpcc.org/",
    "anon": True,
}

CONUS404 zarr

In [None]:
# Full location of the CONUS404 zarr store: shared prefix + store name.
c404_URI = f"{base_URI}c404_drb.zarr"
print(c404_URI)

In [None]:
# Keyword arguments for the zarr driver, gathered in one place so the
# open call itself stays on a single line.
c404_open_kwargs = {
    "storage_options": storage_options,
    "chunks": {},
    "consolidated": True,
    "decode_coords": "all",
}
c404_source = intake.open_zarr(c404_URI, **c404_open_kwargs)

# Label the source before serialising it.
c404_source.name = "conus404-drb-OSN"
c404_source.description = (
    "CONUS404 Delaware River Basin subset, "
    "40 years of monthly data for CONUS404 forcings evaluation"
)

# Show the YAML that intake generates for this single source.
print(c404_source.yaml())

In [None]:
# Parse the source's YAML back into plain Python containers.
# `yaml.safe_load` replaces `yaml.load(..., Loader=yaml.CLoader)`: `CLoader`
# only exists when PyYAML was built against libyaml, so the original call can
# fail with AttributeError on some installs. The safe loader is presumably
# sufficient because only strings, dicts and booleans are passed to the source.
source_dict = yaml.safe_load(c404_source.yaml())
source_dict

In [None]:
# `sources` is the mapping that the following cells add one entry to per
# dataset (`sources[<name>] = ...`) and that is finally written out as the
# catalog's "sources" section. It is the same dict object as
# `source_dict["sources"]`, not a copy.
sources = source_dict["sources"]
sources

PRISM zarr

In [None]:
# Open the PRISM zarr store with the same driver options as the CONUS404 store.
prism_URI = base_URI + "prism_drb.zarr"
prism_source = intake.open_zarr(
    prism_URI,
    storage_options=storage_options,
    chunks={},
    consolidated=True,
    decode_coords="all",
)

prism_source.name = "prism-drb-OSN"
prism_source.description = "PRISM Delaware River Basin subset, 40 years of monthly data for CONUS404 forcings evaluation"

# Copy this source's entry out of its single-source YAML into the shared
# `sources` mapping. `yaml.safe_load` is used because `yaml.CLoader` only
# exists when PyYAML is built against libyaml.
prism_entry = yaml.safe_load(prism_source.yaml())["sources"][prism_source.name]
sources[prism_source.name] = prism_entry


CERES-EBAF zarr

In [None]:
# Open the CERES-EBAF zarr store with the same driver options as the other zarr stores.
ceres_URI = base_URI + "ceres_drb.zarr"
ceres_source = intake.open_zarr(
    ceres_URI,
    storage_options=storage_options,
    chunks={},
    consolidated=True,
    decode_coords="all",
)

ceres_source.name = "ceres-drb-OSN"
# NOTE(review): the "40 years" wording is identical to the CONUS404/PRISM
# descriptions; confirm it is accurate for the CERES-EBAF record.
ceres_source.description = "CERES-EBAF Delaware River Basin subset, 40 years of monthly data for CONUS404 forcings evaluation"

# Copy this source's entry out of its single-source YAML into the shared
# `sources` mapping. `yaml.safe_load` is used because `yaml.CLoader` only
# exists when PyYAML is built against libyaml.
ceres_entry = yaml.safe_load(ceres_source.yaml())["sources"][ceres_source.name]
sources[ceres_source.name] = ceres_entry


HCN parquet

In [None]:
# Open the HCN station table with the parquet driver.
hcn_URI = base_URI + "hcn_drb.parquet"
hcn_source = intake.open_parquet(hcn_URI, storage_options=storage_options)
hcn_source.name = "hcn-drb-OSN"
hcn_source.description = "Historical Climate Network subset, 40 years of monthly data for CONUS404 forcings evaluation"

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
hcn_entry = yaml.safe_load(hcn_source.yaml())["sources"][hcn_source.name]
sources[hcn_source.name] = hcn_entry


CRN parquet

In [None]:
# Open the CRN station table with the parquet driver.
crn_URI = base_URI + "crn_drb.parquet"
crn_source = intake.open_parquet(crn_URI, storage_options=storage_options)
crn_source.name = "crn-drb-OSN"
# NOTE(review): the "40 years" wording is identical to the other base-dataset
# descriptions; confirm it is accurate for the CRN record.
crn_source.description = "Climate Reference Network subset, 40 years of monthly data for CONUS404 forcings evaluation"

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
crn_entry = yaml.safe_load(crn_source.yaml())["sources"][crn_source.name]
sources[crn_source.name] = crn_entry

# Display everything collected so far for the base datasets.
sources


## Zonal stats

CONUS404 zonal

In [None]:
# Open the CONUS404 zonal-statistics table with the parquet driver.
c404_zonal_URI = base_URI + "c404_drb_zonal_stats.parquet"
c404_zonal_source = intake.open_parquet(c404_zonal_URI, storage_options=storage_options)
c404_zonal_source.name = "c404-drb-zonal-OSN"
# Spelling fixed ("Delware" -> "Delaware"): this text is written into the catalog.
c404_zonal_source.description = "CONUS404 zonal statistics of Delaware River Basin"

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
c404_zonal_entry = yaml.safe_load(c404_zonal_source.yaml())["sources"][
    c404_zonal_source.name
]
sources[c404_zonal_source.name] = c404_zonal_entry


PRISM zonal

In [None]:
# Open the PRISM zonal-statistics table with the parquet driver.
prism_zonal_URI = base_URI + "prism_drb_zonal_stats.parquet"
prism_zonal_source = intake.open_parquet(
    prism_zonal_URI, storage_options=storage_options
)
prism_zonal_source.name = "prism-drb-zonal-OSN"
# Spelling fixed ("Delware" -> "Delaware"): this text is written into the catalog.
prism_zonal_source.description = "PRISM zonal statistics of Delaware River Basin"

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
prism_zonal_entry = yaml.safe_load(prism_zonal_source.yaml())["sources"][
    prism_zonal_source.name
]
sources[prism_zonal_source.name] = prism_zonal_entry


CERES-EBAF zonal

In [None]:
# Open the CERES-EBAF zonal-statistics table with the parquet driver.
ceres_zonal_URI = base_URI + "ceres_drb_zonal_stats.parquet"
ceres_zonal_source = intake.open_parquet(
    ceres_zonal_URI, storage_options=storage_options
)
ceres_zonal_source.name = "ceres-drb-zonal-OSN"
# Spelling fixed ("Delware" -> "Delaware"): this text is written into the catalog.
ceres_zonal_source.description = "CERES-EBAF zonal statistics of Delaware River Basin"

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
ceres_zonal_entry = yaml.safe_load(ceres_zonal_source.yaml())["sources"][
    ceres_zonal_source.name
]
sources[ceres_zonal_source.name] = ceres_zonal_entry

# Display everything collected so far, now including the zonal tables.
sources


## Point data

HCN point data

In [None]:
# Open the HCN/CONUS404 point-values table with the parquet driver.
hcn_point_URI = base_URI + "c404_hcn_drb_point_values.parquet"
hcn_point_source = intake.open_parquet(hcn_point_URI, storage_options=storage_options)
hcn_point_source.name = "hcn-drb-point-OSN"
# Spelling fixed ("Delware" -> "Delaware"): this text is written into the catalog.
hcn_point_source.description = (
    "HCN and CONUS404 point statistics of Delaware River Basin"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
hcn_point_entry = yaml.safe_load(hcn_point_source.yaml())["sources"][
    hcn_point_source.name
]
sources[hcn_point_source.name] = hcn_point_entry


CRN point data

In [None]:
# Open the CRN/CONUS404 point-values table with the parquet driver.
crn_point_URI = base_URI + "c404_crn_drb_point_values.parquet"
crn_point_source = intake.open_parquet(crn_point_URI, storage_options=storage_options)
crn_point_source.name = "crn-drb-point-OSN"
# Spelling fixed ("Delware" -> "Delaware"): this text is written into the catalog.
crn_point_source.description = (
    "CRN and CONUS404 point statistics of Delaware River Basin"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
crn_point_entry = yaml.safe_load(crn_point_source.yaml())["sources"][
    crn_point_source.name
]
sources[crn_point_source.name] = crn_point_entry


## Descriptive statistics

Files cataloged in this section:

- `c404_prism_drb_descriptive_stats`
- `c404_ceres_drb_descriptive_stats`
- `c404_hcn_drb_descriptive_stats`
- `c404_crn_drb_descriptive_stats`

PRISM descriptive stats

In [None]:
# Open the CONUS404-vs-PRISM descriptive-statistics table with the parquet driver.
prism_desc_URI = base_URI + "c404_prism_drb_descriptive_stats.parquet"
prism_desc_source = intake.open_parquet(prism_desc_URI, storage_options=storage_options)
prism_desc_source.name = "c404-prism-drb-desc-stats-OSN"
prism_desc_source.description = (
    "Descriptive statistics for the comparison of CONUS404 to PRISM"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
prism_desc_entry = yaml.safe_load(prism_desc_source.yaml())["sources"][
    prism_desc_source.name
]
sources[prism_desc_source.name] = prism_desc_entry


CERES-EBAF descriptive stats

In [None]:
# Open the CONUS404-vs-CERES-EBAF descriptive-statistics table with the parquet driver.
ceres_desc_URI = base_URI + "c404_ceres_drb_descriptive_stats.parquet"
ceres_desc_source = intake.open_parquet(ceres_desc_URI, storage_options=storage_options)
ceres_desc_source.name = "c404-ceres-drb-desc-stats-OSN"
ceres_desc_source.description = (
    "Descriptive statistics for the comparison of CONUS404 to CERES-EBAF"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
ceres_desc_entry = yaml.safe_load(ceres_desc_source.yaml())["sources"][
    ceres_desc_source.name
]
sources[ceres_desc_source.name] = ceres_desc_entry


HCN descriptive stats

In [None]:
# Open the CONUS404-vs-HCN descriptive-statistics table with the parquet driver.
hcn_desc_URI = base_URI + "c404_hcn_drb_descriptive_stats.parquet"
hcn_desc_source = intake.open_parquet(hcn_desc_URI, storage_options=storage_options)
hcn_desc_source.name = "c404-hcn-drb-desc-stats-OSN"
hcn_desc_source.description = (
    "Descriptive statistics for the comparison of CONUS404 to HCN"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
hcn_desc_entry = yaml.safe_load(hcn_desc_source.yaml())["sources"][
    hcn_desc_source.name
]
sources[hcn_desc_source.name] = hcn_desc_entry


CRN descriptive stats

In [None]:
# Open the CONUS404-vs-CRN descriptive-statistics table with the parquet driver.
crn_desc_URI = base_URI + "c404_crn_drb_descriptive_stats.parquet"
crn_desc_source = intake.open_parquet(crn_desc_URI, storage_options=storage_options)
crn_desc_source.name = "c404-crn-drb-desc-stats-OSN"
crn_desc_source.description = (
    "Descriptive statistics for the comparison of CONUS404 to CRN"
)

# Copy this source's entry into the shared `sources` mapping.
# `yaml.safe_load` is used because `yaml.CLoader` only exists when PyYAML is
# built against libyaml.
crn_desc_entry = yaml.safe_load(crn_desc_source.yaml())["sources"][
    crn_desc_source.name
]
sources[crn_desc_source.name] = crn_desc_entry

# Display the complete set of collected sources before writing the catalog.
sources


## Create catalog


In [None]:
# Relative path of the catalog file that this notebook writes.
catalog_path = "../subcatalogs/conus404-drb-eval-tutorial-catalog.yml"

# Catalog-level description stored under `metadata`.
description = (
    "Catalog containing datasets used for the CONUS404 forcings evaluation notebooks."
)

# Top-level catalog structure: metadata plus the collected `sources` mapping.
catalog = {
    "metadata": {"version": 1, "description": description},
    "sources": sources,
}

# Serialise the catalog to YAML on disk.
with open(catalog_path, "w") as catalog_file:
    yaml.dump(catalog, catalog_file)

Test functionality

In [None]:
# Re-open the catalog file written above and list its entries.
cat = intake.open_catalog(catalog_path)
list(cat)

In [None]:
# Smoke test: read the CERES-EBAF zarr source through the catalog.
cat["ceres-drb-OSN"].read()

In [None]:
# Smoke test: read the CONUS404 zarr source through the catalog.
cat["conus404-drb-OSN"].read()

In [None]:
# Smoke test: read the PRISM zarr source through the catalog.
cat["prism-drb-OSN"].read()

In [None]:
# Smoke test: read the HCN parquet source through the catalog.
cat["hcn-drb-OSN"].read()

In [None]:
# Smoke test: read the CRN parquet source through the catalog.
cat["crn-drb-OSN"].read()