In [1]:
from __future__ import annotations

%load_ext jupyter_black

The follow can be used to access data from the s3fs and google cloud file storage.


The urma data is only available in grib data will need to be converted to zarr so the loading times are not awful and the data can be used in a dask cluster.

In [2]:
import os
import glob
import datetime
from typing import *  # type: ignore

import xarray as xr
import numpy as np
import s3fs
import cfgrib
import zarr
import gcsfs  # noqa: F401



In [3]:
start_date = datetime.datetime(2019, 1, 1)
end_date = datetime.datetime(2019, 1, 2)


# - local data storage - #
local_store = os.path.abspath("../data")
if not os.path.exists(local_store):
    os.makedirs(local_store)
urma_store = os.path.join(local_store, "urma.zarr")
era5_store = os.path.join(local_store, "era5.zarr")

In [4]:
from mesoscaler.enums import (
    # - ERA5
    GEOPOTENTIAL,
    SPECIFIC_HUMIDITY,
    TEMPERATURE,
    U_COMPONENT_OF_WIND,
    V_COMPONENT_OF_WIND,
    # - URMA
    SURFACE_PRESSURE,
    TEMPERATURE_2M,
    SPECIFIC_HUMIDITY_2M,
    U_WIND_COMPONENT_10M,
    V_WIND_COMPONENT_10M,
    SURFACE_PRESSURE,
)

ERA5_DATA_VARIABLES = [
    GEOPOTENTIAL,
    SPECIFIC_HUMIDITY,
    TEMPERATURE,
    U_COMPONENT_OF_WIND,
    V_COMPONENT_OF_WIND,
]

URMA_DATA_VARIABLES = [
    SURFACE_PRESSURE,
    TEMPERATURE_2M,
    SPECIFIC_HUMIDITY_2M,
    U_WIND_COMPONENT_10M,
    V_WIND_COMPONENT_10M,
    SURFACE_PRESSURE,
]

# ERA5 Data

In [None]:
google_store = "gs://weatherbench2/datasets/era5/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2"
ds = xr.open_zarr(google_store).sel(time=np.s_[start_date:end_date])
ds = ds[ERA5_DATA_VARIABLES].sel(level=ds.level >= 200)
ds.to_zarr(era5_store)

In [5]:
xr.open_zarr(urma_store)

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


# URMA 5 data

The following will process the grib data into a .zarr format

In [None]:
URMA2P5_DATE_FMT = "noaa-urma-pds/urma2p5.%Y%m%d"

aws_bucket = "s3://noaa-urma-pds/urma2p5.*"
grib_folder = os.path.join(local_store, "urma-gribs")
if not os.path.exists(grib_folder):
    os.makedirs(grib_folder)

client = s3fs.S3FileSystem(anon=True)


def filter_dates(
    sources: Iterable[tuple[Any, datetime.datetime]], start_date: datetime.datetime, end_date: datetime.datetime
) -> Iterable[tuple[Any, datetime.datetime]]:
    return filter(lambda x: x[1] >= start_date and x[1] <= end_date, sources)


if not glob.glob(os.path.join(grib_folder, "*.grb2_wexp")): # dont download if already downloaded
    it = filter_dates(
        ((x, datetime.datetime.strptime(str(x), URMA2P5_DATE_FMT)) for x in client.glob(aws_bucket)),
        start_date,
        end_date,
    )
    for file, date in it:
        print(f"Downloading {file} {date}")
        url = client.glob(f"s3://{file}/urma2p5.t*2dvaranl_ndfd.grb2_wexp")
        client.get(url, grib_folder)



In [None]:
from mesoscaler.enums import URMA



def open_mfdataset(files: list[str], variables: list[URMA]):
    """
    Wraps the xr.open_mfdataset function to filter by type of level and level.
    This is necessary for many of the NDFD datasets, which use inconsistent
    naming schemes for the `level` coordinate.
    """

    fsets = {(dvar.type_of_level, dvar.level) for dvar in variables}

    dsets = [
        xr.open_mfdataset(
            files,
            engine="cfgrib",
            concat_dim="time",
            combine="nested",
            filter_by_keys={"typeOfLevel": tol, "level": lvl, "step": 0},
        ).drop_vars([tol, "step", "valid_time"], errors="ignore")
        for tol, lvl in fsets
    ]
    ds = xr.merge(dsets).rename({dvar.short_name: dvar for dvar in variables})[variables]

    ds.attrs = {}
    return ds


ds = open_mfdataset(
    glob.glob(os.path.join(grib_folder, "*.grb2_wexp")),
    URMA_DATA_VARIABLES,
)
ds.to_zarr(urma_store, mode="w")

In [6]:
xr.open_zarr(urma_store)

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 342.86 MiB 14.29 MiB Shape (24, 1597, 2345) (1, 1597, 2345) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",2345  1597  24,

Unnamed: 0,Array,Chunk
Bytes,342.86 MiB,14.29 MiB
Shape,"(24, 1597, 2345)","(1, 1597, 2345)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
