# Understanding Zarr

In [50]:
import json

from pathlib import Path
from typing import Any

import folium
import fsspec
import numpy as np
import pystac_client
import planetary_computer
import pyproj
import shapely

from adlfs import AzureBlobFileSystem

# Point of Interest (POI)

I want to go to Puerto Rico. But I'm from a northern temporate latitude in an area with low humidity. I'd like to find the best time of year for me to visit Puerto Rico without having oppressive weather. Can we use a zarr dataset from Planetary Computer to answer this question?

To pick a point of interest, let's use the coordinates of San Juan:

In [29]:
POI = shapely.Point(-66.063889, 18.406389)

Let's throw this point on a map, just so we're all aware of where we're looking for data.

In [30]:
location = (POI.coords[0][1], POI.coords[0][0])

point_map = folium.Map(
    location=location,
    tiles='CartoDB positron',
)

folium.CircleMarker(
    location=location,
    fill=True,
    fill_opacity=0.6,
).add_to(point_map)

point_map

In [3]:
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)
collection = catalog.get_collection('daymet-daily-pr')
collection

In [4]:
asset = collection.assets["zarr-abfs"]
asset

In [5]:
zarr_root = Path(asset.href.split('//', 1)[1])
zarr_root

PosixPath('daymet-zarr/daily/pr.zarr')

In [6]:
fs = fsspec.filesystem('abfs', **asset.extra_fields['xarray:storage_options'])

In [7]:
fs.ls(str(zarr_root))

['daymet-zarr/daily/pr.zarr/.zattrs',
 'daymet-zarr/daily/pr.zarr/.zgroup',
 'daymet-zarr/daily/pr.zarr/.zmetadata',
 'daymet-zarr/daily/pr.zarr/dayl',
 'daymet-zarr/daily/pr.zarr/lambert_conformal_conic',
 'daymet-zarr/daily/pr.zarr/lat',
 'daymet-zarr/daily/pr.zarr/lon',
 'daymet-zarr/daily/pr.zarr/prcp',
 'daymet-zarr/daily/pr.zarr/srad',
 'daymet-zarr/daily/pr.zarr/swe',
 'daymet-zarr/daily/pr.zarr/time',
 'daymet-zarr/daily/pr.zarr/time_bnds',
 'daymet-zarr/daily/pr.zarr/tmax',
 'daymet-zarr/daily/pr.zarr/tmin',
 'daymet-zarr/daily/pr.zarr/vp',
 'daymet-zarr/daily/pr.zarr/x',
 'daymet-zarr/daily/pr.zarr/y',
 'daymet-zarr/daily/pr.zarr/yearday']

In [8]:
with fs.open(str(zarr_root / '.zattrs')) as f:
   cnt = f.read()

cnt

b'{\n    "Conventions": "CF-1.6",\n    "Version_data": "Daymet Data Version 4.0",\n    "Version_software": "Daymet Software Version 4.0",\n    "citation": "Please see http://daymet.ornl.gov/ for current Daymet data citation information",\n    "references": "Please see http://daymet.ornl.gov/ for current information on Daymet references",\n    "source": "Daymet Software Version 4.0",\n    "start_year": 1980\n}'

In [9]:
def ls_zarr(path: str) -> list[str]:
    return fs.ls(str(zarr_root / path))

def read_zarr_file(path: str) -> bytes:
    with fs.open(str(zarr_root / path)) as f:
        return f.read()

def read_zarr_json(path: str) -> dict[str, Any]:
    return json.loads(read_zarr_file(path))

def print_json(_json: dict[str, Any]) -> None:
    print(json.dumps(_json, indent=4))

In [10]:
zmeta = read_zarr_json('.zmetadata')
print_json(zmeta)

{
    "metadata": {
        ".zattrs": {
            "Conventions": "CF-1.6",
            "Version_data": "Daymet Data Version 4.0",
            "Version_software": "Daymet Software Version 4.0",
            "citation": "Please see http://daymet.ornl.gov/ for current Daymet data citation information",
            "references": "Please see http://daymet.ornl.gov/ for current information on Daymet references",
            "source": "Daymet Software Version 4.0",
            "start_year": 1980
        },
        ".zgroup": {
            "zarr_format": 2
        },
        "dayl/.zarray": {
            "chunks": [
                365,
                231,
                364
            ],
            "compressor": {
                "blocksize": 0,
                "clevel": 5,
                "cname": "lz4",
                "id": "blosc",
                "shuffle": 1
            },
            "dtype": "<f4",
            "fill_value": "NaN",
            "filters": null,
            "order"

In [11]:
ls_zarr('lat/')

['daymet-zarr/daily/pr.zarr/lat/.zarray',
 'daymet-zarr/daily/pr.zarr/lat/.zattrs',
 'daymet-zarr/daily/pr.zarr/lat/0.0']

In [12]:
lat_zarray = read_zarr_json('lat/.zarray')
print_json(lat_zarray)

{
    "chunks": [
        231,
        364
    ],
    "compressor": {
        "blocksize": 0,
        "clevel": 5,
        "cname": "lz4",
        "id": "blosc",
        "shuffle": 1
    },
    "dtype": "<f4",
    "fill_value": "NaN",
    "filters": null,
    "order": "C",
    "shape": [
        231,
        364
    ],
    "zarr_format": 2
}


In [13]:
lat_zattrs = read_zarr_json('lat/.zattrs')
print_json(lat_zattrs)

{
    "_ARRAY_DIMENSIONS": [
        "y",
        "x"
    ],
    "long_name": "latitude coordinate",
    "standard_name": "latitude",
    "units": "degrees_north"
}


In [14]:
lat_bytes = read_zarr_file('lat/0.0')
print(f'{lat_bytes[:100]}...')
print(len(lat_bytes))

b'\x02\x01!\x04\xd0!\x05\x00\x00\x00\x04\x00\xf8\x07\x02\x00\x18\x00\x00\x00\xff\x96\x01\x00\xe5\xfc\x00\x00\xf3\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xffv\xa9\xc6\xe2\xfd\x193Nh\x81\x9b\xb3\xcc\xe4\xfb\x13*@Vl\x81\x96\xaa\xbe\xd2\xe5\xf8\n\x1c.?Paq\x80\x90\x9e\xad\xbb\xc9\xd6\xe3\xef\xfb\x07\x12\x1d(2<ENW_'...
133112


In [17]:
import numcodecs.blosc

def read_zarr_blosc(path: str) -> Any:
    #with lz4.frame.LZ4FrameDecompressor() as decompressor:
    return numcodecs.blosc.decompress(read_zarr_file(path))

In [19]:
np.dtype('<f4')

dtype('float32')

In [25]:
lat_uncompressed = read_zarr_blosc('lat/0.0')
print(f'{lat_uncompressed[:100]}...')
print(len(lat_uncompressed))

b'\xa9u\x9fA\xc6n\x9fA\xe2g\x9fA\xfd`\x9fA\x19Z\x9fA3S\x9fANL\x9fAhE\x9fA\x81>\x9fA\x9b7\x9fA\xb30\x9fA\xcc)\x9fA\xe4"\x9fA\xfb\x1b\x9fA\x13\x15\x9fA*\x0e\x9fA@\x07\x9fAV\x00\x9fAl\xf9\x9eA\x81\xf2\x9eA\x96\xeb\x9eA\xaa\xe4\x9eA\xbe\xdd\x9eA\xd2\xd6\x9eA\xe5\xcf\x9eA'...
336336


In [26]:
dt = np.dtype(lat_zarray['dtype'])
lat_array = np.frombuffer(lat_uncompressed, dtype=dt).reshape(lat_zarray['shape'])
lat_array

array([[19.932451, 19.929089, 19.925724, ..., 18.66994 , 18.666313,
        18.662685],
       [19.924347, 19.920984, 19.91762 , ..., 18.66203 , 18.658403,
        18.654774],
       [19.916243, 19.91288 , 19.909517, ..., 18.654118, 18.650492,
        18.646864],
       ...,
       [18.091873, 18.088625, 18.085377, ..., 16.872786, 16.869282,
        16.865776],
       [18.083834, 18.080587, 18.07734 , ..., 16.864933, 16.861431,
        16.857927],
       [18.075796, 18.072548, 18.069302, ..., 16.857082, 16.85358 ,
        16.850077]], dtype=float32)

In [24]:
lon_zarray = read_zarr_json('lon/.zarray')
dt = np.dtype(lon_zarray['dtype'])
lon_array = np.frombuffer(read_zarr_blosc('lon/0.0'), dtype=dt).reshape(lon_zarray['shape'])
lon_array

array([[-67.1878  , -67.17923 , -67.170654, ..., -64.14222 , -64.13392 ,
        -64.12562 ],
       [-67.19135 , -67.182785, -67.17422 , ..., -64.14603 , -64.137726,
        -64.129425],
       [-67.19491 , -67.18634 , -67.17777 , ..., -64.14983 , -64.14153 ,
        -64.13323 ],
       ...,
       [-67.98011 , -67.9717  , -67.963295, ..., -64.99068 , -64.98253 ,
        -64.97437 ],
       [-67.983505, -67.9751  , -67.96669 , ..., -64.994316, -64.98617 ,
        -64.97801 ],
       [-67.98689 , -67.97849 , -67.970085, ..., -64.997955, -64.9898  ,
        -64.98165 ]], dtype=float32)

In [36]:
# find array coords of POI
row = lon_array[0]
col = 0
POI_col = None
while POI.x >= row[col]:
    POI_col = row[0]
    col += 1
POI_col

np.float32(-67.1878)

In [38]:
row = lat_array[]
col = 0
POI_row = None
while POI.y >= row[col]:
    POI_row = row[0]
    col += 1
POI.y, POI_row

(18.406389, None)

In [39]:
lon_array.shape

(231, 364)

In [48]:
x_zarray = read_zarr_json('x/.zarray')
dt = np.dtype(x_zarray['dtype'])
x_array = np.frombuffer(read_zarr_blosc('x/0'), dtype=dt).reshape(x_zarray['shape'])
x_array

array([3445750., 3446750., 3447750., 3448750., 3449750., 3450750.,
       3451750., 3452750., 3453750., 3454750., 3455750., 3456750.,
       3457750., 3458750., 3459750., 3460750., 3461750., 3462750.,
       3463750., 3464750., 3465750., 3466750., 3467750., 3468750.,
       3469750., 3470750., 3471750., 3472750., 3473750., 3474750.,
       3475750., 3476750., 3477750., 3478750., 3479750., 3480750.,
       3481750., 3482750., 3483750., 3484750., 3485750., 3486750.,
       3487750., 3488750., 3489750., 3490750., 3491750., 3492750.,
       3493750., 3494750., 3495750., 3496750., 3497750., 3498750.,
       3499750., 3500750., 3501750., 3502750., 3503750., 3504750.,
       3505750., 3506750., 3507750., 3508750., 3509750., 3510750.,
       3511750., 3512750., 3513750., 3514750., 3515750., 3516750.,
       3517750., 3518750., 3519750., 3520750., 3521750., 3522750.,
       3523750., 3524750., 3525750., 3526750., 3527750., 3528750.,
       3529750., 3530750., 3531750., 3532750., 3533750., 35347

In [49]:
y_zarray = read_zarr_json('y/.zarray')
dt = np.dtype(y_zarray['dtype'])
y_array = np.frombuffer(read_zarr_blosc('y/0'), dtype=dt).reshape(y_zarray['shape'])
y_array

array([-1765000., -1766000., -1767000., -1768000., -1769000., -1770000.,
       -1771000., -1772000., -1773000., -1774000., -1775000., -1776000.,
       -1777000., -1778000., -1779000., -1780000., -1781000., -1782000.,
       -1783000., -1784000., -1785000., -1786000., -1787000., -1788000.,
       -1789000., -1790000., -1791000., -1792000., -1793000., -1794000.,
       -1795000., -1796000., -1797000., -1798000., -1799000., -1800000.,
       -1801000., -1802000., -1803000., -1804000., -1805000., -1806000.,
       -1807000., -1808000., -1809000., -1810000., -1811000., -1812000.,
       -1813000., -1814000., -1815000., -1816000., -1817000., -1818000.,
       -1819000., -1820000., -1821000., -1822000., -1823000., -1824000.,
       -1825000., -1826000., -1827000., -1828000., -1829000., -1830000.,
       -1831000., -1832000., -1833000., -1834000., -1835000., -1836000.,
       -1837000., -1838000., -1839000., -1840000., -1841000., -1842000.,
       -1843000., -1844000., -1845000., -1846000., 

In [51]:
collection.extra_fields['cube:dimensions']['x']['reference_system']

{'name': 'undefined',
 'type': 'ProjectedCRS',
 '$schema': 'https://proj.org/schemas/v0.4/projjson.schema.json',
 'base_crs': {'name': 'undefined',
  'datum': {'name': 'undefined',
   'type': 'GeodeticReferenceFrame',
   'ellipsoid': {'name': 'undefined',
    'semi_major_axis': 6378137,
    'inverse_flattening': 298.257223563}},
  'coordinate_system': {'axis': [{'name': 'Longitude',
     'unit': 'degree',
     'direction': 'east',
     'abbreviation': 'lon'},
    {'name': 'Latitude',
     'unit': 'degree',
     'direction': 'north',
     'abbreviation': 'lat'}],
   'subtype': 'ellipsoidal'}},
 'conversion': {'name': 'unknown',
  'method': {'id': {'code': 9802, 'authority': 'EPSG'},
   'name': 'Lambert Conic Conformal (2SP)'},
  'parameters': [{'id': {'code': 8823, 'authority': 'EPSG'},
    'name': 'Latitude of 1st standard parallel',
    'unit': 'degree',
    'value': 25},
   {'id': {'code': 8824, 'authority': 'EPSG'},
    'name': 'Latitude of 2nd standard parallel',
    'unit': 'degre

In [58]:
src_crs = pyproj.CRS.from_json_dict(collection.extra_fields['cube:dimensions']['x']['reference_system'])
src_crs

<Projected CRS: {"name": "undefined", "type": "ProjectedCRS", "$sc ...>
Name: undefined
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unknown
- method: Lambert Conic Conformal (2SP)
Datum: undefined
- Ellipsoid: undefined
- Prime Meridian: Greenwich

In [60]:
to_src_transformer = pyproj.Transformer.from_crs("EPSG:4326", src_crs)

In [80]:
poi_projected_x, poi_projected_y = to_src_transformer.transform(
    POI.y,
    POI.x,
)
poi_projected_x, poi_projected_y

(3626464.519424091, -1878320.0944692981)

In [71]:
grid_length = abs(int(
    (
        collection.extra_fields['cube:dimensions']['y']['extent'][1]
        - collection.extra_fields['cube:dimensions']['y']['extent'][0]
    )
    / collection.extra_fields['cube:dimensions']['y']['step']
)) + 1
grid_width = abs(int(
    (
        collection.extra_fields['cube:dimensions']['x']['extent'][1]
        - collection.extra_fields['cube:dimensions']['x']['extent'][0]
    )
    / collection.extra_fields['cube:dimensions']['x']['step']
)) + 1
grid_length, grid_width  # should match our lat and lon array shapes

(231, 364)

In [82]:
# we can make a "geotransform", for those of us comfortable with gdal world
geotransform = (
    collection.extra_fields['cube:dimensions']['x']['extent'][0],
    collection.extra_fields['cube:dimensions']['x']['step'],
    0,
    collection.extra_fields['cube:dimensions']['y']['extent'][1],
    0,
    collection.extra_fields['cube:dimensions']['y']['step'],
)
geotransform

(3445750.0, 1000.0, 0, -1765000.0, 0, -1000.0)

In [91]:
# now let's calc the pixel coords of our POI
poi_row = int((poi_projected_x - geotransform[0]) // geotransform[1])
poi_col = int((poi_projected_y - geotransform[3]) // geotransform[5])
poi_col, poi_row 

(113, 180)

In [96]:
print(POI.x, POI.y)
lon_array[113, 180], lat_array[113, 180]

-66.063889 18.406389


(np.float32(-66.06871), np.float32(18.4114))

In [None]:
# find array coords of POI
row = x_array[0]
col = 0
POI_col = None
while poi_projected_x >= row[col]:
    POI_col = row[0]
    col += 1
POI_col