In [1]:
import time
import uuid

import dask.array.wrap
import numpy as np
import xarray as xr

In [2]:
# Shape of the full array as (len(x), len(y)).
size = (100, 10_000_000)
# Number of chunks along x; equal to len(x), so each row becomes its own chunk.
n_chunks = size[0]

# `chunks` takes the number of points per chunk along each dimension
# (not the number of chunks), which the docs do not make very clear.
# The layout here is the simplest one for writing: the outermost loop
# writes exactly one chunk per iteration.
chunk_size = (size[0] // n_chunks, *size[1:])

In [3]:
# Lazy all-zero placeholder with the final shape and chunking; nothing is
# allocated until it is computed. Uses the public `dask.array.zeros` entry
# point rather than reaching into the internal `dask.array.wrap` module
# (`import dask.array.wrap` already makes `dask.array` available).
dummies = dask.array.zeros(size, chunks=chunk_size)

In [4]:
# Template dataset: one variable "foo" backed by the lazy placeholder array,
# with integer index coordinates 0..len-1 along each dimension.
ds = xr.Dataset(
    data_vars=dict(foo=(["x", "y"], dummies)),
    coords={dim: np.arange(length) for dim, length in zip(["x", "y"], size)},
)

In [5]:
# Fresh, randomly named store in the current working directory for every run.
path = str(uuid.uuid4()) + ".zarr"

In [6]:
# Initialise the store from the template without computing the dask-backed
# data. The returned Delayed is intentionally never computed: the actual
# values are filled in afterwards with region writes.
ds.to_zarr(store=path, compute=False)

Delayed('_finalize_store-96af49aa-85d0-4eb6-ac39-26219425e106')

In [7]:
# Fill the pre-initialised store one region at a time and time the writes.
rng = np.random.default_rng(0)  # seeded so the written data is reproducible
# Rows covered by each region write, taken from the store's chunking so the
# writes stay chunk-aligned (1 row with the current settings). Assumes
# size[0] is an exact multiple of n_chunks.
rows_per_chunk = chunk_size[0]
# Loop-invariant coordinate: build the large y index once, outside the
# timed loop, instead of re-allocating it on every iteration.
y_coord = np.arange(size[1])

start = time.perf_counter()
for i in range(n_chunks):
    x_start = i * rows_per_chunk
    x_stop = x_start + rows_per_chunk
    ds_chunk = xr.Dataset(
        data_vars={"foo": (["x", "y"], rng.random((rows_per_chunk, size[1])))},
        coords={"x": np.arange(x_start, x_stop), "y": y_coord},
    )
    # Alternative that was benchmarked (see timings below):
    #     ds_chunk.to_zarr(path, region="auto")
    ds_chunk.to_zarr(
        path, region={"x": slice(x_start, x_stop), "y": slice(0, size[1])}
    )
    if i % 10 == 0:
        print(f"wrote {i}")
stop = time.perf_counter()

elapsed = stop - start
print(
    f"writing {n_chunks} chunks took {elapsed} s -> {elapsed/n_chunks} s per chunk"
)
# Measured timings:
# ~100 s for 100 chunks of 80 MB each with region="auto"
# ~31 s for 100 chunks of 80 MB each with the region set manually
#   (each region is expected to hit exactly one chunk -- not yet verified)
# i.e. roughly 260 MB/s

wrote 0
wrote 10
wrote 20
wrote 30
wrote 40
wrote 50
wrote 60
wrote 70
wrote 80
wrote 90
writing 100 chunks took 30.85953950008843 s -> 0.3085953950008843 s per chunk


In [8]:
# Re-open the store that was just written.
loaded_ds = xr.open_zarr(store=path)

In [9]:
# Stack the dataset's data variables into a single DataArray along a new
# leading "variable" dimension (length 1 here, since only "foo" exists).
da = loaded_ds.to_dataarray(dim="variable")

In [10]:
# Exporting the whole dataset (~7.5 GB) to netCDF takes roughly 30 s.
loaded_ds.to_netcdf(path="foo.nc", engine="h5netcdf")

In [11]:
# Display the re-opened dataset: still dask-backed, one chunk per row of "x".
loaded_ds

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,76.29 MiB
Shape,"(100, 10000000)","(1, 10000000)"
Dask graph,100 chunks in 2 graph layers,100 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.45 GiB 76.29 MiB Shape (100, 10000000) (1, 10000000) Dask graph 100 chunks in 2 graph layers Data type float64 numpy.ndarray",10000000  100,

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,76.29 MiB
Shape,"(100, 10000000)","(1, 10000000)"
Dask graph,100 chunks in 2 graph layers,100 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [12]:
# Display the stacked DataArray; note the extra leading dimension of length 1.
da

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,76.29 MiB
Shape,"(1, 100, 10000000)","(1, 1, 10000000)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.45 GiB 76.29 MiB Shape (1, 100, 10000000) (1, 1, 10000000) Dask graph 100 chunks in 3 graph layers Data type float64 numpy.ndarray",10000000  100  1,

Unnamed: 0,Array,Chunk
Bytes,7.45 GiB,76.29 MiB
Shape,"(1, 100, 10000000)","(1, 1, 10000000)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
