# Data pipeline for converting precipitation data `.pp` -> `Zarr`

In [1]:
import iris
import os
import xarray as xr
import numpy as np

In [2]:
def cube_to_zarr(cube, zarr_store, chunks={'time':10, 'grid_latitude':219, 'grid_longitude':286}, append_dim='time'):
    """Persist an iris cube into a Zarr store, creating or extending it.

    The cube is converted to an xarray Dataset and rechunked with ``chunks``.
    If ``zarr_store`` is not an existing directory, a new store is written
    with ``mode='w'``; otherwise the dataset is written to the existing store
    with ``append_dim`` passed through to ``to_zarr``. Consolidated metadata
    is requested in both cases, and a one-line status message is printed.

    Parameters
    ----------
    cube : iris cube to write.
    zarr_store : path of the Zarr store directory.
    chunks : mapping of dimension name to chunk size, passed to ``Dataset.chunk``.
    append_dim : dimension name handed to ``to_zarr`` when the store exists.
    """
    dataset = xr.DataArray.from_iris(cube).to_dataset().chunk(chunks=chunks)

    # No directory yet: create a fresh store and stop here.
    if not os.path.isdir(zarr_store):
        dataset.to_zarr(zarr_store, mode='w', consolidated=True)
        print(f'Written cube to {zarr_store}')
        return

    # Store directory already present: extend it.
    dataset.to_zarr(zarr_store, consolidated=True, append_dim=append_dim)
    print(f'Appended cube to {zarr_store}')

In [3]:
def pp_to_cube(filename, filepath, constraints=None):
    """Load exactly one cube from a .pp file.

    Parameters
    ----------
    filename : name of the .pp file inside ``filepath``.
    filepath : directory containing the file.
    constraints : optional mapping of attribute name to required value; it is
        expanded into an ``iris.AttributeConstraint``. ``None`` (the default)
        or an empty mapping applies no attribute filtering.

    Returns
    -------
    The single cube produced by the load.

    Raises
    ------
    ValueError
        If the load does not yield exactly one cube (tuple unpacking fails).
    """
    # Use the `filename` argument rather than a global loop variable, so the
    # function works from any caller.
    path = os.path.join(filepath, filename)
    # `constraints or {}` avoids a shared mutable default argument.
    cube, = iris.load(path, iris.AttributeConstraint(**(constraints or {})))
    print(f'Cube loaded from {filename}')
    return cube

In [4]:
def datetimes_from_cube(cube):
    """Return the values of the ``time`` coordinate of an iris cube.

    The cube is converted to an xarray DataArray and the array underlying
    its ``time`` coordinate is returned.
    """
    data_array = xr.DataArray.from_iris(cube)
    time_coord = data_array.time
    return time_coord.data

def datetimes_from_zarr(zarr_store):
    """Return the values of the ``time`` coordinate stored in a Zarr store.

    The store at ``zarr_store`` is opened with ``xr.open_zarr`` and the array
    underlying its ``time`` coordinate is returned.
    """
    dataset = xr.open_zarr(zarr_store)
    time_coord = dataset.time
    return time_coord.data

In [11]:
%%time
# Pipeline with time checking: write each file's cube to the Zarr store unless
# every one of its timestamps has already been written.
STASH = 'm01s05i216'
filepath = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files = sorted(os.listdir(filepath))
zarr_store = '/data/cssp-china/zarr_precip_append'

# On a first run the store does not exist yet (cube_to_zarr creates it), so
# start from an empty record instead of failing while opening a missing store.
if os.path.isdir(zarr_store):
    times_written = datetimes_from_zarr(zarr_store)
else:
    times_written = np.array([], dtype='datetime64[ns]')

# Only the files at sorted positions 15..24 are processed in this run.
for file in files[15:25]:
    print(f'Filename = {file}')
    cube = pp_to_cube(file, filepath, constraints={'STASH': STASH})
    times = datetimes_from_cube(cube)
    # NOTE: a file is skipped only when ALL of its timestamps are already
    # recorded; a partially overlapping file is appended in full, which would
    # duplicate the overlapping time steps in the store.
    if not set(times).issubset(times_written):
        cube_to_zarr(cube, zarr_store)
        # Keep the in-memory record in step with the store without re-reading it.
        times_written = np.append(times_written, times)
    else:
        print(f'{file} already written to {zarr_store}')

print('Finish')
display(times_written)

Filename = apepda.pa51640.pp
Cube loaded from apepda.pa51640.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa516e0.pp
Cube loaded from apepda.pa516e0.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa516o0.pp
Cube loaded from apepda.pa516o0.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa51740.pp
Cube loaded from apepda.pa51740.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa517e0.pp
Cube loaded from apepda.pa517e0.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa517o0.pp
Cube loaded from apepda.pa517o0.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa51830.pp
Cube loaded from apepda.pa51830.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa518d0.pp
Cube loaded from apepda.pa518d0.pp
Appended cube to /data/cssp-china/zarr_precip_append
Filename = apepda.pa518n0.pp
Cube loaded from apepda.pa518n0.pp


array(['1851-01-05T12:00:00.000000000', '1851-01-06T12:00:00.000000000',
       '1851-01-07T12:00:00.000000000', '1851-01-08T12:00:00.000000000',
       '1851-01-09T12:00:00.000000000', '1851-01-10T12:00:00.000000000',
       '1851-01-11T12:00:00.000000000', '1851-01-12T12:00:00.000000000',
       '1851-01-13T12:00:00.000000000', '1851-01-14T12:00:00.000000000',
       '1851-01-15T12:00:00.000000000', '1851-01-16T12:00:00.000000000',
       '1851-01-17T12:00:00.000000000', '1851-01-18T12:00:00.000000000',
       '1851-01-19T12:00:00.000000000', '1851-01-20T12:00:00.000000000',
       '1851-01-21T12:00:00.000000000', '1851-01-22T12:00:00.000000000',
       '1851-01-23T12:00:00.000000000', '1851-01-24T12:00:00.000000000',
       '1851-01-25T12:00:00.000000000', '1851-01-26T12:00:00.000000000',
       '1851-01-27T12:00:00.000000000', '1851-01-28T12:00:00.000000000',
       '1851-01-29T12:00:00.000000000', '1851-01-30T12:00:00.000000000',
       '1851-01-31T12:00:00.000000000', '1851-02-01

CPU times: user 14.8 s, sys: 1.73 s, total: 16.5 s
Wall time: 54.8 s


In [6]:
# Show the path of the store that the following cells inspect.
zarr_store

'/data/cssp-china/zarr_precip_append'

In [7]:
# Open the Zarr store as an xarray Dataset to inspect what the pipeline wrote.
ds = xr.open_zarr(zarr_store)
ds

<xarray.Dataset>
Dimensions:                  (grid_latitude: 219, grid_longitude: 286, time: 140)
Coordinates:
    forecast_period          (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time  datetime64[ns] ...
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-05-24T12:00:00
Data variables:
    precipitation_flux       (time, grid_latitude, grid_longitude) float32 dask.array<chunksize=(10, 219, 286), meta=np.ndarray>

In [8]:
# Inspect the precipitation_flux variable together with its coordinates and attributes.
ds.precipitation_flux

<xarray.DataArray 'precipitation_flux' (time: 140, grid_latitude: 219, grid_longitude: 286)>
dask.array<zarr, shape=(140, 219, 286), dtype=float32, chunksize=(10, 219, 286), chunktype=numpy.ndarray>
Coordinates:
    forecast_period          (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time  datetime64[ns] ...
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-05-24T12:00:00
Attributes:
    STASH:          [1, 5, 216]
    cell_methods:   time: mean (interval: 1 hour)
    source:         Data from Met Office Unified Model
    standard_name:  precipitation_flux
    units:          kg m-2 s-1

In [9]:
# Show the array backing precipitation_flux (shape, chunking and dtype).
ds.precipitation_flux.data

Unnamed: 0,Array,Chunk
Bytes,35.08 MB,2.51 MB
Shape,"(140, 219, 286)","(10, 219, 286)"
Count,15 Tasks,14 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 35.08 MB 2.51 MB Shape (140, 219, 286) (10, 219, 286) Count 15 Tasks 14 Chunks Type float32 numpy.ndarray",286  219  140,

Unnamed: 0,Array,Chunk
Bytes,35.08 MB,2.51 MB
Shape,"(140, 219, 286)","(10, 219, 286)"
Count,15 Tasks,14 Chunks
Type,float32,numpy.ndarray


In [10]:
# Convert the stored variable back into an iris cube and display it.
cube = ds.precipitation_flux.to_iris()
cube

Precipitation Flux (kg m-2 s-1),time,grid_latitude,grid_longitude
Shape,140,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
Attributes,,,
