# Data pipeline for converting precipitation data `.pp` -> `Zarr`

In [6]:
import iris
import os
import xarray as xr
import numpy as np

In [7]:
def pp_to_cube(filename, filepath, constraints=None):
    """Load exactly one Iris cube from a .pp file.

    Parameters
    ----------
    filename : str
        Name of the .pp file inside ``filepath``.
    filepath : str
        Directory that contains ``filename``.
    constraints : dict, optional
        Attribute name -> value pairs forwarded as keyword arguments to
        ``iris.AttributeConstraint``. ``None`` (the default) or an empty
        dict builds the constraint with no attributes.

    Returns
    -------
    The single cube that ``iris.load`` yields for the constraint.

    Raises
    ------
    ValueError
        If the load yields zero cubes or more than one cube.
    """
    # A None default avoids sharing one mutable dict object between calls.
    constraint = iris.AttributeConstraint(**(constraints or {}))
    cubes = iris.load(os.path.join(filepath, filename), constraint)
    # Check the count explicitly so the error names the file and constraints
    # instead of surfacing as an opaque tuple-unpacking failure.
    if len(cubes) != 1:
        raise ValueError(
            f'Expected exactly one cube in {filename} matching '
            f'{constraints or {}}, found {len(cubes)}'
        )
    cube = cubes[0]
    print(f'Cube loaded from {filename}')
    return cube

In [8]:
def cube_to_xr(cube):
    """Turn an Iris cube into an xarray Dataset.

    The cube is first wrapped as a DataArray via ``DataArray.from_iris``
    and that array is then promoted to a Dataset with ``to_dataset``.
    """
    data_array = xr.DataArray.from_iris(cube)
    as_dataset = data_array.to_dataset()
    return as_dataset

In [9]:
def xr_to_zarr(dataset, zarr_store, chunks={'time':10, 'grid_latitude':219, 'grid_longitude':286}, append_dim='time'):
    """Persist ``dataset`` to the Zarr store at ``zarr_store``.

    The dataset is re-chunked with ``chunks`` first. When ``zarr_store`` is
    already a directory the data is appended along ``append_dim``; otherwise
    a new store is written with ``mode='w'``. Metadata is consolidated in
    both cases. Returns ``None``.
    """
    rechunked = dataset.chunk(chunks=chunks)

    # No directory yet: create the store from scratch and stop.
    if not os.path.isdir(zarr_store):
        rechunked.to_zarr(zarr_store, mode='w', consolidated=True)
        print(f'Written cube to {zarr_store}')
        return

    # Directory already present: extend it along the append dimension.
    rechunked.to_zarr(zarr_store, consolidated=True, append_dim=append_dim)
    print(f'Appended cube to {zarr_store}')

In [10]:
def datetimes_from_cube(cube):
    """Return the data of the ``time`` coordinate of an Iris cube.

    The cube is converted with ``xr.DataArray.from_iris`` and the ``.data``
    of its ``time`` coordinate is handed back.
    """
    as_array = xr.DataArray.from_iris(cube)
    time_coord = as_array.time
    return time_coord.data

def datetimes_from_zarr(zarr_store):
    """Return the data of the ``time`` coordinate stored in a Zarr store.

    The store is opened with ``xr.open_zarr`` and the ``.data`` of its
    ``time`` coordinate is handed back.
    """
    stored = xr.open_zarr(zarr_store)
    time_coord = stored.time
    return time_coord.data

In [15]:
%%time
# Pipeline with time checking
STASH = 'm01s05i216'
filepath = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files = sorted(os.listdir(filepath))
zarr_store = '/data/cssp-china/zarr_for_theo'
times_written = datetimes_from_zarr(zarr_store)
# times_written = np.ndarray([])

for file in files[1:10]:
    print(f'Filename = {file}')
    cube = pp_to_cube(file, filepath, constraints={'STASH': STASH})
    times = datetimes_from_cube(cube)
    if not set(times).issubset(times_written):
        xr_to_zarr(cube_to_xr(cube), zarr_store)
        times_written = np.append(times_written, times)
    else:
        print(f'{file} already written to {zarr_store}')

print(f'Finish')
display(times_written)

Filename = apepda.pa511f0.pp
Cube loaded from apepda.pa511f0.pp
apepda.pa511f0.pp already written to /data/cssp-china/zarr_for_theo
Filename = apepda.pa511p0.pp
Cube loaded from apepda.pa511p0.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa51240.pp
Cube loaded from apepda.pa51240.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa512e0.pp
Cube loaded from apepda.pa512e0.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa512o0.pp
Cube loaded from apepda.pa512o0.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa51360.pp
Cube loaded from apepda.pa51360.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa513g0.pp
Cube loaded from apepda.pa513g0.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa513q0.pp
Cube loaded from apepda.pa513q0.pp
Appended cube to /data/cssp-china/zarr_for_theo
Filename = apepda.pa51450.pp
Cube loaded from apepda.pa51450.pp
Appended cube to /da

array(['1851-01-05T12:00:00.000000000', '1851-01-06T12:00:00.000000000',
       '1851-01-07T12:00:00.000000000', '1851-01-08T12:00:00.000000000',
       '1851-01-09T12:00:00.000000000', '1851-01-10T12:00:00.000000000',
       '1851-01-11T12:00:00.000000000', '1851-01-12T12:00:00.000000000',
       '1851-01-13T12:00:00.000000000', '1851-01-14T12:00:00.000000000',
       '1851-01-15T12:00:00.000000000', '1851-01-16T12:00:00.000000000',
       '1851-01-17T12:00:00.000000000', '1851-01-18T12:00:00.000000000',
       '1851-01-19T12:00:00.000000000', '1851-01-20T12:00:00.000000000',
       '1851-01-21T12:00:00.000000000', '1851-01-22T12:00:00.000000000',
       '1851-01-23T12:00:00.000000000', '1851-01-24T12:00:00.000000000',
       '1851-01-25T12:00:00.000000000', '1851-01-26T12:00:00.000000000',
       '1851-01-27T12:00:00.000000000', '1851-01-28T12:00:00.000000000',
       '1851-01-29T12:00:00.000000000', '1851-01-30T12:00:00.000000000',
       '1851-01-31T12:00:00.000000000', '1851-02-01

CPU times: user 13 s, sys: 1.54 s, total: 14.5 s
Wall time: 34.9 s


In [16]:
# Show the store path set in the pipeline cell above
zarr_store

'/data/cssp-china/zarr_for_theo'

In [17]:
# Re-open the Zarr store as an xarray Dataset and display it for inspection
ds = xr.open_zarr(zarr_store)
ds

<xarray.Dataset>
Dimensions:                  (grid_latitude: 219, grid_longitude: 286, time: 90)
Coordinates:
    forecast_period          (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time  datetime64[ns] ...
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-04-04T12:00:00
Data variables:
    precipitation_flux       (time, grid_latitude, grid_longitude) float32 dask.array<chunksize=(10, 219, 286), meta=np.ndarray>

In [18]:
# Display the precipitation_flux variable of the re-opened dataset
ds.precipitation_flux

<xarray.DataArray 'precipitation_flux' (time: 90, grid_latitude: 219, grid_longitude: 286)>
dask.array<zarr, shape=(90, 219, 286), dtype=float32, chunksize=(10, 219, 286), chunktype=numpy.ndarray>
Coordinates:
    forecast_period          (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time  datetime64[ns] ...
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-04-04T12:00:00
Attributes:
    STASH:          [1, 5, 216]
    cell_methods:   time: mean (interval: 1 hour)
    source:         Data from Met Office Unified Model
    standard_name:  precipitation_flux
    units:          kg m-2 s-1

In [19]:
# Display the array backing precipitation_flux (shows its shape and chunking)
ds.precipitation_flux.data

Unnamed: 0,Array,Chunk
Bytes,22.55 MB,2.51 MB
Shape,"(90, 219, 286)","(10, 219, 286)"
Count,10 Tasks,9 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 22.55 MB 2.51 MB Shape (90, 219, 286) (10, 219, 286) Count 10 Tasks 9 Chunks Type float32 numpy.ndarray",286  219  90,

Unnamed: 0,Array,Chunk
Bytes,22.55 MB,2.51 MB
Shape,"(90, 219, 286)","(10, 219, 286)"
Count,10 Tasks,9 Chunks
Type,float32,numpy.ndarray


In [20]:
# Round trip: convert the stored variable back to an Iris cube and display it
# (this rebinds the `cube` name used inside the pipeline loop)
cube = ds.precipitation_flux.to_iris()
cube

Precipitation Flux (kg m-2 s-1),time,grid_latitude,grid_longitude
Shape,90,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
Attributes,,,
