# EXAMPLE NOTEBOOK: 24 hour rolling mean of surface air pressure
### The purpose of this notebook is to demonstrate computing a 24-hour rolling mean over the hourly data that has been saved to Zarr

In [1]:
import os
import iris
import copy
import zarr
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

import crd_utils as crd

In [2]:
import getpass

# Ask for the storage account key interactively so the secret is never stored
# in the notebook. The explicit prompt tells the user which secret is expected;
# the value is passed to the zarr ABSStore constructors in later cells.
account_key = getpass.getpass(prompt='Azure storage account key: ')

 ························································································


In [3]:
# Source data: the hourly zarr stored under this prefix of the 'cssp-china'
# blob container (storage account 'awsearth'), accessed through zarr's ABSStore.
prefix_read = 'zarr_hourly_1851-1859'
zarr_read = zarr.storage.ABSStore(
    'cssp-china',
    prefix=prefix_read,
    account_name='awsearth',
    account_key=account_key,
    blob_service_kwargs=None,
)
# Echo the path-style form of the location for reference.
# NOTE(review): presumably the container is also mounted at /data/cssp-china — confirm.
print(f'/data/cssp-china/{prefix_read}')

/data/cssp-china/zarr_hourly_1851-1859


In [4]:
%%time
ds = xr.open_zarr(zarr_read)
ds

CPU times: user 1.01 s, sys: 44.9 ms, total: 1.05 s
Wall time: 2.51 s


In [5]:
# Size of the whole dataset in gigabytes (bytes / 1e9).
ds_gigabytes = ds.nbytes / 1e9
ds_gigabytes

57.945407596

In [41]:
# Pull out the surface air pressure variable, wrap it back into a Dataset,
# and report that subset's size in gigabytes.
sap = ds['surface_air_pressure'].to_dataset()
sap.nbytes / 1e9

19.374937644

### Using dask.distributed, let's compute the rolling mean

In [10]:
import os
import distributed
import dask
from dask_kubernetes import KubeCluster
from dask import array as da

In [11]:
# Start a Dask cluster on Kubernetes and request a fixed pool of 20 workers.
# (Adaptive scaling, e.g. cluster.adapt(minimum=1), is an alternative not used here.)
cluster = KubeCluster()
cluster.scale(20)
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:  tcp://10.244.0.246:41175
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [12]:
# Connect a distributed client to the Kubernetes cluster and show its summary.
# NOTE(review): the recorded output reports "Workers: 0" — presumably the workers
# requested by the preceding scale call had not started yet; confirm they are up
# before relying on any timing figures.
client = distributed.Client(address=cluster)
client

distributed.scheduler - INFO - Receive client connection: Client-4fde2e8a-5fba-11ea-8904-8e5c65359264
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.244.0.246:41175  Dashboard: /user/kaedonkers/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


## Create a rolling window for a Dataset using Xarray

In [44]:
# Rolling window of 24 steps along `time` (24 hours for hourly data), trailing
# rather than centred. min_periods is left at its default, so windows with fewer
# than 24 values (the first 23 time steps) yield NaN.
sap_roll = sap.rolling({'time': 24}, center=False)
sap_roll

DatasetRolling [window->24,center->False,dim->time]

In [45]:
%%time
# Mean over each 24-step rolling window defined by sap_roll.
# NOTE(review): the sub-second time recorded for this cell suggests the result is
# still lazy at this point and the real work happens when it is written — confirm.
sap_rmean = sap_roll.mean()

CPU times: user 349 ms, sys: 13.4 ms, total: 362 ms
Wall time: 600 ms


## Write the result to a Zarr store

In [26]:
# Destination for the rolling-mean output: a separate prefix in the same
# 'cssp-china' container, plus the path-style form of that location.
prefix_write = 'zarr_rolling_24hr_mean_surfaceairpressure'
zarr_write_path = '/data/cssp-china/' + prefix_write
zarr_write = zarr.storage.ABSStore(
    'cssp-china',
    prefix=prefix_write,
    account_name='awsearth',
    account_key=account_key,
    blob_service_kwargs=None,
)

In [27]:
# Show the output path and whether a directory already exists there.
# NOTE(review): this inspects the local filesystem, whereas the write below goes
# through the zarr_write ABSStore — presumably the container is mounted at
# /data/cssp-china; confirm the two refer to the same location.
print(zarr_write_path)
target_exists = os.path.isdir(zarr_write_path)
target_exists

/data/cssp-china/zarr_rolling_24hr_mean_surfaceairpressure


False

In [28]:
import shutil

# Remove any previous output directory at the local path before writing.
# Done in pure Python instead of a shell `rm` so the path is never re-parsed by a
# shell and the cell is not POSIX-specific. ignore_errors=True mirrors `rm -f`:
# a missing directory is not an error.
shutil.rmtree(zarr_write_path, ignore_errors=True)

In [29]:
# Check again, after the cleanup step, that no directory is left at the output path.
still_exists = os.path.isdir(zarr_write_path)
still_exists

False

In [42]:
# Chunk layout for the output: every dimension kept as one full-length chunk,
# except `time`, which is split into chunks of 200 steps.
# Dataset.sizes is the explicit dimension-name -> length mapping; using
# Dataset.dims as a mapping is deprecated in newer xarray releases.
chunks = dict(sap.sizes, time=200)
chunks

{'grid_latitude': 219, 'grid_longitude': 286, 'time': 200}

In [46]:
%%time
sap_rmean = sap_rmean.chunk(chunks=chunks)
sap_rmean.to_zarr(zarr_write, consolidated=True, mode='w')

CPU times: user 28.1 s, sys: 2.26 s, total: 30.4 s
Wall time: 2min 8s


<xarray.backends.zarr.ZarrStore at 0x7ff8ee8d36d0>

In [47]:
# Re-open the store that was just written to check the round trip.
# The write used consolidated=True, so read the consolidated metadata as well:
# this avoids many small metadata requests against blob storage and also
# confirms that the consolidated metadata is present.
ds_z = xr.open_zarr(zarr_write, consolidated=True)
ds_z

In [48]:
# Size of the re-opened rolling-mean dataset in gigabytes (bytes / 1e9).
# A size matching the input subset is consistent with the same shape and dtype,
# but it does not validate the values themselves.
rmean_gigabytes = ds_z.nbytes / 1e9
rmean_gigabytes

19.374937644

### The 24-hour rolling mean works!
- Wall time to compute and write the result with 20 Dask workers requested: **2m 08s**