## Load common functions, libs and vars

In [13]:
# Print a hard-coded version stamp for this notebook.
print("version - 15/02/19 14:18")

version - 15/02/19 14:18


In [1]:
# %load common.py
import iris
import xarray
import os
from numcodecs import Blosc
import s3fs
import zarr
import intake
from datetime import datetime, timezone, timedelta
import cf_units
import json
import  dask_kubernetes
import distributed
import boto3
from iris.experimental.equalise_cubes import equalise_attributes
import pandas as pd
import sys
import numpy as np

sys.path.append(os.path.normpath(os.getcwd()))
from offsetmap import OffSetS3Map

# Module-level SQS client shared by del_msg() and get_messages().
sqs = boto3.client('sqs')

# strftime/strptime layout for timestamps such as '2019-02-11T15:00:00Z'.
AWS_EARTH_TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
# SQS_QUEUE_URL = 'https://sqs.eu-west-2.amazonaws.com/536099501702/aws-earth-test'
# Queue that del_msg() and get_messages() operate on.
SQS_QUEUE_URL = 'https://sqs.eu-west-2.amazonaws.com/536099501702/rolling_zarr_test_queue'

# S3 bucket that get_zar_path() uses as the root of every zarr path.
BUCKET = "metoffice-aws-earth-zarr"

def del_msg(msg):
    """Delete a previously received message from the queue at ``SQS_QUEUE_URL``.

    ``msg`` must carry the SQS receipt handle under the ``'receipt_handle'``
    key, which is where ``get_messages`` stores it.
    """
    delete_kwargs = {
        'QueueUrl': SQS_QUEUE_URL,
        'ReceiptHandle': msg['receipt_handle'],
    }
    sqs.delete_message(**delete_kwargs)
    
def get_messages(max_num=10):
    """Receive up to ``max_num`` messages from ``SQS_QUEUE_URL`` and decode them.

    Each message body is parsed as JSON, and the message's SQS receipt handle
    is stored on the decoded dict under ``'receipt_handle'`` so that it can
    later be passed to ``del_msg``.

    Returns:
        A list of decoded message dicts. The list is empty when the queue
        returned no messages.
    """
    res = sqs.receive_message(
        QueueUrl=SQS_QUEUE_URL,
        MaxNumberOfMessages=max_num,
        VisibilityTimeout=60 * 10,  # value is in seconds, i.e. 10 minutes
    )
    messages = []
    # The response has no 'Messages' key at all when nothing was received,
    # so fall back to an empty list instead of raising KeyError.
    for message in res.get('Messages', []):
        msg = json.loads(message['Body'])
        msg['receipt_handle'] = message['ReceiptHandle']
        messages.append(msg)
    return messages

def get_zar_path(meta):
    """Build the ``<BUCKET>/<model>-<name>[suffixes].zarr`` path for ``meta``.

    Suffixes are appended in this order:
      * ``-<cell_methods>`` when ``meta['cell_methods']`` is present and truthy;
      * ``-at_heights`` when ``meta['height']`` holds more than one
        space-separated value;
      * ``-at_pressures`` when ``meta['pressure']`` holds more than one
        space-separated value.
    """
    def _lists_several_values(key):
        # True only for a truthy entry with more than one space-separated item.
        if not meta.get(key, False):
            return False
        return len(meta[key].strip().split(' ')) > 1

    pieces = [f"{BUCKET}/{meta['model']}-{meta['name']}"]
    if meta.get('cell_methods', False):
        pieces.append(f"-{meta['cell_methods']}")
    if _lists_several_values('height'):
        pieces.append('-at_heights')
    if _lists_several_values('pressure'):
        pieces.append('-at_pressures')
    pieces.append('.zarr')
    return ''.join(pieces)
    

def zarr_store(meta):
    """Return an ``OffSetS3Map`` rooted at the zarr path derived from ``meta``.

    ``meta['name']`` is passed as ``temp_chunk_path`` and ``check`` is disabled.
    """
    store_root = get_zar_path(meta)
    return OffSetS3Map(root=store_root, temp_chunk_path=meta['name'], check=False)
    
def msg_to_path(msg):
    """Return the ``/s3/<bucket>/<key>`` path for a message dict.

    ``msg`` must provide the ``'bucket'`` and ``'key'`` entries.
    """
    return '/s3/{bucket}/{key}'.format(bucket=msg["bucket"], key=msg["key"])

def reshape_to_dest_cube(cube):
    """Apply ``iris.util.new_axis`` to ``cube`` twice and return the result.

    The first call is made for ``'forecast_period'`` and the second, on the
    result of the first, for ``'forecast_reference_time'``.
    """
    with_forecast_period = iris.util.new_axis(cube, 'forecast_period')
    return iris.util.new_axis(with_forecast_period, 'forecast_reference_time')


def get_proto_zarr_array(meta):
    """Open the zarr array stored at ``<zarr path>/<meta['name']>``.

    The array is opened through an ``OffSetS3Map`` whose root is the zarr path
    for ``meta`` joined with ``meta['name']``, using an empty
    ``temp_chunk_path`` and with ``check`` disabled.
    """
    # NOTE(review): this store is constructed and immediately discarded. It is
    # kept because the constructor's side effects are not visible here —
    # TODO confirm it is safe to remove.
    OffSetS3Map(root=get_zar_path(meta), temp_chunk_path=meta['name'], check=False)

    array_root = '/'.join((get_zar_path(meta), meta['name']))
    array_store = OffSetS3Map(root=array_root, temp_chunk_path='', check=False)
    return zarr.open(array_store)

def meta_from_zarr_path(path):
    """Rebuild a metadata dict from a zarr path (without the bucket prefix).

    This is a partial inverse of ``get_zar_path``: the path is expected to
    look like ``<model>-<name>[-at_heights][-at_pressures].zarr``.

    Args:
        path: zarr path starting with one of the known model names.

    Returns:
        A dict with ``'model'`` and ``'name'``, plus ``'height'`` and/or
        ``'pressure'`` when the matching ``-at_heights`` / ``-at_pressures``
        marker is present in ``path``. The height/pressure values are
        placeholders: they only need to list more than one space-separated
        value so that ``get_zar_path`` re-creates the same suffix.

    Raises:
        ValueError: if ``path`` does not start with a known model name.
    """
    models = ['mo-atmospheric-global-prd', 'mo-atmospheric-mogreps-g-prd', 'mo-atmospheric-ukv-prd', 'mo-atmospheric-mogreps-uk-prd']
    model = next((candidate for candidate in models if path.startswith(candidate)), None)
    if model is None:
        raise ValueError(f"Path {path!r} does not start with a known model; expected one of {models}")

    meta = {'model': model}

    # Drop the file extension, then the leading "<model>-" prefix.
    decompose = path.rsplit('.', 1)[0]
    decompose = decompose.replace(model + '-', '', 1)

    # Use `in` rather than the truthiness of str.find(): find() returns -1
    # (truthy) when the marker is missing, which previously set a height on
    # every path.
    if '-at_heights' in path:
        meta['height'] = "0 1 2 3"
        decompose = decompose.replace('-at_heights', '')

    if '-at_pressures' in path:
        meta['pressure'] = "1000 2000 3000"
        decompose = decompose.replace('-at_pressures', '')

    meta['name'] = decompose

    return meta

    /opt/conda/lib/python3.6/site-packages/intake_iris/netcdf.py
and
    /opt/conda/lib/python3.6/site-packages/intake_xarray/netcdf.py
Keeping plugin from first location.
  % (plugin_name, orig_path, new_path))


## Update the zarr metadata
The only axis that we need to update is the `forecast_reference_time`. Looking at the latest run below, we can see that for the final forecast step the created time lags the forecast time by about 4-5 hours. Using that, we can take the current time, derive what we expect the latest `forecast_reference_time` to be, and then calculate what the dimension size needs to be (from the `origin`) to account for this data.

In [2]:
def estimate_latest_avaliable_run(meta):
    """Estimate the most recent run time expected to be available, in UTC.

    Starts from the current time minus five hours, steps back one hour at a
    time until the hour is one of 03, 09, 15 or 21, and then zeroes the
    minutes, seconds and microseconds.

    Raises:
        RuntimeError: if ``meta['model']`` is not
            ``mo-atmospheric-mogreps-uk-prd``.
    """
    if meta['model'] != "mo-atmospheric-mogreps-uk-prd":
        raise RuntimeError(f"Can only guess latest run for model mo-atmospheric-mogreps-uk-prd. Got {meta['model']}")

    run_hours = (3, 9, 15, 21)
    one_hour = timedelta(hours=1)
    publish_delay = timedelta(hours=5)  # about a 5 hour delay from run time to being up.

    candidate = datetime.now().astimezone(timezone.utc) - publish_delay
    while candidate.hour not in run_hours:
        candidate -= one_hour
    return candidate.replace(minute=0, second=0, microsecond=0)
                           
def estimate_earliest_avaliable_run(meta):
    """Estimate the oldest run still available: 24 hours before the latest run.

    Errors raised by ``estimate_latest_avaliable_run`` propagate unchanged.
    """
    # Assume a 24 hour rolling window.
    rolling_window = timedelta(hours=24)
    latest_run = estimate_latest_avaliable_run(meta)
    return latest_run - rolling_window

# Show the estimated bounds of the rolling window for the MOGREPS-UK model.
print(f"""
Earliest run: {estimate_earliest_avaliable_run({'model':"mo-atmospheric-mogreps-uk-prd"})}
Latest run:   {estimate_latest_avaliable_run({'model':"mo-atmospheric-mogreps-uk-prd"})}                           
""")        
                 
                           


Earliest run: 2019-02-14 03:00:00+00:00
Latest run:   2019-02-15 03:00:00+00:00                           



In [None]:

# Scratch cell: example run timestamps in the '%Y-%m-%dT%H:%M:%SZ' layout.
# The resulting tuple is not assigned to any name.
'2019-02-11T15:00:00Z', 
'2019-02-11T21:00:00Z',
'2019-02-12T03:00:00Z',
'2019-02-12T09:00:00Z'
 

Given that we know which data we expect to exist, it is a matter of working out the length along this axis that the zarr metadata needs to grow to in order to include it.

In [3]:
# parameters
# Path of the zarr to update; it is parsed by meta_from_zarr_path() in the next cell.
zarr_path = 'mo-atmospheric-mogreps-uk-prd-air_temperature-at_heights.zarr'

In [4]:
# Rebuild the metadata dict (model, name, height/pressure markers) from the zarr path.
msg = meta_from_zarr_path(zarr_path)

In [5]:
# Open the destination array and read the per-dimension origin description
# stored in its '_origin' attribute.
dest_zarr = get_proto_zarr_array(msg)
origin = dest_zarr.attrs['_origin']
offsets = []  # not read by any later cell in this notebook

# Locate the two time dimensions in the origin description. Both pairs start
# as None so that a missing dimension trips the asserts below instead of
# raising NameError.
fcst_ref_t_dim_index, fcst_ref_t_dim = None, None
forecast_period_index, forecast_period_dim = None, None
for i, dim in enumerate(origin):
    if dim['name'] == 'forecast_reference_time':
        fcst_ref_t_dim_index, fcst_ref_t_dim = i, dim
    if dim['name'] == 'forecast_period':
        forecast_period_index, forecast_period_dim = i, dim


assert fcst_ref_t_dim is not None, "couldn't find dim fcst_ref_time in _origin attr of zarr"
assert forecast_period_dim is not None, "couldn't find dim forecast_period in _origin attr of zarr"


# Express the earliest expected run as a number of steps from the origin
# along the forecast_reference_time axis.
time_unit = cf_units.Unit(fcst_ref_t_dim['unit'])
earlist_run = estimate_earliest_avaliable_run(msg)
diff_from_origin = time_unit.date2num(earlist_run) - fcst_ref_t_dim['at']
steps_from_origin = diff_from_origin / fcst_ref_t_dim['step']
# The earliest run must fall exactly on a step of this axis.
assert steps_from_origin % 1 == 0, f"earliest run is not a whole number of steps from the origin: {steps_from_origin}"
steps_from_origin = int(steps_from_origin)
steps_from_origin

120

So the offset of the zarr along index `fcst_ref_t_dim_index` needs to be set as `steps_from_origin`. 

In [6]:
# Read the current per-dimension offsets from the array's '_offset' attribute.
offset = dest_zarr.attrs['_offset']
print(f"original offset: {offset}")
# Move the forecast_reference_time entry to the earliest expected run.
offset[fcst_ref_t_dim_index] = steps_from_origin
print(f"new offset: {offset}")
# Assign the modified list back to the '_offset' attribute.
dest_zarr.attrs['_offset'] = offset

original offset: [110, 0, 0, 0, 0, 0]
new offset: [120, 0, 0, 0, 0, 0]


## The zarr has now grown, but the metadata zarrs that xarray uses are not in sync

In [7]:
# Open the forecast_reference_time array that sits under the main zarr path.
fcst_ref_time_url = f's3://{get_zar_path(msg)}/forecast_reference_time'
fcst_ref_time_store = s3fs.S3Map(fcst_ref_time_url)
forecast_reference_time_array = zarr.open(fcst_ref_time_store)
forecast_reference_time_array


<zarr.core.Array (4,) int64>

In [8]:
# The run frequency and window length below are only asserted for this model.
assert msg['model'] == "mo-atmospheric-mogreps-uk-prd"
MODEL_FREQUENCY_HOURS = 6
ROLLING_WINDOW_HOURS = 24

# One entry per run in the window, starting at the earliest expected run.
# NOTE(review): with a 24 hour window and 6 hourly runs the last entry is six
# hours before the estimated latest run — confirm that this is intended.
runs_in_window = ROLLING_WINDOW_HOURS // MODEL_FREQUENCY_HOURS
run_spacing = timedelta(hours=MODEL_FREQUENCY_HOURS)
new_fcst_ref_times = [earlist_run + i * run_spacing for i in range(runs_in_window)]
new_fcst_ref_times


[datetime.datetime(2019, 2, 14, 3, 0, tzinfo=datetime.timezone.utc),
 datetime.datetime(2019, 2, 14, 9, 0, tzinfo=datetime.timezone.utc),
 datetime.datetime(2019, 2, 14, 15, 0, tzinfo=datetime.timezone.utc),
 datetime.datetime(2019, 2, 14, 21, 0, tzinfo=datetime.timezone.utc)]

In [9]:
# Build the time unit from the 'units' attribute of the coordinate array.
fcst_ref_time_units = forecast_reference_time_array.attrs.get('units')
unit = cf_units.Unit(fcst_ref_time_units)
unit

Unit('seconds since 1970-01-01', calendar='gregorian')

In [10]:
# The values written below are converted with int(), so require an integer array.
assert str(forecast_reference_time_array.dtype) in ['int32','int64']
# Resize to one entry per run. The new shape is passed as an explicit 1-tuple;
# the previous comma sat inside len(...), so a bare int was passed instead.
forecast_reference_time_array.resize((len(new_fcst_ref_times),))
# Overwrite the whole array with the run times expressed in the array's own unit.
forecast_reference_time_array[:] = [int(unit.date2num(time)) for time in new_fcst_ref_times]
forecast_reference_time_array[:]

array([1550113200, 1550134800, 1550156400, 1550178000])

In [12]:
# Marker printed by the last cell of the notebook.
print("End")

End
