In [1]:
import pickle
from datetime import datetime
from itertools import product
import collections

import iris
import cf_units
import dask
import dask.bag as db
import distributed
from dask_kubernetes import KubeCluster
import numpy as np

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Optional for retina displays

In [4]:
# NOTE(review): `cluster` is not assigned in any cell visible here, although
# KubeCluster is imported above — confirm which cell creates it so the
# notebook survives Restart & Run All.
cluster

In [3]:
# Attach a distributed client to the scheduler address exposed by `cluster`.
client = distributed.Client(address=cluster.scheduler_address)
client

0,1
Client  Scheduler: tcp://100.96.128.114:45371  Dashboard: http://100.96.128.114:33713/status,Cluster  Workers: 10  Cores: 10  Memory: 30.00 GB


In [4]:
def format_filename(dataset_name, year, month, day, run, realization, forecast_period):
    """Build the /s3 path of a single forecast file.

    The dataset name is used twice: once as the directory and once inside
    the file name. Year, month, day, run and realization are zero-padded to
    at least two digits; forecast_period to at least three.

    Returns the formatted path as a string.
    """
    fields = (
        dataset_name, dataset_name,
        year, month, day,
        run, realization, forecast_period,
    )
    return "/s3/{}/prods_op_{}_{:02d}{:02d}{:02d}_{:02d}_{:02d}_{:03d}.nc".format(*fields)

def gen_filenames(dataset_name, years, months, days, runs, realizations, forecasts):
    """Return the file path for every combination of the given fields.

    Combinations come from itertools.product, so ``forecasts`` varies
    fastest and ``years`` slowest in the returned list.

    Parameters:
        dataset_name: forwarded to format_filename for every path.
        years, months, days, runs, realizations, forecasts: iterables of
            ints (format_filename formats each value with a ``d`` spec).

    Returns:
        list of str, one path per combination.
    """
    # Forward the caller's dataset_name instead of a hard-coded literal so
    # that datasets other than 'mogreps-g' can be enumerated.
    return [
        format_filename(dataset_name, year, month, day, run, realization, forecast)
        for year, month, day, run, realization, forecast
        in product(years, months, days, runs, realizations, forecasts)
    ]

# Enumerate mogreps-g files for 1-2 January 2016: runs 0 and 12,
# realizations 0-11, forecast periods 9, 12, ..., 174.
filenames = gen_filenames(
    dataset_name='mogreps-g',
    years=(2016,),
    months=(1,),
    days=(1, 2),
    runs=(0, 12),
    realizations=range(0, 12),
    forecasts=range(9, 175, 3),
)

# Sanity check: show the first few paths and the total count.
print(filenames[:3])
print(len(filenames))

['/s3/mogreps-g/prods_op_mogreps-g_20160101_00_00_009.nc', '/s3/mogreps-g/prods_op_mogreps-g_20160101_00_00_012.nc', '/s3/mogreps-g/prods_op_mogreps-g_20160101_00_00_015.nc']
2688


In [6]:
# Fast: the callback reads coordinate points/bounds with dask's threaded scheduler selected.
def serial_load(cube, field, filename):
    """iris load callback that reads coordinate data under the threaded scheduler.

    Reads ``points`` and ``bounds`` of every coordinate returned by
    ``cube.coords(dim_coords=False)`` while dask's threaded scheduler is
    selected, then returns the same cube.

    ``field`` and ``filename`` are part of the signature because this
    function is passed as ``callback=`` to ``iris.load_raw``; they are not
    used here.

    Presumably the attribute reads are what realise lazily loaded
    coordinate data (to be confirmed against the iris documentation).
    """
    # dask.set_options and the get= keyword were deprecated in dask 0.18 in
    # favour of dask.config.set(scheduler=...). Prefer the current API when
    # it exists and fall back to the original call on older installs.
    if hasattr(dask, "config"):
        scheduler_ctx = dask.config.set(scheduler="threads")
    else:
        scheduler_ctx = dask.set_options(get=dask.threaded.get)

    with scheduler_ctx:
        for coord in cube.coords(dim_coords=False):
            # Only the attribute access matters; the values are discarded.
            _ = coord.points
            _ = coord.bounds
    return cube


@dask.delayed
def load_s(fname):
    """Delayed ``iris.load_raw`` of ``fname`` using ``serial_load`` as the callback."""
    raw_cubes = iris.load_raw(fname, callback=serial_load)
    return raw_cubes


# Slow: same callback body, but without selecting the threaded scheduler.
def distributed_load(cube, field, filename):
    """iris load callback that reads coordinate data with the ambient scheduler.

    Reads ``points`` then ``bounds`` of every coordinate returned by
    ``cube.coords(dim_coords=False)`` and returns the same cube. No dask
    scheduler is selected here, so whichever one is active is used.

    ``field`` and ``filename`` are accepted for the callback signature and
    are not used.
    """
    for coord in cube.coords(dim_coords=False):
        # Only the attribute access matters; the values are discarded.
        for attr_name in ("points", "bounds"):
            getattr(coord, attr_name)
    return cube

@dask.delayed
def load_d(fname):
    """Delayed ``iris.load_raw`` of ``fname`` using ``distributed_load`` as the callback."""
    raw_cubes = iris.load_raw(fname, callback=distributed_load)
    return raw_cubes


In [41]:
%%time
# warning: slow
# One delayed load_d call per file (first 20 only), collected into a dask
# bag and computed into a single CubeList.
cubes = db.from_delayed(list(map(load_d, filenames[:20])))
results = iris.cube.CubeList(cubes.compute())

CPU times: user 11.9 s, sys: 1.5 s, total: 13.4 s
Wall time: 24.7 s


In [42]:
%%timeit
merged_results = results.merge()

551 ms ± 40.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
%%time
# One delayed load_s call per file (first 20 only), collected into a dask
# bag and computed into a single CubeList.
cubes2 = db.from_delayed(list(map(load_s, filenames[:20])))
results2 = iris.cube.CubeList(cubes2.compute())

CPU times: user 800 ms, sys: 32 ms, total: 832 ms
Wall time: 5.53 s


In [44]:
%%timeit
merged_results2 = results2.merge()

543 ms ± 28.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
# Show the merged cubes. `merged_results2` must already be bound in the
# kernel namespace; an assignment made inside a %%timeit cell body does not
# bind it.
merged_results2

[<iris 'Cube' of wet_bulb_freezing_level_altitude / (m) (time: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of wet_bulb_potential_temperature / (K) (time: 10; pressure: 3; latitude: 600; longitude: 800)>,
<iris 'Cube' of air_pressure_at_sea_level / (Pa) (time: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of air_temperature / (K) (forecast_period: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of air_temperature / (K) (forecast_period: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of air_temperature / (K) (time: 10; pressure: 16; latitude: 600; longitude: 800)>,
<iris 'Cube' of air_temperature / (K) (time: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of dew_point_temperature / (K) (time: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of fog_area_fraction / (1) (time: 10; latitude: 600; longitude: 800)>,
<iris 'Cube' of geopotential_height / (m) (time: 10; pressure: 9; latitude: 600; longitude: 800)>,
<iris 'Cube' of high_type_cloud_area_fraction / (1) (time: 10;