in this note book I intend to explore methods/functions/classes that could be created to make creating a 'hyper cube' quick and easy

In [34]:
# The files to work with
import glob
data_dir = '/s3/informatics-eupheme/'
sub_dir = data_dir + 'HadGEM3-A-N216/historical/tas/Amon/'
files = sorted(glob.glob(sub_dir + '*.nc'))

In [35]:
# Helper methods to define coords
import iris
from collections import namedtuple
import cf_units
import dask.array as da
import numpy as np

def coord_from_cube(cube, coord_name, redefined_points=None,redefined_bounds=None):
    coord = cube.coord(coord_name).copy(points=redefined_points, bounds=redefined_bounds)
    return coord

def coord_from_points(points, units='1', standard_name=None, long_name=None, var_name=None, bounds=None):
    return iris.coords.Coord(points, long_name=long_name, units=units, standard_name=standard_name, bounds=bounds)

CoordAgg = namedtuple('CoordAgg', ['type', 'coord'])

In [36]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sample_cube = iris.load_raw(files[5])[0]

In [37]:
[c.name() for c in sample_cube.dim_coords]

['time', 'latitude', 'longitude']

In [38]:
latitude = coord_from_cube(sample_cube, 'latitude')
longitude = coord_from_cube(sample_cube,'longitude')
time = coord_from_cube(sample_cube,'time',range(15,19425+1,30)) # 1960-01-15 to 2013-12-15 in 'days since 1960-01-01' on 360 day calander one month (30 day) steps
pyhsics = coord_from_points(range(1, 15+1), long_name='physics')

coords = [CoordAgg('stack', pyhsics), CoordAgg('concatenate', time), CoordAgg('const', latitude), CoordAgg('const', longitude)]





In [5]:
VNAME = "tas"

def stack_over(coord, getter ):
    def stack(*args):
        to_stack = list(getter(*args, p) for p in coord.points)
        print("Stack", to_stack, type(to_stack[0]))
        return da.stack(to_stack)
    return stack

def concatenate_over(coord, getter ):
    def concatenate(*args):
        to_concatenate = list(getter(*args, p) for p in coord.points)
        print("concatenate", to_concatenate, type(to_concatenate[0]))
        return da.concatenate(to_concatenate)
    return concatenate

def file_details(pysics_index, time_index): 
    data_dir = '/s3/informatics-eupheme/'
    sub_dir = data_dir + 'HadGEM3-A-N216/historical/tas/Amon/'
    time = cf_units.num2date(time_coord_points[time_index], TUNIT,cf_units.CALENDAR_STANDARD)
    syear = str(time.year)[:3] + '0'
    eyear = syear[:3] + '9' 
    shape = (120, 324, 432) 
    if syear == '2010':
        eyear = '2013'
        shape = (48, 324, 432) 
    basename = "tas_Amon_HadGEM3-A-N216_historical_r1i1p{physics}_{syear}01-{eyear}12.nc".format(
        syear=syear, eyear=eyear, physics=(pysics_index +1))
    path =  os.path.join(data_dir, sub_dir, basename)
    return path, shape


def caching_getter():
    file_details(p, t)

def getter(p, t):
    file, shape = file_details(p, t)
    

def cached_file_to_array(cache, file, shape):
    data = cache.get(file, None)
    if not data:
        data = file_to_array(file, shape)
    cache[file] = data
    return data 
    
def file_to_array(file, shape):
    data = iris.fileformats.netcdf.NetCDFDataProxy(
                shape,
                'float32',
                file,
                VNAME,
                None)
    data = da.from_array(data, data.shape)
 


# def dim_concatenate(coord, getter ):
#     to_concatenate = list(getter(p) for p in coord.points)
#     return da.concatenate(to_stack)

class Persist(object)
    def get_file(physics, time):
        item = da.from_array(np.array(["(%s,%s)"%(physics, time)]), 1)
        print("item", type(item))
        return item, item.shape




def get_file(physics, time):
    item = da.from_array(np.array(["(%s,%s)"%(physics, time)]), 1)
    print("item", type(item))
    return item, item.shape

pcoord = namedtuple('Coord',['points'])([1,2,3])
tcoord = namedtuple('Coord',['points'])([1,2,3])

the_agg = dim_stack(pcoord, 
              dim_stack(tcoord, get_file))()
the_agg.compute()

item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
Stack [dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>] <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
Stack [dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>] <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
item <class 'dask.array.core.Array'>
Stack [dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>, dask.array<array, shape=(1,), dtype=<U5, chunksize=(1,)>] <class 'dask.array.core.Array'>
Stack [dask.array<stack, sha

array([[['(1,1)'],
        ['(1,2)'],
        ['(1,3)']],

       [['(2,1)'],
        ['(2,2)'],
        ['(2,3)']],

       [['(3,1)'],
        ['(3,2)'],
        ['(3,3)']]], dtype='<U5')

In [47]:
for p in range(0, len(physics)):
    for t in range(0, len(time)):
        file

In [40]:
def builder(coords, file_finder):
    constants = []
    while coords[-1].type == 'const':
        coords = coords[:-1]
    
    top_level_coord, *other_coords = coords 
    for point in top_level_coord.points:
        if
        
def build(coords, file_finder):
    top_level_coord, *other_coords = coords 
    for point in top_level_coord.points:
        if len(other_coords) > 0:
            arrays = build(coords)
            if(top_level_coord.type == 'stack'):
                 da.stack(p_arrays, 0)
        else:
            file, shape = file_details(t, p)
            data = iris.fileformats.netcdf.NetCDFDataProxy(
                            shape, 'float32', file, VNAME, None)
            data = da.from_array(data, data.shape)
        
        
    
    

ValueError: 'physics' is not a valid standard_name

In [7]:
import netCDF4
class NetCDFDataProxy(object):
    """A reference to the data payload of a single NetCDF file variable."""

    __slots__ = ('shape', 'dtype', 'path', 'variable_name', 'fill_value')

    def __init__(self, shape, dtype, path, variable_name, fill_value):
        self.shape = shape
        self.dtype = dtype
        self.path = path
        self.variable_name = variable_name
        self.fill_value = fill_value

    @property
    def ndim(self):
        return len(self.shape)

    def __getitem__(self, keys):
        print('__getitem__', keys)
        dataset = netCDF4.Dataset(self.path)
        try:
            variable = dataset.variables[self.variable_name]
            # Get the NetCDF variable data and slice.
            var = variable[keys]
        finally:
            dataset.close()
        return np.asanyarray(var)

    def __repr__(self):
        fmt = '<{self.__class__.__name__} shape={self.shape}' \
              ' dtype={self.dtype!r} path={self.path!r}' \
              ' variable_name={self.variable_name!r}>'
        return fmt.format(self=self)

    def __getstate__(self):
        return {attr: getattr(self, attr) for attr in self.__slots__}

    def __setstate__(self, state):
        for key, value in six.iteritems(state):
            setattr(self, key, value)



file, shape = ('/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p2_196001-196912.nc', (120, 324, 432))
data = NetCDFDataProxy(
                shape,
                'float32',
                file,
                'tas',
                None)

It's not clear from the above but the the problem become hard if you work from the top down, i.e. take the desierd hypercube and try work out the path through the aggrigations down to the individual files you might need. I think the problem will be easier if working the other way. Start with the individual files and 'build them up' in to a hypercube. 

In [44]:
import re

def date_to_num(year, month, day):
    TUNIT = 'days since 1960-01-01'
    syear, smonth, sday = TUNIT.rstrip().split(' ')[-1].split('-')
    assert sday == smonth and int(sday) == 1
    assert TUNIT.strip().startswith('days since')
    start_year = int(syear)
    return (year - start_year) * 360 + (month - 1 )* 30 + ( day -1)

def file_to_coords(file):
    Dim = namedtuple('Dim', [ 'name','points','as_dim_in_file'])
    match = re.match("tas_Amon_HadGEM3-A-N216_historical_r1i1p([0-9]+)_([0-9]{6})-([0-9]{6}).nc", file)
    physics, start, end = match.groups()
    physics = int(physics)
    syear, smonth = int(start[:4]), int(start[4:])
    eyear, emonth = int(end[:4]), int(end[4:])
    
    physics = Dim('physics', physics, False)
    time = Dim('time', list(range(date_to_num(syear, smonth, 15), date_to_num(eyear, emonth, 15)+1, 30)), True)
    lat = Dim('latitude', latitude.points, True)
    lon = Dim('longitude', longitude.points, True)

    return (physics, time, lat, lon)

def file_names():
    files = []
    for physics in range(1,15+1):
        for decade in range (1970, 2010+1, 10):
            start = decade
            end = decade+9 if decade < 2010 else 2013
            rangestr = "%d01-%d12" %(start, end)
            files.append('tas_Amon_HadGEM3-A-N216_historical_r1i1p%d_%d01-%d12.nc' % (physics, start, end))
    return files
#     decades = 1970
#     'tas_Amon_HadGEM3-A-N216_historical_r1i1p15_201001-201312.nc',
#     'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_196001-196912.nc',
#     'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_197001-197912.nc',

file ='tas_Amon_HadGEM3-A-N216_historical_r1i1p2_196001-196912.nc'
file_names()

['tas_Amon_HadGEM3-A-N216_historical_r1i1p1_197001-197912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_198001-198912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_199001-199912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_200001-200912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p1_201001-201312.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p2_197001-197912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p2_198001-198912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p2_199001-199912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p2_200001-200912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p2_201001-201312.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p3_197001-197912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p3_198001-198912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p3_199001-199912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p3_200001-200912.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p3_201001-201312.nc',
 'tas_Amon_HadGEM3-A-N216_historical_r1i1p4_197001-1979

In [55]:
class AggriableNetCDFProxy(iris.fileformats.netcdf.NetCDFDataProxy):
    def __init__(self, dims, path, variable_name):
        shape = [len(d.points) for d in dims if d.as_dim_in_file]
        self.dims = dims
        super().__init__(shape, 'float32', path, variable_name, None)
    
     
    
def build_ndarray(var, coords_and_file):
    coord, file = coords_and_file
    NetCDFDataProxy(
                shape,
                'float32',
                file,
                var,
                None)
    
    
data = AggriableNetCDFProxy(file_to_coords(file), file, 'tas')
ddata = da.from_array(data, data.shape)

def concat(dim_name, aggriables):
    pass
    
    
    
# AggStack('physics', 
#          agg_concat('time',
#                 build_ndarray(
#                   file_to_coords(file) for file in file_names))

In [56]:
data.shape

[120, 324, 432]

In [58]:
ddata.var


<bound method Array.var of dask.array<array, shape=(120, 324, 432), dtype=float32, chunksize=(120, 324, 432)>>