This is about the simplest code to generate the 'hyper cube' from 'HadGEM3-A-N216/historical/tas/Amon/' data 

In [7]:
import glob
import iris
import warnings 
import cf_units
import os
from dask import array as da

TUNIT = 'days since 1960-01-01'
VNAME = 'tas'

data_dir = '/s3/informatics-eupheme/'
sub_dir = data_dir + 'HadGEM3-A-N216/historical/tas/Amon/'
sub_dir

files = sorted(glob.glob(sub_dir + '*.nc'))


time_coord_points=list(range(15,19425+1,30))
physics_coord_points = list(range(1,16))


with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for cube in iris.load_raw(files[0]):
            latitude_coord_points = cube.coord('latitude').points
            longitude_coord_points = cube.coord('longitude').points


def file_details(time_index, pysics_index):
    time = cf_units.num2date(time_coord_points[time_index], TUNIT,cf_units.CALENDAR_STANDARD)
    syear = str(time.year)[:3] + '0'
    eyear = syear[:3] + '9' 
    shape = (120, 324, 432) 

    if syear == '2010':
        eyear = '2013'
        shape = (48, 324, 432) 
    
    basename = "tas_Amon_HadGEM3-A-N216_historical_r1i1p{physics}_{syear}01-{eyear}12.nc".format(
        syear=syear, eyear=eyear, physics=(pysics_index +1))
    path =  os.path.join(data_dir, sub_dir, basename)
    
    return path, shape



lastfile = None
p_arrays = []
dim_order = ['physics', 'time', 'lat', 'lon']
for p in range(0, len(physics_coord_points)):
    t_arrays = []
    for t in range(0, len(time_coord_points)):
      
        file, shape = file_details(t, p)            
        if file != lastfile:
            data = iris.fileformats.netcdf.NetCDFDataProxy(
                shape,
                'float32',
                file,
                VNAME,
                None)
            data = da.from_array(data, data.shape)
            t_arrays.append(data)
        
        lastfile = file
    p_arrays.append(da.concatenate(t_arrays,0))
    
data = da.stack(p_arrays, 0)
data.shape


# Build a cube from our data array

def points_to_coord(var_name, points, units=None, long_name=None, standard_name=None):
    long_name = long_name if long_name else var_name
    return iris.coords.DimCoord(
        points=points,
        standard_name=standard_name,
        long_name=long_name,
        var_name=var_name, units=units)



dim_coords = [
    points_to_coord('physics', physics_coord_points),
    points_to_coord('time', time_coord_points, TUNIT),
    points_to_coord('latitude', latitude_coord_points, 'degrees'),
    points_to_coord('longitude', longitude_coord_points, 'degrees')
]

cube = iris.cube.Cube(
        data=data,
        standard_name='air_temperature',
        long_name='air_temperature',
        var_name='tas',
        units = cf_units.Unit('K'),
        dim_coords_and_dims=[(coord, i) for i, coord in enumerate(dim_coords)])
print(cube)

air_temperature / (K)               (physics: 15; time: 648; latitude: 324; longitude: 432)
     Dimension coordinates:
          physics                           x         -              -               -
          time                              -         x              -               -
          latitude                          -         -              x               -
          longitude                         -         -              -               x


That code uses the iris `NetCDFDataProxy` I want to create a new version of this that will perform some safety checking

In [41]:
import netCDF4
import numpy as np
#netCDF4.Dataset('/this/no/real.nc')

dataset= netCDF4.Dataset(files[0])
variable = dataset.variables['tas']


dataset variable.missing_value

1e+20

In [54]:
import netCDF4
import numpy.ma as ma
class CheckingNetCDFDataProxy(object):
    """A reference to the data payload of a single NetCDF file variable."""

    __slots__ = ('shape', 'dtype', 'path', 'variable_name', 'fill_value', 'safety_check_done', 'fatal_fail')

    def __init__(self, shape, dtype, path, variable_name, fill_value=None, do_safety_check=False):
        self.safety_check_done = do_safety_check
        self.shape = shape
        self.dtype = dtype
        self.path = path
        self.variable_name = variable_name
        self.fill_value = fill_value
        self.fatal_fail = None

    @property
    def ndim(self):
        return len(self.shape)

    def check(self):
        try:
            dataset = netCDF4.Dataset(self.path)
        except OSError:
            self.fatal_fail = "no such file %s" % self.path
            self.safety_check_done = True
            return
        
        try:
            variable = dataset.variables[self.variable_name]
        except KeyError:
            self.fatal_fail = "no variable %s in file %s" % (self.variable_name, self.path)
            self.safety_check_done = True
            return

        if variable.shape != self.shape:
            self.fatal_fail = "Shape of data %s doesn't match expected %s" %(variable.shape, self.shape)
            self.safety_check_done = True
            return
        
        # TODO check variables???
        
        self.safety_check_done = True
        
        
    def  _null_data(self, keys):
#         if not self.fill_value:
#             raise AttributeError("Can not create null data when fill value not known.")
#         else:
#             return (np.ones(self.shape) * self.fill_value)[keys]
        return ma.masked_all(self.shape)[keys]
        
    def __getitem__(self, keys):
        print('__getitem__', keys)
        
        if not self.safety_check_done:
            self.check()
            
        if self.fatal_fail:
            return self._null_data(keys)
            
        try:
            dataset = netCDF4.Dataset(self.path)
            variable = dataset.variables[self.variable_name]
            # Get the NetCDF variable data and slice.
            var = variable[keys]
        finally:
            if dataset:
                dataset.close()
        return np.asanyarray(var)

    def __repr__(self):
        fmt = '<{self.__class__.__name__} shape={self.shape}' \
              ' dtype={self.dtype!r} path={self.path!r}' \
              ' variable_name={self.variable_name!r}>'
        return fmt.format(self=self)

    def __getstate__(self):
        return {attr: getattr(self, attr) for attr in self.__slots__}

    def __setstate__(self, state):
        for key, value in six.iteritems(state):
            setattr(self, key, value)



file, shape = ('/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p2_196001-196912.nc', (120, 324, 432))
data_none = CheckingNetCDFDataProxy(
                shape,
                'float32',
                file,
                'pas',
                None)

data = CheckingNetCDFDataProxy(
                shape,
                'float32',
                file,
                'tas',
                None)

In [59]:
%time
data_none[10:30, 4,:]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs
__getitem__ (slice(10, 30, None), 4, slice(None, None, None))


masked_array(
  data=[[--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        ...,
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --]],
  mask=[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=1e+20,
  dtype=float64)

In [60]:
%time
data[10:30, 4, :]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs
__getitem__ (slice(10, 30, None), 4, slice(None, None, None))


array([[240.61377, 240.57666, 240.5581 , ..., 240.75854, 240.71289,
        240.6643 ],
       [249.70068, 249.65454, 249.60376, ..., 249.8562 , 249.80371,
        249.75952],
       [250.38647, 250.33862, 250.29395, ..., 250.51807, 250.47852,
        250.42749],
       ...,
       [224.87036, 224.8186 , 224.74658, ..., 225.0979 , 225.02539,
        224.97656],
       [226.07153, 226.01758, 225.95508, ..., 226.2688 , 226.20068,
        226.14502],
       [214.76855, 214.68555, 214.60742, ..., 215.     , 214.93945,
        214.8645 ]], dtype=float32)