In [15]:
import netCDF4
import iris
import numpy as np

In [2]:
data_dir = '/s3/informatics-eupheme/'

# Metadata Extraction

(just a few GB of data)

In [3]:
# Narrow the search to a single sub-directory of data_dir.
# Plain string concatenation is used, so this relies on data_dir ending in '/'.
sub_dir = data_dir + 'HadGEM3-A-N216/historical/tas/Amon/'
sub_dir

'/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/'

In [4]:
import glob

# glob.glob() returns matches in arbitrary (filesystem-dependent) order.
# Sort them so that files[0] -- used below as the reference file -- and the
# order handed to the multi-file loaders are the same on every run.
files = sorted(glob.glob(sub_dir + '*.nc'))
files[:3]

['/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p12_198001-198912.nc',
 '/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p12_199001-199912.nc',
 '/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p14_199001-199912.nc']

### Reference: existing ways of loading this dataset

In [5]:
import warnings

In [87]:
import xarray as xr

In [88]:
%%time
# Open every path in `files` as one multi-file dataset and time the whole cell.
ds = netCDF4.MFDataset(files)
# Shape of the combined 'tas' variable across all files.
print(ds['tas'].shape)
# First five values along the leading axis at index [0, 0] of the other two axes.
print(ds['tas'][0:5, 0, 0])

(9720, 324, 432)
[248.1753  234.50366 222.28442 219.5747  219.16309]
CPU times: user 40.1 ms, sys: 63.9 ms, total: 104 ms
Wall time: 9.08 s


In [None]:
%%time
# Load the same files with iris for comparison and time the whole cell.
with warnings.catch_warnings():
    # Silence every warning raised while loading; the filter is restored on exit.
    warnings.simplefilter("ignore")
    cubes = iris.load(files)
    # Number of cubes iris produced from the file list.
    print(len(cubes))

In [None]:
%%time
# Open the same files with xarray for comparison and time the whole cell.
xrds = xr.open_mfdataset(files)
# Same [0:5, 0, 0] slice as in the netCDF4 cell; .compute() forces the values
# to be evaluated (presumably the underlying array is lazy -- TODO confirm).
xrds['tas'][0:5, 0, 0].data.compute()

In [None]:
print(xrds['tas'].shape)

### Extracting info from a netCDF file

In [6]:
# Open only the first file. The handle is deliberately left open because the
# cells below pass `first_ds` to the helper functions.
first_ds = netCDF4.Dataset(files[0])
first_ds

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    institution: Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)
    institute_id: MOHC
    experiment_id: historical
    source: MOHC Unified Model [N216L85]
    model_id: HadGEM3-A-N216
    forcing: GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)
    parent_experiment_id: N/A
    parent_experiment_rip: N/A
    branch_time: 0.0
    contact: peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk
    history: MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-29T12:55:50Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.
    initialization_method: 1
    physics_version: 12
    tracking_id: c753134c-61b9-433a-8e4a-7089c83e9bc1
    mo_runid: dlrjb
    product: output
    experiment: historical
    frequency: mon
    creation_date: 2015-07-31T11:41:47Z
    Conventions: CF-1.4
    project_id: EUCL

In [7]:
def get_attrs(ds):
    """Return the dataset's global attributes as a plain ``{name: value}`` dict.

    ``ds`` must provide ``ncattrs()`` (listing the attribute names) and
    ``getncattr(name)`` (returning one attribute's value).
    """
    collected = {}
    for attr_name in ds.ncattrs():
        collected[attr_name] = ds.getncattr(attr_name)
    return collected

get_attrs(first_ds)

{'Conventions': 'CF-1.4',
 'branch_time': 0.0,
 'cmor_version': '2.9.1',
 'contact': 'peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk',
 'creation_date': '2015-07-31T11:41:47Z',
 'experiment': 'historical',
 'experiment_id': 'historical',
 'forcing': 'GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)',
 'frequency': 'mon',
 'history': 'MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-29T12:55:50Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.',
 'initialization_method': 1,
 'institute_id': 'MOHC',
 'institution': 'Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)',
 'mo_runid': 'dlrjb',
 'model_id': 'HadGEM3-A-N216',
 'modeling_realm': 'atmos',
 'parent_experiment': 'N/A',
 'parent_experiment_id': 'N/A',
 'parent_experiment_rip': 'N/A',
 'physics_version': 12,
 'product': 'output',
 'project_id': 'EUCLEIA',
 'realization': 1,
 'source': 'MOHC Unified Model [N216L85]',
 'table_id'

In [8]:
def get_dimensions(ds):
    """Describe every dimension of a dataset.

    Iterates over ``ds.dimensions.values()`` and returns a dict keyed by
    dimension name. Each value is a dict with the keys ``'name'``, ``'size'``
    and ``'unlimited'`` (the result of the dimension's ``isunlimited()``).
    """
    summary = {}
    for dimension in ds.dimensions.values():
        dim_name = dimension.name
        summary[dim_name] = {
            'name': dim_name,
            'size': dimension.size,
            'unlimited': dimension.isunlimited(),
        }
    return summary
get_dimensions(first_ds)

{'bnds': {'name': 'bnds', 'size': 2, 'unlimited': False},
 'lat': {'name': 'lat', 'size': 324, 'unlimited': False},
 'lon': {'name': 'lon', 'size': 432, 'unlimited': False},
 'time': {'name': 'time', 'size': 120, 'unlimited': True}}

In [9]:
def get_variables(ds):
    """Summarise every variable of a dataset.

    For each name in ``ds.variables`` the variable is looked up as
    ``ds[name]`` and every public (no leading underscore), non-callable
    attribute reported by ``dir()`` is recorded. The variable's data
    (``var[:]``) is additionally stored under the key ``'points'`` when the
    variable has no dimensions, is its own dimension, or is named by another
    variable's ``bounds`` attribute.

    Parameters
    ----------
    ds
        Dataset-like object exposing ``variables`` (iterable of names) and
        ``ds[name]`` lookup; variables must expose ``dimensions`` and slicing.

    Returns
    -------
    dict
        ``{variable_name: {attribute_name: value, ..., ['points': data]}}``.
    """
    all_vars = ds.variables
    results = {}
    # Names of the variables that some other variable refers to through its
    # `bounds` attribute.
    bounds = [b for b in [getattr(ds[v], 'bounds', None) for v in all_vars] if b]
    for var_name in all_vars:
        v_results = {}
        var = ds[var_name]
        for v_attr in dir(var):
            # Filter on the name *before* touching the attribute, so private
            # and dunder members are never evaluated (cheaper, and a private
            # property that raises cannot break the scan).
            if v_attr.startswith('_'):
                continue
            value = getattr(var, v_attr)
            if not callable(value):
                v_results[v_attr] = value

        # Should we store the data/points for this var?
        # All data (dimensions, bounds, 'the data', e.g. temperature, etc.) is a var,
        # so we don't want to store every var: that would just store the whole nc
        # file very inefficiently.
        # We also don't want to store too little, as that would prevent
        # merges/concats etc. being possible without returning to the source.
        #
        # Option 1 (not used): record data for anything 0- or 1-dimensional,
        # i.e. `if len(var.shape) <= 1: v_results['points'] = var[:]`.
        #
        # Option 2 (used): record data for anything that has no dimensions or
        # that is its own dimension, plus anything that is a 'bound' of
        # something else.
        if len(var.dimensions) == 0 or var.dimensions == (var_name,) or var_name in bounds:
            v_results['points'] = var[:]

        results[var_name] = v_results
    return results

get_variables(first_ds)

{'height': {'axis': 'Z',
  'chartostring': True,
  'datatype': dtype('float64'),
  'dimensions': (),
  'dtype': dtype('float64'),
  'long_name': 'height',
  'mask': True,
  'name': 'height',
  'ndim': 0,
  'points': masked_array(data=1.5,
               mask=False,
         fill_value=9.969209968386869e+36),
  'positive': 'up',
  'scale': True,
  'shape': (),
  'size': 1.0,
  'standard_name': 'height',
  'units': 'm'},
 'lat': {'axis': 'Y',
  'bounds': 'lat_bnds',
  'chartostring': True,
  'datatype': dtype('float64'),
  'dimensions': ('lat',),
  'dtype': dtype('float64'),
  'long_name': 'latitude',
  'mask': True,
  'name': 'lat',
  'ndim': 1,
  'points': masked_array(data=[-89.72222304344177, -89.16666746139526,
                     -88.61111187934875, -88.05555629730225,
                     -87.50000071525574, -86.94444513320923,
                     -86.38888955116272, -85.83333396911621,
                     -85.2777783870697, -84.7222228050232,
                     -84.166667222

In [11]:
def gen_metadata(filename, promote_attrs=None):
    """Build the metadata record for one netCDF file.

    Parameters
    ----------
    filename : str
        Path of the netCDF file to open.
    promote_attrs : iterable of str, optional
        Names of global attributes to copy to the top level of the returned
        record, next to the standard keys.
        NOTE(review): the original body of this branch was an unfinished
        expression (``attributes[attr]['']``) that could only raise; copying
        the value to the top level is the assumed intent -- confirm.

    Returns
    -------
    dict
        Keys ``'attributes'``, ``'dimensions'``, ``'variables'`` and
        ``'filename'``, plus one key per promoted attribute.

    Raises
    ------
    KeyError
        If a name in ``promote_attrs`` is not a global attribute of the file.
    ValueError
        If a name in ``promote_attrs`` would overwrite one of the standard keys.
    """
    # Open inside a context manager so the handle is released once everything
    # has been read; this matters when the function is mapped over many files.
    with netCDF4.Dataset(filename) as ds:
        record = {
            'attributes': get_attrs(ds),
            'dimensions': get_dimensions(ds),
            'variables': get_variables(ds),
            'filename': filename,
        }

    reserved_keys = ('attributes', 'dimensions', 'variables', 'filename')
    attributes = record['attributes']
    for attr in promote_attrs or ():
        if attr not in attributes:
            raise KeyError("Cannot promote %r: not a global attribute of %s" % (attr, filename))
        if attr in reserved_keys:
            raise ValueError("Cannot promote %r: it would overwrite a standard metadata key" % (attr,))
        record[attr] = attributes[attr]

    return record

In [12]:
# Build the metadata record for the first file only and display it.
metadata = gen_metadata(files[0])
metadata

{'attributes': {'Conventions': 'CF-1.4',
  'branch_time': 0.0,
  'cmor_version': '2.9.1',
  'contact': 'peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk',
  'creation_date': '2015-07-31T11:41:47Z',
  'experiment': 'historical',
  'experiment_id': 'historical',
  'forcing': 'GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)',
  'frequency': 'mon',
  'history': 'MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-29T12:55:50Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.',
  'initialization_method': 1,
  'institute_id': 'MOHC',
  'institution': 'Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)',
  'mo_runid': 'dlrjb',
  'model_id': 'HadGEM3-A-N216',
  'modeling_realm': 'atmos',
  'parent_experiment': 'N/A',
  'parent_experiment_id': 'N/A',
  'parent_experiment_rip': 'N/A',
  'physics_version': 12,
  'product': 'output',
  'project_id': 'EUCLEIA',
  'realization': 1,
  'source': 'MOHC

In [10]:
metadata['dimensions']['time']

NameError: name 'metadata' is not defined

In [100]:
%%time
results = [gen_metadata(f) for f in files]

KeyboardInterrupt: 

In [17]:
import json
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and scalars.

    ``default`` is only invoked by ``json`` for objects it cannot serialise
    natively. Arrays become (nested) lists and NumPy scalars become the
    matching Python ``bool``/``int``/``float``. Anything else is encoded as
    ``str(obj)`` and a ``RuntimeWarning`` is emitted, so that a dump never
    fails on an unexpected type.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``."""
        # The base implementation always raises TypeError, so there is no
        # point in trying it first; go straight to the NumPy conversions.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        # np.integer / np.floating cover every width (int8..int64, uint*,
        # float16..float64), not just the 32- and 64-bit variants.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)

        # Deliberate best-effort fallback: warn, then store the string form.
        warnings.warn("Can not encode type %s, will encode as str(obj)" % type(obj), RuntimeWarning)
        return str(obj)

In [19]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of relying on garbage collection of the handle.
with open('./metadata.v2.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, cls=NumpyEncoder)

  from ipykernel import kernelapp as app
