In [250]:
import netCDF4
import iris

In [2]:
data_dir = '/s3/informatics-eupheme/'

# Metadata Extraction

(just a few gbs of data)

In [21]:
sub_dir = data_dir + 'HadGEM3-A-N216/historical/tas/Amon/'
sub_dir

'/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/'

In [52]:
import glob
files = glob.glob(sub_dir + '*.nc')
files[:3]

['/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p3_201001-201312.nc',
 '/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p15_200001-200912.nc',
 '/s3/informatics-eupheme/HadGEM3-A-N216/historical/tas/Amon/tas_Amon_HadGEM3-A-N216_historical_r1i1p1_201001-201312.nc']

### just some reference for existing ways of loading this dataset

In [64]:
import warnings

In [75]:
import xarray as xr

In [72]:
%%time
ds = netCDF4.MFDataset(files)
print(ds['tas'].shape)
print(ds['tas'][0:5, 0, 0])

(9720, 324, 432)
[ 249.69799805  234.67407227  227.90527344  223.82910156  216.23583984]
CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 151 ms


In [73]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cubes = iris.load(files)
    print(len(cubes))

90
CPU times: user 1.97 s, sys: 228 ms, total: 2.2 s
Wall time: 2min 39s


In [86]:
%%time
xrds = xr.open_mfdataset(files)
xrds['tas'][0:5, 0, 0].data.compute()

CPU times: user 1.28 s, sys: 184 ms, total: 1.46 s
Wall time: 2min 37s


In [87]:
print(xrds['tas'].shape)

(9720, 324, 432)


### extracting info from nc

In [88]:
first_ds = netCDF4.Dataset(files[0])
first_ds

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    institution: Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)
    institute_id: MOHC
    experiment_id: historical
    source: MOHC Unified Model [N216L85]
    model_id: HadGEM3-A-N216
    forcing: GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)
    parent_experiment_id: N/A
    parent_experiment_rip: N/A
    branch_time: 0.0
    contact: peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk
    history: MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-22T13:51:32Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.
    initialization_method: 1
    physics_version: 3
    tracking_id: 31bfc042-36c1-4a68-a9bb-a39ae6a8d1aa
    mo_runid: aojac
    product: output
    experiment: historical
    frequency: mon
    creation_date: 2015-07-24T16:40:09Z
    Conventions: CF-1.4
    project_id: EUCLE

In [230]:
def get_attrs(ds):
    attrs = ds.ncattrs()
    return {attr: str(ds.getncattr(attr)) for attr in attrs}

get_attrs(first_ds)

{'Conventions': 'CF-1.4',
 'branch_time': '0.0',
 'cmor_version': '2.9.1',
 'contact': 'peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk',
 'creation_date': '2015-07-24T16:40:09Z',
 'experiment': 'historical',
 'experiment_id': 'historical',
 'forcing': 'GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)',
 'frequency': 'mon',
 'history': 'MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-22T13:51:32Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.',
 'initialization_method': '1',
 'institute_id': 'MOHC',
 'institution': 'Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)',
 'mo_runid': 'aojac',
 'model_id': 'HadGEM3-A-N216',
 'modeling_realm': 'atmos',
 'parent_experiment': 'N/A',
 'parent_experiment_id': 'N/A',
 'parent_experiment_rip': 'N/A',
 'physics_version': '3',
 'product': 'output',
 'project_id': 'EUCLEIA',
 'realization': '1',
 'source': 'MOHC Unified Model [N216L85]',
 'ta

In [231]:
def get_dimensions(ds):
    dims = ds.dimensions
    return {
        dim.name: {'name': dim.name, 'size': dim.size, 'unlimited': dim.isunlimited()}
        for dim in dims.values()}
get_dimensions(first_ds)

{'bnds': {'name': 'bnds', 'size': 2, 'unlimited': False},
 'lat': {'name': 'lat', 'size': 324, 'unlimited': False},
 'lon': {'name': 'lon', 'size': 432, 'unlimited': False},
 'time': {'name': 'time', 'size': 48, 'unlimited': True}}

In [232]:
def get_variables(ds):
    all_vars = ds.variables
    results = {}
    for var_name in all_vars:
        v_results = {}
        var = ds[var_name]
        v_attrs = dir(var)
        for v_attr in v_attrs:
            value = getattr(var, v_attr)
            if v_attr[0] != '_' and not callable(value):
                v_results[v_attr] = str(value)
        if len(var.shape) == 1:
            v_results['points'] = var[:].tolist()
        results[var_name] = v_results
    return results

get_variables(first_ds)

{'height': {'axis': 'Z',
  'chartostring': 'True',
  'datatype': 'float64',
  'dimensions': '()',
  'dtype': 'float64',
  'long_name': 'height',
  'mask': 'True',
  'name': 'height',
  'ndim': '0',
  'positive': 'up',
  'scale': 'True',
  'shape': '()',
  'size': '1.0',
  'standard_name': 'height',
  'units': 'm'},
 'lat': {'axis': 'Y',
  'bounds': 'lat_bnds',
  'chartostring': 'True',
  'datatype': 'float64',
  'dimensions': "('lat',)",
  'dtype': 'float64',
  'long_name': 'latitude',
  'mask': 'True',
  'name': 'lat',
  'ndim': '1',
  'points': [-89.72222304344177,
   -89.16666746139526,
   -88.61111187934875,
   -88.05555629730225,
   -87.50000071525574,
   -86.94444513320923,
   -86.38888955116272,
   -85.83333396911621,
   -85.2777783870697,
   -84.7222228050232,
   -84.16666722297668,
   -83.61111164093018,
   -83.05555605888367,
   -82.50000047683716,
   -81.94444489479065,
   -81.38888931274414,
   -80.83333373069763,
   -80.27777814865112,
   -79.72222256660461,
   -79.1666669

In [233]:
len(some_var.shape)

1

In [234]:
metadata

{'attributes': {'Conventions': 'CF-1.4',
  'branch_time': 0.0,
  'cmor_version': '2.9.1',
  'contact': 'peter.stott@metoffice.gov.uk, andrew.ciavarella@metoffice.gov.uk',
  'creation_date': '2015-07-24T16:40:09Z',
  'experiment': 'historical',
  'experiment_id': 'historical',
  'forcing': 'GHG, Oz, LU, Sl, Vl, AA, (GHG = CO2, N2O, CH4, CFCs, HFCs)',
  'frequency': 'mon',
  'history': 'MOHC pp to CMOR/NetCDF convertor (version 1.16.2) 2015-07-22T13:51:32Z CMOR rewrote data to comply with CF standards and EUCLEIA requirements.',
  'initialization_method': 1,
  'institute_id': 'MOHC',
  'institution': 'Met Office Hadley Centre, Fitzroy Road, Exeter, Devon, EX1 3PB, UK, (http://www.metoffice.gov.uk)',
  'mo_runid': 'aojac',
  'model_id': 'HadGEM3-A-N216',
  'modeling_realm': 'atmos',
  'parent_experiment': 'N/A',
  'parent_experiment_id': 'N/A',
  'parent_experiment_rip': 'N/A',
  'physics_version': 3,
  'product': 'output',
  'project_id': 'EUCLEIA',
  'realization': 1,
  'source': 'MOHC 

In [244]:
def gen_metadata(filename):
    ds = netCDF4.Dataset(filename)
    return {
    'attributes': get_attrs(ds),
    'dimensions': get_dimensions(ds),
    'variables': get_variables(ds),
    'filename': filename
    }

In [245]:
%%time
results = [gen_metadata(f) for f in files]

CPU times: user 504 ms, sys: 188 ms, total: 692 ms
Wall time: 3min 22s


In [248]:
import json

In [249]:
json.dump(results, open('./metadata.json', 'w'))