# Tests for SMM (with dask) versus CDO

There are a few tests to check if the SMM approach is faster than the CDO one and if it is reliable in terms of output. Encouraging results below, although tested only on 2D data so far.

In [4]:
from time import time
import timeit
import os
import numpy as np
import xarray as xr
from smmregrid import cdo_generate_weights, Regridder
from smmregrid.checker import check_cdo_regrid # this is a new function introduced to verify the output
from cdo import Cdo
import pandas as pd
import copy
# CDO bindings object, used for the direct calls to CDO operators in the cells below
cdo = Cdo()

# input data: directory, source files and file defining the target grid
indir = 'tests/data'
filelist = [
    'lsm-ifs.grb',
    'onlytos-ipsl.nc',
    'tas-ecearth.nc',
    '2t-era5.nc',
    'tos-fesom.nc',
    'ua-ecearth.nc',
    'mix-cesm.nc',
]
#filelist = ['ua-ecearth.nc']
tfile = os.path.join(indir, 'r360x180.nc')

# remapping methods and access types to be tested
methods = ['nn', 'con', 'bil']
accesses = ['Dataset', 'DataArray']


# default configuration: every file gets its own independent (deep) copy of it,
# so that the per-file adjustments below do not leak into the other entries
defdict = {'methods': methods, 'accesses': accesses, 'extra': ''}
base = {name: copy.deepcopy(defdict) for name in filelist}

# clean the cases where we know CDO does not work
base['tos-fesom.nc']['methods'] = [m for m in methods if m != 'bil']
base['lsm-ifs.grb'].update(
    methods=[m for m in methods if m not in ('bil', 'con')],
    extra='-setgridtype,regular',
)
base['mix-cesm.nc']['accesses'] = [a for a in accesses if a != 'DataArray']

## Robustness test

This verifies that the two regridding approaches give the same result, by comparing the output of CDO with the output of SMM.
The files checked are the ones listed above. This is the same check performed in the tests.

In [5]:

# Compare SMM against CDO for every (file, method, access) combination kept in `base`.
# NOTE(review): the per-file 'extra' option stored in `base` is not forwarded to
# check_cdo_regrid (the argument was commented out) - confirm this is intended.
for filein in filelist:
    source = os.path.join(indir, filein)
    options = base[filein]
    for method in options['methods']:
        for access in options['accesses']:
            cc = check_cdo_regrid(source, tfile, method=method, access=access)
            print(f'{filein}: remap{method} via {access} -> {cc}')


lsm-ifs.grb: remapnn via Dataset -> True
lsm-ifs.grb: remapnn via DataArray -> True
onlytos-ipsl.nc: remapnn via Dataset -> True
onlytos-ipsl.nc: remapnn via DataArray -> True
onlytos-ipsl.nc: remapcon via Dataset -> True
onlytos-ipsl.nc: remapcon via DataArray -> True
onlytos-ipsl.nc: remapbil via Dataset -> True
onlytos-ipsl.nc: remapbil via DataArray -> True
tas-ecearth.nc: remapnn via Dataset -> True
tas-ecearth.nc: remapnn via DataArray -> True
tas-ecearth.nc: remapcon via Dataset -> True
tas-ecearth.nc: remapcon via DataArray -> True
tas-ecearth.nc: remapbil via Dataset -> True
tas-ecearth.nc: remapbil via DataArray -> True
2t-era5.nc: remapnn via Dataset -> True
2t-era5.nc: remapnn via DataArray -> True
2t-era5.nc: remapcon via Dataset -> True
2t-era5.nc: remapcon via DataArray -> True
2t-era5.nc: remapbil via Dataset -> True
2t-era5.nc: remapbil via DataArray -> True
tos-fesom.nc: remapnn via Dataset -> True
tos-fesom.nc: remapnn via DataArray -> True
tos-fesom.nc: remapcon via

From now on we use only the cases where remapcon can be used, i.e. we remove the Gaussian reduced grid.

In [2]:
# Drop the Gaussian reduced grid file from the benchmark set. The None default
# keeps the cell re-runnable once the entry has already been removed
# (a plain pop would raise KeyError on the second execution).
base.pop('lsm-ifs.grb', None)

{'methods': ['nn'],
 'accesses': ['Dataset', 'DataArray'],
 'extra': '-setgridtype,regular'}

## Weight generation

Test the different weight-generation options with CDO, using conservative remapping: the climtas code is much more efficient if the files are already on disk, since the call to CDO has to be done from file. The CDO bindings have a small overhead that should be taken into account.

In [8]:
# number of repetitions passed to timeit for each measurement
nr = 5

# the target grid is the same for every input file: open it once, outside the loop
tfield = xr.open_mfdataset(tfile)

# execution times, one row per input file, ordered as in `cnames` below
data = []
for filein in base:

    # build the input path once and open the file for the xarray-based call
    infile = os.path.join(indir, filein)
    xfield = xr.open_mfdataset(infile)

    # generate weights from file
    one = timeit.timeit(
        lambda: cdo_generate_weights(infile, tfile, method='con'),
        number=nr,
    )
    # generate weights from xarray
    two = timeit.timeit(
        lambda: cdo_generate_weights(xfield, tfield, method='con'),
        number=nr,
    )
    # generate weights with CDO bindings (from file)
    three = timeit.timeit(
        lambda: cdo.gencon(tfile, input=infile, returnXDataset=True),
        number=nr,
    )
    data.append([three, one, two])

cnames = ['CDO bindings', 'CDO subprocess (from file)', 'CDO subprocess (from xarray)']
df = pd.DataFrame(data, index = base.keys(), columns = cnames)
# express every timing as a ratio to the first column (CDO bindings)
df.div(df[cnames[0]],axis =0)


Unnamed: 0,CDO bindings,CDO subprocess (from file),CDO subprocess (from xarray)
onlytos-ipsl.nc,1.0,0.959149,1.040422
tas-ecearth.nc,1.0,0.895153,1.161307
2t-era5.nc,1.0,0.815725,0.962858
tos-fesom.nc,1.0,0.985951,1.392222
ua-ecearth.nc,1.0,0.916521,1.248493
mix-cesm.nc,1.0,0.847872,1.068933


## Full remapping 

Test the full remap (generation of the weights + application) of CDO vs SMM, still using conservative remapping. Results seem very much comparable!

In [4]:
# number of repetitions passed to timeit for each measurement
nr = 5

# fast function to call the entire interpolation
def smm_remap(ifile, tfile):
    """Run the entire SMM interpolation: weight generation plus regridding.

    The weights are always generated with conservative remapping ('con').

    Args:
        ifile: path of the file to be interpolated.
        tfile: path of the file passed to cdo_generate_weights as target grid.

    Returns:
        The result of Regridder.regrid applied to the dataset opened from `ifile`.
    """
    xfield = xr.open_mfdataset(ifile)
    wfield = cdo_generate_weights(ifile, tfile, method='con')
    interpolator = Regridder(weights=wfield)
    return interpolator.regrid(xfield)

# execution times, one [CDO, SMM] row per input file
data = []
for filein in base:
    source = os.path.join(indir, filein)

    # CDO: weight generation + remapping through a single remapcon call
    one = timeit.timeit(
        lambda: cdo.remapcon(tfile, input=source, returnXDataset=True),
        number=nr,
    )
    # SMM: weight generation + remapping through smm_remap
    two = timeit.timeit(lambda: smm_remap(source, tfile), number=nr)
    data.append([one, two])

cnames = ['CDO (Weight+Remap)', 'SMM (Weight+Remap)']
df = pd.DataFrame(data, index=base.keys(), columns=cnames)
# express every timing as a ratio to the first column (CDO)
df.div(df[cnames[0]], axis=0)


Unnamed: 0,CDO (Weight+Remap),SMM (Weight+Remap)
onlytos-ipsl.nc,1.0,0.967143
tas-ecearth.nc,1.0,0.996663
2t-era5.nc,1.0,0.922823
tos-fesom.nc,1.0,1.043712
ua-ecearth.nc,1.0,1.166318
mix-cesm.nc,1.0,1.058284


## Remapping (with weights available)

This is the real goal of smmregrid. Here we test the computation of the remap when the weights are pre-computed, still with conservative remapping. Considering that SMM does not have to write anything to disk, it is several times faster, between 5 and 10 times. Running with a Dataset implies a bit of overhead (20%). Masks so far do not seem to be an issue.

In [3]:
# number of repetitions passed to timeit for each measurement
nr = 5

# execution times, one row per input file, ordered as in `cnames` below
data = []
for filein in base:
    print(filein)
    source = os.path.join(indir, filein)

    # CDO: the weights are generated beforehand, only the remap call is timed
    wfile = cdo.gencon(tfile, input=source)
    one = timeit.timeit(
        lambda: cdo.remap(tfile + ',' + wfile, input=source, returnXDataset=True),
        number=nr,
    )

    # SMM: weights and regridder are built outside of the timed calls
    wfield = cdo_generate_weights(source, tfile, method='con')
    interpolator = Regridder(weights=wfield)
    xfield = xr.open_mfdataset(source)

    # keep the variables which have a 'time' dimension and no 'bnds' dimension
    # NOTE(review): myvar is a list, so xfield[myvar] is presumably still a Dataset
    # restricted to those variables rather than a DataArray - confirm that the
    # 'SMM (DataArray)' column labels describe what is actually measured.
    myvar = [name for name, field in xfield.data_vars.items()
             if 'time' in field.dims and 'bnds' not in field.dims]

    # full dataset
    two = timeit.timeit(lambda: interpolator.regrid(xfield), number=nr)
    # selected variables only
    three = timeit.timeit(lambda: interpolator.regrid(xfield[myvar]), number=nr)
    # selected variables only, with masked = False
    four = timeit.timeit(
        lambda: interpolator.regrid(xfield[myvar], masked=False),
        number=nr,
    )
    # full dataset, including the write to disk
    # NOTE(review): 'test.nc' is rewritten at every repetition and nothing in this
    # cell removes it afterwards.
    five = timeit.timeit(
        lambda: interpolator.regrid(xfield).to_netcdf('test.nc'),
        number=nr,
    )
    data.append([one, two, three, four, five])

cnames = ['CDO', 'SMM (Dataset)', 'SMM (DataArray)', 'SMM (DataArray+NoMask)', 'SMM (Dataset+Write)']
df = pd.DataFrame(data, index=base.keys(), columns=cnames)
# express every timing as a ratio to the first column (CDO)
df.div(df[cnames[0]], axis=0)

onlytos-ipsl.nc
tas-ecearth.nc
2t-era5.nc
tos-fesom.nc
ua-ecearth.nc
mix-cesm.nc


Unnamed: 0,CDO,SMM (Dataset),SMM (DataArray),SMM (DataArray+NoMask),SMM (Dataset+Write)
onlytos-ipsl.nc,1.0,0.302969,0.207936,0.197881,1.193253
tas-ecearth.nc,1.0,0.292058,0.185343,0.160926,1.036156
2t-era5.nc,1.0,0.179519,0.132975,0.09848,0.796598
tos-fesom.nc,1.0,0.137665,0.099697,0.09061,0.645483
ua-ecearth.nc,1.0,0.349889,0.334017,0.294881,1.425076
mix-cesm.nc,1.0,0.508153,0.516814,0.403554,1.48029
