# Tests for SMM versus CDO

This notebook contains a few tests to check whether the SMM approach is faster than the CDO one and whether its output is reliable. The results below are encouraging, although only 2D data have been tested so far.

In [1]:
from time import time
import timeit
import os
import numpy as np
import xarray as xr
from smmregrid import cdo_generate_weights, Regridder
from smmregrid.checker import check_cdo_regrid # this is a new function introduced to verify the output
from cdo import Cdo
import pandas as pd
cdo = Cdo()

# Directory holding the sample data and the source files to be regridded.
indir = 'tests/data'
filelist = [
    'onlytos-ipsl.nc',
    'tas-ecearth.nc',
    '2t-era5.nc',
    'tos-fesom.nc',
]
# File passed as the regridding target in every test below.
tfile = os.path.join(indir, 'r360x180.nc')

# Remapping methods and input access modes exercised by the checks.
methods = ['nn', 'con']
accesses = ['DataArray', 'Dataset']

## Robustness test

This test verifies that the two regridding approaches give the same result, by comparing the output of CDO with the output of SMM.
The files to be checked are listed above. The same check is performed in the tests.

In [2]:

# Run the CDO-vs-SMM comparison for every file / method / access combination
# and report the outcome of each check.
for filein in filelist:
    source_path = os.path.join(indir, filein)
    for method in methods:
        for access in accesses:
            cc = check_cdo_regrid(source_path, tfile, method=method, access=access)
            print(f'{filein}: remap{method} via {access} -> {cc!s}')


onlytos-ipsl.nc: remapnn via DataArray->True
onlytos-ipsl.nc: remapnn via Dataset->True
onlytos-ipsl.nc: remapcon via DataArray->True
onlytos-ipsl.nc: remapcon via Dataset->True
tas-ecearth.nc: remapnn via DataArray->True
tas-ecearth.nc: remapnn via Dataset->True
tas-ecearth.nc: remapcon via DataArray->True
tas-ecearth.nc: remapcon via Dataset->True
2t-era5.nc: remapnn via DataArray->True
2t-era5.nc: remapnn via Dataset->True
2t-era5.nc: remapcon via DataArray->True
2t-era5.nc: remapcon via Dataset->True
tos-fesom.nc: remapnn via DataArray->True
tos-fesom.nc: remapnn via Dataset->True
tos-fesom.nc: remapcon via DataArray->True
tos-fesom.nc: remapcon via Dataset->True


## Weight generation

Test the different ways of generating weights with CDO, using conservative remapping. The climtas code is far more efficient when the files are already on disk, since the call to CDO has to be made from a file. The CDO bindings have a minimal overhead that should be taken into account.

In [3]:
# number of repetitions for each timing measurement
nr = 5

# Time three ways of generating conservative-remapping weights for each file.
data = []
# The target grid is the same for every file: open it once, and let the
# context manager close it when the benchmark is over.
with xr.open_mfdataset(tfile) as tfield:
    for filein in filelist:
        src_path = os.path.join(indir, filein)

        # The source dataset is closed as soon as its timings are done.
        with xr.open_mfdataset(src_path) as xfield:
            # smmregrid weight generation, inputs given as file paths
            one = timeit.timeit(
                lambda: cdo_generate_weights(src_path, tfile, method='con'),
                number=nr)
            # smmregrid weight generation, inputs given as xarray objects
            two = timeit.timeit(
                lambda: cdo_generate_weights(xfield, tfield, method='con'),
                number=nr)

        # weight generation through the CDO python bindings (from file)
        three = timeit.timeit(
            lambda: cdo.gencon(tfile, input=src_path, returnXDataset=True),
            number=nr)

        # the order of the values must match the order of cnames below
        data.append([three, one, two])

cnames = ['CDO bindings', 'CDO subprocess (from file)', 'CDO subprocess (from xarray)']
df = pd.DataFrame(data, index=filelist, columns=cnames)
# express every timing relative to the CDO bindings (first column)
df.div(df[cnames[0]], axis=0)


Unnamed: 0,CDO binding,CDO subprocess (from file),CDO subprocess (from xarray)
onlytos-ipsl.nc,1.0,0.948454,1.045684
tas-ecearth.nc,1.0,0.914512,1.195513
2t-era5.nc,1.0,0.803809,0.961959
tos-fesom.nc,1.0,0.992282,1.425957


## Full remapping 

Test the full remap (generation of the weights + application) of CDO vs SMM. The results seem very much comparable!

In [4]:
# number of repetitions for each timing measurement
nr = 5

# fast function to call the entire interpolation
def smm_remap(ifile, tfile):
    """Generate conservative weights and regrid the content of ``ifile``.

    Args:
        ifile: source file; it is opened with ``xr.open_mfdataset`` and also
            passed to ``cdo_generate_weights`` as the source.
        tfile: target passed to ``cdo_generate_weights``.

    Returns:
        The result of ``Regridder.regrid`` applied to the whole source dataset.
    """
    # NOTE(review): the dataset is deliberately not closed here, since the
    # regridded output may be lazy and still need the open file -- confirm.
    xfield = xr.open_mfdataset(ifile)
    wfield = cdo_generate_weights(ifile, tfile, method='con')
    interpolator = Regridder(weights=wfield)
    return interpolator.regrid(xfield)

# Time the complete remap (weight generation + application) for each file,
# first through the CDO bindings and then through smm_remap.
data = []
for filein in filelist:
    timings = [
        timeit.timeit(
            lambda: cdo.remapcon(tfile, input=os.path.join(indir, filein), returnXDataset=True),
            number=nr,
        ),
        timeit.timeit(
            lambda: smm_remap(os.path.join(indir, filein), tfile),
            number=nr,
        ),
    ]
    data.append(timings)

cnames = ['CDO (Weight+Remap)', 'SMM (Weight+Remap)']
df = pd.DataFrame(data, index=filelist, columns=cnames)
# show every timing as a ratio to the CDO column
df.div(df[cnames[0]], axis=0)


Unnamed: 0,CDO (Weight+Remap),SMM (Weight+Remap)
onlytos-ipsl.nc,1.0,1.036512
tas-ecearth.nc,1.0,0.979539
2t-era5.nc,1.0,0.932578
tos-fesom.nc,1.0,1.024755


## Remapping (with weights available)

This is the real goal of smmregrid. Here we test the computation of the remap when the weights are pre-computed. Since SMM does not have to write anything to disk, it is several times faster, by a factor of 5 to 10. Running with a Dataset implies a bit of overhead (about 20%). Masks do not seem to be an issue so far.

In [5]:
# number of repetitions for each timing measurement
nr = 10

# Time only the application of pre-computed weights, CDO versus SMM.
data = []
for filein in filelist:
    src_path = os.path.join(indir, filein)

    # CDO: the weights are generated once (untimed), only the remap is timed
    wfile = cdo.gencon(tfile, input=src_path)
    one = timeit.timeit(
        lambda: cdo.remap(tfile + ',' + wfile, input=src_path, returnXDataset=True),
        number=nr)

    # SMM: the context manager closes the source dataset after its timings
    with xr.open_mfdataset(src_path) as xfield:
        # weights and regridder are built once (untimed)
        wfield = cdo_generate_weights(src_path, tfile, method='con')
        interpolator = Regridder(weights=wfield)

        # data variables that have a 'time' dimension and no 'bnds' dimension
        myvar = [var for var in xfield.data_vars
                 if 'time' in xfield[var].dims and 'bnds' not in xfield[var].dims]
        if not myvar:
            raise ValueError(
                filein + ": no data variable with a 'time' dimension and without a 'bnds' dimension")

        # whole Dataset
        two = timeit.timeit(lambda: interpolator.regrid(xfield), number=nr)
        # single DataArray: indexing with one name returns a DataArray, whereas
        # indexing with the myvar list would return a Dataset again
        three = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]]), number=nr)
        # Dataset restricted to the selected variables, with masked=False
        # NOTE(review): this input differs from the full Dataset used for
        # `two` -- confirm which comparison is intended
        four = timeit.timeit(lambda: interpolator.regrid(xfield[myvar], masked=False), number=nr)

    # the order of the values must match the order of cnames below
    data.append([one, two, three, four])

cnames = ['CDO', 'SMM (Dataset)', 'SMM (DataArray)', 'SMM (DataSet+NoMask)']
df = pd.DataFrame(data, index=filelist, columns=cnames)
# express every timing relative to CDO (first column)
df.div(df[cnames[0]], axis=0)

Unnamed: 0,CDO,SMM (Dataset),SMM (DataArray),SMM (DataSet+NoMask)
onlytos-ipsl.nc,1.0,0.204161,0.181773,0.182117
tas-ecearth.nc,1.0,0.261651,0.216279,0.20798
2t-era5.nc,1.0,0.176852,0.144768,0.110618
tos-fesom.nc,1.0,0.127521,0.107123,0.109413


Normalize the speed-up of the different configurations