# Tests for SMM (without dask) versus CDO

This notebook contains a few tests that check whether the SMM approach is faster than the CDO one and whether its output is reliable. The results below are encouraging, although only 2D data have been tested so far.

In [72]:
from time import time
import timeit
import os
import numpy as np
import xarray as xr
from smmregrid import cdo_generate_weights, Regridder
from smmregrid.checker import check_cdo_regrid # this is a new function introduced to verify the output
from cdo import Cdo
import pandas as pd
import copy
cdo = Cdo()

# location of the input data and list of the files to be tested
indir = 'tests/data'
filelist = [
    'lsm-ifs.grb',
    'onlytos-ipsl.nc',
    'tas-ecearth.nc',
    '2t-era5.nc',
    'tos-fesom.nc',
    'ua-ecearth.nc',
    'mix-cesm.nc',
    'era5-mon.nc',
]
# filelist = ['era5-mon.nc']  # this is not available on github
# filelist = ['ua-ecearth.nc']
tfile = os.path.join(indir, 'r360x180.nc')

# remapping methods and xarray access types to be tested
methods = ['nn', 'con', 'bil']
accesses = ['Dataset', 'DataArray']

# Per-file test configuration. Every file gets its own deep copy of the
# defaults, so trimming the lists of one file never affects another file.
defdict = {'methods': methods, 'accesses': accesses, 'extra': ''}
base = dict((name, copy.deepcopy(defdict)) for name in filelist)

# Remove the combinations where we know CDO does not work.
if 'tos-fesom.nc' in base:
    base['tos-fesom.nc']['methods'].remove('bil')
if 'lsm-ifs.grb' in base:
    base['lsm-ifs.grb'].update(extra='-setgridtype,regular')
    base['lsm-ifs.grb']['methods'].remove('bil')
    base['lsm-ifs.grb']['methods'].remove('con')
if 'mix-cesm.nc' in base:
    base['mix-cesm.nc']['accesses'].remove('DataArray')

## Robustness test

This verifies that the two regridding approaches give the same result: the output from CDO is compared to the output obtained by SMM.
The files to be checked are listed above. This is the same check that is performed in the tests.

In [73]:

# Compare the CDO and SMM outputs for every file / method / access combination
# configured in `base`, printing one line per combination.
# NOTE: the per-file 'extra' options stored in `base` are not forwarded to
# check_cdo_regrid here.
for filein in filelist:
    settings = base[filein]
    source = os.path.join(indir, filein)
    for method in settings['methods']:
        for access in settings['accesses']:
            cc = check_cdo_regrid(source, tfile, method=method, access=access)
            print(f'{filein}: remap{method} via {access} -> {cc!s}')


lsm-ifs.grb: remapnn via Dataset -> True
lsm-ifs.grb: remapnn via DataArray -> True
onlytos-ipsl.nc: remapnn via Dataset -> True
onlytos-ipsl.nc: remapnn via DataArray -> True
onlytos-ipsl.nc: remapcon via Dataset -> True
onlytos-ipsl.nc: remapcon via DataArray -> True
onlytos-ipsl.nc: remapbil via Dataset -> True
onlytos-ipsl.nc: remapbil via DataArray -> True
tas-ecearth.nc: remapnn via Dataset -> True
tas-ecearth.nc: remapnn via DataArray -> True
tas-ecearth.nc: remapcon via Dataset -> True


From now on we use only the cases where remapcon can be used, i.e. we remove the Gaussian reduced grid.

In [74]:
# Drop the lsm-ifs.grb entry from the test configuration.
# The membership test is done on `base` itself: `filelist` is never modified,
# so testing it instead makes a second run of this cell fail with KeyError.
if 'lsm-ifs.grb' in base:
    base.pop('lsm-ifs.grb')

## Full remapping 

Test the full remap (generation of the weights + application) of CDO vs SMM. Still using conservative remapping. Results seem very much comparable!

In [4]:
# number of repetitions passed to timeit for each measurement
nr = 5

# fast function to call the entire interpolation
def smm_remap(ifile, tfile, method='con'):
    """Run the entire SMM interpolation: weight generation plus application.

    Args:
        ifile: source file, passed both to ``xr.open_mfdataset`` and to
            ``cdo_generate_weights``.
        tfile: target grid file, passed to ``cdo_generate_weights``.
        method: remapping method used to generate the weights. Defaults to
            ``'con'``, the value that was previously hard-coded.

    Returns:
        The output of ``Regridder.regrid`` applied to the opened dataset.
    """
    xfield = xr.open_mfdataset(ifile)
    wfield = cdo_generate_weights(ifile, tfile, method=method)
    interpolator = Regridder(weights=wfield)
    return interpolator.regrid(xfield)

# Time the complete remap (weight generation + application) of every file,
# first with CDO and then with SMM; each row of `data` is [CDO, SMM].
data = [
    [
        timeit.timeit(
            lambda: cdo.remapcon(tfile, input=os.path.join(indir, filein), returnXDataset=True),
            number=nr,
        ),
        timeit.timeit(
            lambda: smm_remap(os.path.join(indir, filein), tfile),
            number=nr,
        ),
    ]
    for filein in base
]

# One row per file; every timing is expressed relative to the CDO column.
cnames = ['CDO (Weight+Remap)', 'SMM (Weight+Remap)']
df = pd.DataFrame(data, index=base.keys(), columns=cnames)
df.div(df[cnames[0]], axis='index')


Unnamed: 0,CDO (Weight+Remap),SMM (Weight+Remap)
onlytos-ipsl.nc,1.0,0.995816
tas-ecearth.nc,1.0,1.014669
2t-era5.nc,1.0,0.862746
tos-fesom.nc,1.0,1.023985
ua-ecearth.nc,1.0,1.030331
mix-cesm.nc,1.0,0.989048


## Remapping (with weights available)

This is the real goal of smmregrid. Here we test the computation of the remap when the weights are pre-computed, still with conservative remapping. Considering that SMM does not have to write anything to disk, it is several times faster, by a factor between 5 and 10. Running with a Dataset implies a bit of overhead (20%). Masks so far do not seem to be an issue.

In [75]:
# Benchmark the application of pre-computed weights: CDO versus several SMM
# variants (Dataset, DataArray, DataArray without mask, and both with a
# write to disk).
data = []
for filein in base:
    print(filein)
    # era5-mon.nc is timed with a single repetition, every other file with ten
    nr = 1 if filein == 'era5-mon.nc' else 10

    # CDO: generate the weights first, then time only their application
    wfile = cdo.gencon(tfile, input=os.path.join(indir, filein))
    t_cdo = timeit.timeit(
        lambda: cdo.remap(tfile + ',' + wfile, input=os.path.join(indir, filein), returnXDataset=True),
        number=nr,
    )

    # SMM: load field and weights, initialize the regridder
    xfield = xr.open_mfdataset(os.path.join(indir, filein))
    wfield = cdo_generate_weights(os.path.join(indir, filein), tfile, method='con')
    interpolator = Regridder(weights=wfield)

    # variables that have a 'time' dimension and no 'bnds' dimension;
    # the first of them is the one used for the DataArray timings
    myvar = [
        var
        for var in xfield.data_vars
        if 'time' in xfield[var].dims and 'bnds' not in xfield[var].dims
    ]

    # dataset information reported next to the timings
    nrecords = xfield[myvar[0]].shape
    nvars = len(myvar)

    # SMM timings, in memory
    t_dataset = timeit.timeit(lambda: interpolator.regrid(xfield), number=nr)
    t_array = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]]), number=nr)
    t_array_nomask = timeit.timeit(
        lambda: interpolator.regrid(xfield[myvar[0]], masked=False), number=nr
    )

    # SMM timings including the write to a NetCDF file, removed after each test
    t_dataset_write = timeit.timeit(
        lambda: interpolator.regrid(xfield).to_netcdf('test.nc'), number=nr
    )
    os.remove('test.nc')
    t_array_write = timeit.timeit(
        lambda: interpolator.regrid(xfield[myvar[0]]).to_netcdf('test.nc'), number=nr
    )
    os.remove('test.nc')

    data.append([
        nvars, nrecords, t_cdo,
        t_dataset, t_array, t_array_nomask,
        t_dataset_write, t_array_write,
    ])

cnames = ['NVars', 'NRecords', 'CDO',
          'SMM (Dataset)', 'SMM (DataArray)', 'SMM (DataArray+NoMask)',
          'SMM (Dataset+Write)', 'SMM (DataArray+Write)']
df = pd.DataFrame(data, index=base.keys(), columns=cnames)

# keep the two descriptive columns as they are and express every timing
# column as a ratio to the CDO timing
info = df.iloc[:, 0:2]
ratios = df.iloc[:, 2:].div(df[cnames[2]], axis=0)
final = pd.concat([info, ratios], join='outer', axis=1)
final


onlytos-ipsl.nc
tas-ecearth.nc
2t-era5.nc
tos-fesom.nc
ua-ecearth.nc
mix-cesm.nc
era5-mon.nc


Unnamed: 0,NVars,NRecords,CDO,SMM (Dataset),SMM (DataArray),SMM (DataArray+NoMask),SMM (Dataset+Write),SMM (DataArray+Write)
onlytos-ipsl.nc,1,"(12, 332, 362)",1.0,0.216799,0.07789,0.050437,0.997036,0.77708
tas-ecearth.nc,1,"(12, 256, 512)",1.0,0.226347,0.09004,0.063186,1.141438,0.958676
2t-era5.nc,1,"(12, 73, 144)",1.0,0.170659,0.094557,0.061034,0.845712,0.765937
tos-fesom.nc,1,"(12, 126859)",1.0,0.113976,0.039926,0.025682,0.755877,0.623671
ua-ecearth.nc,1,"(2, 19, 256, 512)",1.0,0.398825,0.067782,0.044185,1.61761,1.359219
mix-cesm.nc,4,"(12, 192, 288)",1.0,0.549228,0.068802,0.045213,1.738173,0.670783
era5-mon.nc,1,"(864, 721, 1440)",1.0,0.825034,0.000652,0.00044,1.883003,1.085205


Print to markdown so that we can copy paste it on the portal

In [76]:
# print the `final` table (descriptive columns + timings relative to CDO) in Markdown format
print(final.to_markdown())

|                 |   NVars | NRecords          |   CDO |   SMM (Dataset) |   SMM (DataArray) |   SMM (DataArray+NoMask) |   SMM (Dataset+Write) |   SMM (DataArray+Write) |
|:----------------|--------:|:------------------|------:|----------------:|------------------:|-------------------------:|----------------------:|------------------------:|
| onlytos-ipsl.nc |       1 | (12, 332, 362)    |     1 |        0.216799 |       0.0778903   |              0.0504366   |              0.997036 |                0.77708  |
| tas-ecearth.nc  |       1 | (12, 256, 512)    |     1 |        0.226347 |       0.0900398   |              0.0631857   |              1.14144  |                0.958676 |
| 2t-era5.nc      |       1 | (12, 73, 144)     |     1 |        0.170659 |       0.094557    |              0.0610341   |              0.845712 |                0.765937 |
| tos-fesom.nc    |       1 | (12, 126859)      |     1 |        0.113976 |       0.0399258   |              0.0256824   |             

## Weight generation

Test the different ways of generating the weights with CDO, using conservative remapping. The climtas code is much more efficient if the files are already on disk, since the call to CDO has to be done from a file. The CDO bindings have a small overhead that should be taken into account.

In [None]:
# number of repetitions passed to timeit for each measurement
nr = 5

# The target grid is the same for every input file: open it once here instead
# of re-opening it at every iteration of the loop.
tfield = xr.open_mfdataset(tfile)

# Time the generation of the weights for every file, in three different ways.
data = []
for filein in base:

    # Build the path once, outside the timed calls, so that each measurement
    # covers only the weight generation itself.
    ifile = os.path.join(indir, filein)

    # open the source file, needed for the "from xarray" case
    xfield = xr.open_mfdataset(ifile)

    # generate weights with cdo_generate_weights, from file
    one = timeit.timeit(lambda: cdo_generate_weights(ifile, tfile, method='con'), number=nr)
    # generate weights with cdo_generate_weights, from the opened xarray objects
    two = timeit.timeit(lambda: cdo_generate_weights(xfield, tfield, method='con'), number=nr)
    # generate weights with the CDO bindings (from file)
    three = timeit.timeit(lambda: cdo.gencon(tfile, input=ifile, returnXDataset=True), number=nr)

    # stored in the same order as the column names below
    data.append([three, one, two])

# One row per file; every timing is expressed relative to the CDO bindings.
cnames = ['CDO bindings', 'CDO subprocess (from file)', 'CDO subprocess (from xarray)']
df = pd.DataFrame(data, index=base.keys(), columns=cnames)
df.div(df[cnames[0]], axis=0)


Unnamed: 0,CDO bindings,CDO subprocess (from file),CDO subprocess (from xarray)
onlytos-ipsl.nc,1.0,0.938895,1.025188
tas-ecearth.nc,1.0,0.882129,1.164144
2t-era5.nc,1.0,0.747073,0.852617
tos-fesom.nc,1.0,0.987944,1.377073
ua-ecearth.nc,1.0,0.891298,1.118633
mix-cesm.nc,1.0,0.796245,1.012632


: 