# Testbed for netcdf processing efficiency
- Use Dask
- Chunk
- Only process data to be used

In [1]:
import sys
import time
# Add common resources folder to path
sys.path.append("/mnt/mcc-ns9600k/jonahks/git_repos/netcdf_analysis/Common/")
sys.path.append("/mnt/mcc-ns9600k/jonahks/git_repos/netcdf_analysis/")

from imports import (
    pd, np, xr, mpl, plt, sns, os, 
    datetime, sys, crt, gridspec,
    polyfit, ccrs, LinearRegression, metrics
    )

from functions import (
    masked_average, interpretNS, plot_slf_isotherms, 
    add_weights, process_caliop, process_for_slf,
    noresm_slf_to_df, regress_1d
    )

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Detect where the notebook is running, move into the data root, and define
# the relative directories used by the rest of the notebook.
host = os.uname()[1]
if 'jupyter' in host.split('-'): # Check if running on NIRD through the Jupyter Hub
    print('Running through MC2 Jupyter Hub')
    model_dir = '/mnt/mcc-ns9600k/jonahks/'
    os.chdir(model_dir)

else:  # Assume that we're running on a local machine and mounting NIRD
    print('Running on %s, attempting to mount ns9600k/jonahks/ from NIRD' % str(host))
    os.system('fusermount -zu ~/drivemount/')  # unmount first
    os.system('sshfs jonahks@login.nird.sigma2.no:"p/jonahks/" ~/drivemount/')    # Calling mountnird from .bashrc doesn't work
    # model_dir must be defined on this branch as well: the access check
    # below reads it, and it was previously only set on the Jupyter Hub
    # branch (NameError when running locally).
    model_dir = '/home/jonahks/drivemount/'
    os.chdir(model_dir)
    save_dir = '~/DATAOUT/'
    save_to = os.path.expanduser(save_dir)

# Paths below are relative to model_dir (the current working directory).
output_dir = 'figures/'
case_dir = 'mnth15runs/'   # inconsistent label compared to jupy_test
mods_dir = 'inp_validation/'

# Check that each important directory can be accessed (case_dir included,
# since the following cells read from it):
access_paths = all(
    os.path.exists(_path)
    for _path in (mods_dir, output_dir, case_dir, model_dir)
)
print('Can access all directory paths:', access_paths)

Running through MC2 Jupyter Hub
Can access all directory paths: True


In [3]:
# Show the entries of the case directory to see which runs are available.
os.listdir(case_dir)

['20200112_002538_singleparam_nudge_wbf_1_inp_0',
 '20200116_130416_nudged_wbfmods_wbf_10_inp_1',
 '20191230_130025_singleparam_cttest15_wbf_1_inp_1',
 '20191217_145440_singleparam_jolt_wbf_1_inp_1000',
 '20200110_142006_singleparam_nudge_wbf_1_inp_1000',
 '20191128_171713_sampleparamset_wbf_10_inp_1',
 '20191209_180424_sampleparamset_wbf_1_inp_0.1',
 'error_vs_iso.png',
 '.ipynb_checkpoints',
 'unused',
 '20191127_162007_sampleparamset_wbf_0.01_inp_1',
 'runs_as_vectors.png',
 '20191210_152149_sampleparamset_wbf_1_inp_0.1',
 '20200109_1541_wbf_1.0_inp_1.0',
 '20200128_142303_singleparam_frzrtvarsm15_wbf_1_inp_100',
 '20191128_171713_sampleparamset_wbf_1_inp_10',
 '20200204_120214_singleparam_wbfcheck_wbf_0.1_inp_1',
 '20191219_151155_singleparam_cttest_wbf_1_inp_1.cam.h0.0001-01',
 '20200204_113441_singleparam_inpcheck_wbf_1_inp_10',
 '20191210_152149_sampleparamset_wbf_1_inp_10',
 '20191217_134307_singleparam_jolt_wbf_1_inp_0',
 '20200116_130416_nudged_wbfmods_wbf_0.01_inp_1',
 'para

In [4]:
# Runs compared in this notebook. Each name is an entry of case_dir and is
# also the stem of the netCDF file stored inside that run's directory.
cases = [
    '20200109_1541_wbf_1.0_inp_1.0',
    '20200110_142006_singleparam_nudge_wbf_1_inp_1000',
    '20200112_002538_singleparam_nudge_wbf_1_inp_0',
    '20200116_130416_nudged_wbfmods_wbf_0.01_inp_1',
    '20200204_113441_singleparam_inpcheck_wbf_1_inp_10',
    '20200204_120214_singleparam_wbfcheck_wbf_0.1_inp_1',
    '20200128_142303_singleparam_frzrtvarsm15_wbf_1_inp_100',
    '20200116_130416_nudged_wbfmods_wbf_10_inp_1',
]

## Working with a single dataset of 15 mnths

In [5]:
# Use the first entry of `cases` as the single dataset for the timing tests below.
case = cases[0]

In [6]:
# Baseline: open the case with open_mfdataset's default arguments (no explicit
# `combine`, no explicit chunking) and report the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
ds0 = xr.open_mfdataset('%s/%s/%s.nc' % (case_dir, case, case))
t2 = time.perf_counter()
print(t2 - t1)

0.5450055599212646


will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  
to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).
  from_openmfds=True,


In [7]:
# Same file as the baseline, but with combine='by_coords' passed explicitly;
# report the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
ds1 = xr.open_mfdataset('%s/%s/%s.nc' % (case_dir, case, case), combine='by_coords')
t2 = time.perf_counter()
print(t2 - t1)

0.4626476764678955


In [8]:
# Open the same file with explicit dask chunking: 10 latitudes and a single
# time step per chunk; report the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
ds2 = xr.open_mfdataset('%s/%s/%s.nc' % (case_dir, case, case), combine='by_coords', chunks={'lat':10, 'time':1})
t2 = time.perf_counter()
print(t2 - t1)

1.1622588634490967


## This will probably kill the kernel

In [None]:
# Open the same file with even finer chunking (also one vertical level per
# chunk); report the elapsed seconds.
# WARNING: this configuration has been observed to crash the kernel, so run
# it deliberately rather than as part of "Run All".
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
ds3 = xr.open_mfdataset('%s/%s/%s.nc' % (case_dir, case, case), combine='by_coords', chunks={'lat':10, 'time':1, 'lev':1})
t2 = time.perf_counter()
print(t2 - t1)

## Chunking takes longer
And causes the kernel to crash sometimes too. Weird.

## Dask does not change the time to load datasets


In [9]:
# Datasets to compare in the timing loops below.
datasets = [ds0,ds1,ds2] # excluding ds3 for now

## Just creating a new variable doesn't take long. Same time for all datasets

In [10]:
# Time how long it takes to *define* the TS/PS ratio for each dataset.
for _ds in datasets:
    t0 = time.perf_counter()  # monotonic clock, suited to short intervals
    new_var = _ds['TS']/_ds['PS']
    # Stop the clock before printing so the cost of building and writing the
    # repr is not counted as part of creating the variable.
    tf = time.perf_counter()
    print(new_var)
    print(tf-t0)

<xarray.DataArray (time: 15, lat: 96, lon: 144)>
dask.array<truediv, shape=(15, 96, 144), dtype=float32, chunksize=(15, 96, 144), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2000-02-01 2000-03-01 ... 2001-04-01
  * lat      (lat) float64 -90.0 -88.11 -86.21 -84.32 ... 84.32 86.21 88.11 90.0
  * lon      (lon) float64 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
0.006802797317504883
<xarray.DataArray (time: 15, lat: 96, lon: 144)>
dask.array<truediv, shape=(15, 96, 144), dtype=float32, chunksize=(15, 96, 144), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2000-02-01 2000-03-01 ... 2001-04-01
  * lat      (lat) float64 -90.0 -88.11 -86.21 -84.32 ... 84.32 86.21 88.11 90.0
  * lon      (lon) float64 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
0.005956888198852539
<xarray.DataArray (time: 15, lat: 96, lon: 144)>
dask.array<truediv, shape=(15, 96, 144), dtype=float32, chunksize=(1, 10, 144), chunktype=numpy.ndarray>
Coordinate

## Computing and printing the new 2D variable doesn't take very long either. Weird...

In [11]:
# Time how long it takes to define the TS/PS ratio AND materialise its values
# (accessing .values forces the computation) for each dataset.
for _ds in datasets:
    t0 = time.perf_counter()  # monotonic clock, suited to short intervals
    new_var = _ds['TS']/_ds['PS']
    _blank = new_var.values
    tf = time.perf_counter()
    print(tf-t0)

0.012898921966552734
0.00898432731628418
0.2608156204223633


## Checking with higher-dimensional variables

In [16]:
# Time how long it takes to *define* the AREI/FREQI ratio for each dataset
# (no values are requested here).
for _ds in datasets:
    t0 = time.perf_counter()  # monotonic clock, suited to short intervals
    new_var = _ds['AREI']/_ds['FREQI']
    tf = time.perf_counter()
    print(tf-t0)

0.002754688262939453
0.0025043487548828125
0.002923250198364258


In [19]:
# Time defining the AREI/FREQI ratio AND materialising its values.
# The last entry of `datasets` is skipped.
for _ds in datasets[:-1]:
    t0 = time.perf_counter()  # monotonic clock, suited to short intervals
    new_var = _ds['AREI']/_ds['FREQI']
    _blank = new_var.values
    tf = time.perf_counter()
    print(tf-t0)

0.0692603588104248
0.07379651069641113


  return func(*args2)
  return func(*args2)


In [22]:
# Shape of the array left over from the last timing loop that was run
# (presumably ordered time, lev, lat, lon — TODO confirm).
_blank.shape

(15, 32, 96, 144)

# Chunking seems to be worse all around
Now just about any computation is killing the kernel

## The first run takes much longer, so running a cell several times hides the actual computational cost

## Subselecting data does save significant time

In [35]:
# Compare the cost of computing AREI/FREQI on the full dataset against two
# sub-selections: latitudes 70-90 only, and the first time step only.
# The last entry of `datasets` is skipped.
for _ds in datasets[:-1]:
    # Build the selections outside the timed region, as before.
    somelats = _ds.sel(lat=slice(70,90))
    sometimes = _ds.isel(time=0)

    # One timing per selection, printed in the order: full, lat subset,
    # single time step. The same stanza was previously repeated three times.
    for _subset in (_ds, somelats, sometimes):
        t0 = time.perf_counter()  # monotonic clock, suited to short intervals
        new_var = _subset['AREI']/_subset['FREQI']
        _blank = new_var.values  # .values forces the computation
        tf = time.perf_counter()
        print(tf-t0)

  return func(*args2)
  return func(*args2)
  return func(*args2)


0.06509041786193848
0.01690053939819336
0.00967717170715332
0.07059884071350098
0.018398761749267578
0.010392904281616211


  return func(*args2)
  return func(*args2)
  return func(*args2)
