In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from math import nan
import glob
import dask
import datetime
from dateutil.relativedelta import relativedelta
from functools import partial
import pandas as pd

# Turn on dask's 'array.slicing.split_large_chunks' option for the whole session
dask.config.set({'array.slicing.split_large_chunks': True})

<dask.config.set at 0x2b4700f4d950>

### Set up basic info

In [2]:
# --- hindcast period ---
ystart = 1970  # start year of hindcasts
yend = 2020  # end year of hindcasts (inclusive)
nyears = 1 + (yend - ystart)

# --- initialization month, as an integer and as a zero-padded two-character string ---
initmon = 9
initmonstr = f"{initmon:02d}"

# --- hindcast members: zero-padded three-character labels "001", "002", ... ---
nmems = 20
memstr = [f"{imem:03d}" for imem in range(1, nmems + 1)]

# --- experiment name and input/output directories ---
expname = "b.e21.BSMYLE-CW3E"
topdir = "/glade/campaign/cesm/development/espwg/SMYLE-CW3E/timeseries/daily/Uzm/"
outpath = "/glade/campaign/cgd/cas/islas/python/smyle_cw3e/DATA_SORT/outputzm/"

### Set up the dask cluster

In [3]:
from dask.distributed import Client
from dask_jobqueue import PBSCluster

# Settings for each PBS job: one core and one worker process per job, 10GB of memory
pbs_worker_options = dict(
    cores=1,
    processes=1,
    memory='10GB',
    resource_spec='select=1:ncpus=1:mem=10GB',
    queue='casper',
    project='P04010022',
    walltime='03:00:00',
    local_directory='$TMPDIR',
    interface='ib0',
)
cluster = PBSCluster(**pbs_worker_options)

# request 20 workers
cluster.scale(20)

# point the dashboard link at the JupyterHub proxy URL so the dashboard can be opened
dask.config.set({'distributed.dashboard.link':'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'})

# connect a client to the cluster
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46085 instead
  f"Port {expected} is already in use.\n"


In [4]:
# Display the cluster summary (dashboard link, worker count, memory) as the cell output
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/islas/proxy/46085/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.12.206.54:40019,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/islas/proxy/46085/status,Total threads: 0
Started: Just now,Total memory: 0 B


### Set up the file list

In [5]:
# Build a nested list of input files: the outer list has one entry per initialization
# year, and each entry is a list holding one file per hindcast member.
files = []
for iyear in range(ystart, yend + 1):
    filest = []
    for imem in memstr:
        pattern = topdir+expname+"*."+imem+".cam.h1.Uzm."+str(iyear)+initmonstr+"01-*.nc"
        # sorted so the chosen file is reproducible if the pattern matches more than one
        matches = sorted(glob.glob(pattern))
        if not matches:
            # name the missing file pattern instead of failing with a bare IndexError
            raise FileNotFoundError("No file found matching " + pattern)
        filest.append(matches[0])
    files.append(filest)

### Set up pre-processor

In [6]:
# pre-processor to ensure all initialization dates have the same time axis
def preprocessor(ds):
    """Put one opened file onto a common, year-independent daily time axis.

    Steps:
      1. Replace ``time`` with the mean of the two ``time_bnds`` values of each sample.
      2. Keep only the samples whose new time falls at hour 12.
      3. Relabel the remaining samples as consecutive days, starting from the
         month/day/hour of the first sample placed in the fixed reference year 1970,
         so that every file ends up with the same time coordinate.
      4. Select index 0 along ``zlon``, which removes that dimension.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset for a single file; must provide ``time_bnds`` and a ``zlon`` dimension.

    Returns
    -------
    xarray.Dataset
        The dataset with the rewritten ``time`` coordinate and ``zlon`` selected.

    Raises
    ------
    ValueError
        If no sample falls at hour 12 after the time axis is re-centred.
    """
    # sort out the times so that each member has the same time axis:
    # average the time bounds as integer seconds to get the centre of each interval
    timebndavg = np.array(ds.time_bnds,
                         dtype = 'datetime64[s]').view('i8').mean(axis=1).astype('datetime64[s]')
    ds['time'] = timebndavg

    # Select along time only.  Dataset.where() masks instead of selecting: it broadcasts
    # variables that have no time dimension along time and promotes integers to float.
    at_hour_12 = (ds.time.dt.hour == 12).values
    if not at_hour_12.any():
        raise ValueError("preprocessor: no time samples fall at hour 12")
    ds = ds.isel(time=at_hour_12)

    # Same month/day/hour as the first sample, but in the fixed reference year 1970
    first = pd.Timestamp(ds.time.values[0])
    datestart = pd.Timestamp(year=1970, month=first.month, day=first.day, hour=first.hour)

    # One label per remaining sample, spaced exactly one day apart
    ds['time'] = pd.date_range(datestart, periods=ds.sizes['time'], freq='D')
    ds = ds.isel(zlon=0)
    return ds

In [7]:
# Open all files and stack them along two new dimensions, 'init_year' (outer level of
# the nested `files` list) and 'M' (inner level); only Uzm is concatenated.
dat = xr.open_mfdataset(files, combine='nested', concat_dim=['init_year','M'],
                        parallel=True, data_vars=['Uzm'], coords='minimal', compat='override',
                        preprocess=preprocessor)  # passed directly: partial() with no bound arguments was a no-op
dat['init_year'] = np.arange(ystart,yend+1,1)
dat = dat.Uzm

# Load the data into memory and write it out as a single netCDF file
dat.load().to_netcdf(outpath+"Uzm_BSMYLE-CW3E_day_init"+initmonstr+".nc")

# Disconnect the client before shutting the cluster down
client.close()
cluster.close()

In [20]:
# Shut down the dask cluster.  The preceding cell already ends with cluster.close();
# NOTE(review): presumably kept as a manual fallback if that cell stops early — confirm.
cluster.close()