# Workflow for pre-processing data for machine learning

### Import modules

In [12]:
%matplotlib inline
import xarray as xr

### Process CLM output from PPE for SVD

Set number of ensemble members

In [13]:
# Total ensemble size: one history-file glob is built per member from range(niter).
niter = 100

Set output path and PPE specification

In [14]:
# Each member's directory is formed as path + PPE + <member number>,
# so `path` must keep its trailing slash and `PPE` its trailing underscore.
path = "/glade/scratch/kdagon/archive/"
PPE = "hydro_ensemble_LHC_"

Set output variables of interest

In [15]:
# Variable names that preprocess() selects from every history file as it is opened.
var = ['FPSN', 'EFLX_LH_TOT']

Read in the data mask from the regridded observations file\
(Currently, observation sampling and regridding are done in NCL files)

In [5]:
# Relative path: resolved against the notebook's working directory.
# Note that open_dataset returns an xarray Dataset, despite the `da_` prefix.
da_obs = xr.open_dataset("obs/MR_GPP_4x5_2000_2004.nc")

In [6]:
# Pull the `datamask` variable out of the observations dataset.
dm_obs = da_obs.datamask

In [7]:
#dm_obs.plot()

Read in CLM PPE output

In [32]:
#em = [i+1 for i in range(niter)]
#print(em)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [22]:
#full_paths = [path+PPE+str(i+1)+"/lnd/hist/*{001[6-9],20-}*" for i in range(niter)]
# One wildcard pattern per ensemble member (members are numbered from 1),
# matching everything in that member's land-history directory.
full_paths = [
    "".join((path, PPE, str(member), "/lnd/hist/*"))
    for member in range(1, niter + 1)
]
# Preview the first ten patterns.
full_paths[:10]

['/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_2/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_3/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_4/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_5/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_6/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_7/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_8/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_9/lnd/hist/*',
 '/glade/scratch/kdagon/archive/hydro_ensemble_LHC_10/lnd/hist/*']

In [44]:
import glob
#full_paths[0]
#glob.glob(full_paths[0])
#exlist = glob.glob('/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/*')
#sorted(exlist)
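# Python's glob handles *, ? and [] character ranges (e.g. 001[6-9]), and dashes are literal,
# but it has no shell-style brace expansion, so '*{001[6-9],20-}*' matches nothing.
# Options: glob each alternative separately ('*001[6-9]*' and '*20-*') and join the lists,
# glob with * and apply a second filtering step,
# OR for loop over specific years and months with * only.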
import os
#os.path.expanduser(full_paths[0]) # just for home dir ~?
#full_paths[0]
#full_list = [glob.glob(full_paths[i]) for i in range(niter)]
#full_list

'/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/*'

In [30]:
# Plan: open each ensemble member as its own dataset (this cell tries member 1 only),
# then xr.concat the list of datasets along a new ensemble-member dimension
# so that ensemble membership is preserved as a dimension of the result.
def preprocess(ds):
    """Reduce an opened dataset to the variables named in the global ``var``.

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : dataset-like object supporting ``ds[list_of_names]`` selection.

    Returns
    -------
    The result of indexing ``ds`` with ``var``.
    """
    selected = ds[var]
    return selected
# Trial run on the first member only: open all files matching its pattern,
# subset each one with preprocess(), and combine them by coordinate values.
da_model = xr.open_mfdataset(
    full_paths[0],
    preprocess=preprocess,
    combine='by_coords',
)
#da_model = xr.open_mfdataset(full_paths[0], data_vars=var, combine='by_coords')

In [32]:
# One combined dataset per ensemble member. Only the first 10 member patterns
# are opened here, not the full list in full_paths.
da_model = [
    xr.open_mfdataset(member_glob, preprocess=preprocess, combine='by_coords')
    for member_glob in full_paths[:10]
]

In [41]:
# Member labels 1..N for the new 'ens' dimension. N is taken from the list of
# opened datasets rather than hard-coded, so the labels always match the number
# of datasets passed to xr.concat even if the slice of full_paths changes.
n_members = len(da_model)
ensdim = xr.DataArray(list(range(1, n_members + 1)), dims='ens', name='ens')
ensdim

<xarray.DataArray 'ens' (ens: 10)>
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
Dimensions without coordinates: ens

In [42]:
#da_model_concat = xr.concat(da_model, dim='ens')
# Concatenate the per-member datasets along a new dimension. Passing the
# DataArray (rather than the bare string 'ens') also supplies the member
# numbers as the values of the resulting 'ens' coordinate.
da_model_concat = xr.concat(da_model, dim=ensdim)

In [43]:
da_model_concat

<xarray.Dataset>
Dimensions:      (ens: 10, lat: 46, lon: 72, time: 60)
Coordinates:
  * lon          (lon) float32 0.0 5.0 10.0 15.0 ... 340.0 345.0 350.0 355.0
  * lat          (lat) float32 -90.0 -86.0 -82.0 -78.0 ... 78.0 82.0 86.0 90.0
  * time         (time) object 0016-02-01 00:00:00 ... 0021-01-01 00:00:00
  * ens          (ens) int64 1 2 3 4 5 6 7 8 9 10
Data variables:
    FPSN         (ens, time, lat, lon) float32 dask.array<shape=(10, 60, 46, 72), chunksize=(1, 1, 46, 72)>
    EFLX_LH_TOT  (ens, time, lat, lon) float32 dask.array<shape=(10, 60, 46, 72), chunksize=(1, 1, 46, 72)>
Attributes:
    title:                                     CLM History file information
    comment:                                   NOTE: None of the variables ar...
    Conventions:                               CF-1.0
    history:                                   created on 05/28/18 20:36:54
    source:                                    Community Land Model CLM4.0
    hostname:               

In [11]:
!ls /glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/*{001[6-9],20-}*

/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-01.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-02.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-03.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-04.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-05.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-06.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-07.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-08.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-09.nc
/glade/scratch/kdagon/archive/hydro_ensemble_LHC_1/lnd/hist/hydro_ensemble_LHC_1.clm2.h0.0016-10.nc
