# Model Evaluation :: Data Preparation

As a part of the generalized evaluation workflow: 

<img src='./Eval_PreProc.svg' width=600>

The pre-processing step is needed in order to align the two datasets for analysis.  The specific 
steps needed to prepare a given dataset may differ, depending on the source and the variable of
interest. 

Some steps might include: 

* Organizing the time-series index such that the time steps for both simulated and observed are congruent
    * This may involve interpolation to estimate a more granular time-step than is found in the source data
    * More often, an aggregating function is used to 'down-sample' the dataset to a coarser time step (days vs hours).
* Coordinate aggregation units between simulated and observed 
    * Gridded data may be sampled per HUC-12, HUC-6, etc. to match modeled data indexed by these units. 
    * Index formats may be adjusted (e.g. a 'gage_id' may be 'USGS-01104200' in one data set, vs '01104200' in another)
* Re-Chunking the data to make time-series analysis more efficient (see [here](/dev/null) for a primer on re-chunking).

At this stage, a given variable should be represented as a pair of 2D arrays of values (one for simulated, one for observed). 
One dimension of the array is indexed by some nominal data field (e.g. 'gage_id', 'HUC-12 ID', etc), while the other dimension 
is indexed by time step.

In [None]:

# Open the modeled (simulated) dataset lazily from the zarr store behind `url`.
ds_chanobs = xr.open_dataset(
    fs2.get_mapper(url),
    engine='zarr',
    backend_kwargs={'consolidated': False},
    chunks={},
)

# Convert every gage_id value to a str and strip leading whitespace.
gage_ids_str = [
    raw_id.astype('str').lstrip()
    for raw_id in ds_chanobs['gage_id'].values
]
# Peek at the first few cleaned gage IDs.
gage_ids_str[:5]

# First and last time stamps of the modeled time series.
start, stop = (ds_chanobs.time[edge].values for edge in (0, -1))
print(start, stop)


In [None]:
#import pygeohydro
from pygeohydro import NWIS
nwis = NWIS()
# use the start and stop dates above from the modeled data to extract observational data from NWIS for the same time period
dates = (start,stop)
print(dates)
%%time
ds_obs = nwis.get_streamflow(gage_ids_str[:2], dates, to_xarray=True)
# rename variables
ds_obs = ds_obs.rename_dims({'station_id':'gage_id'}).rename({'station_id':'gage_id','discharge':'streamflow'})
time_base = ds_obs.time.values

In [None]:
# Scratch location for the NWIS streamflow store -- edit this path to the
# directory where you wish to save the data.
dir_scratch = Path('/caldera/projects/usgs/water/wbbp/')
file_chanobs = dir_scratch / 'nwis_chanobs2.zarr'

# Remove any existing store at that path before writing a new one.
if file_chanobs.is_dir():
    fs.rm(str(file_chanobs), recursive=True)

len(gage_ids_str)
#source_dataset = ds_obs.drop_vars(drop_vars)
source_dataset = ds_obs

# Number of time steps in the observed dataset.
nt = len(ds_obs.time)

# Build an all-zeros template: take the layout of a single gage, then expand
# it along a new trailing 'gage_id' dimension with one slot per gage ID.
template = (
    xr.zeros_like(source_dataset.chunk())
    .isel(gage_id=0, drop=True)
    .expand_dims(gage_id=len(gage_ids_str), axis=-1)
)

# Label the gage_id axis with 'USGS-'-prefixed IDs.
template = template.assign_coords(
    {'gage_id': [f'USGS-{gage_id}' for gage_id in gage_ids_str]}
)

# One chunk per gage, spanning the full time axis.
template = template.chunk({'time': len(ds_obs.time), 'gage_id': 1})

# On-disk dtype and fill value for each variable.
encoding = {
    'alt_acy_va': {'_FillValue': -2147483647, 'dtype': np.int32},
    'alt_va': {'_FillValue': 9.96921e+36, 'dtype': np.float32},
    'dec_lat_va': {'_FillValue': None, 'dtype': np.float32},
    'dec_long_va': {'_FillValue': None, 'dtype': np.float32},
    'streamflow': {'_FillValue': 9.96921e+36, 'dtype': np.float32},
}

# Write only the store metadata (compute=False), then fill in the region that
# corresponds to the gages already held in ds_obs.
template.to_zarr(
    file_chanobs,
    mode='w',
    encoding=encoding,
    consolidated=True,
    compute=False,
)
ds_obs.to_zarr(
    file_chanobs,
    region={'time': slice(0, nt), 'gage_id': slice(0, 2)},
)


In [None]:
def ind2zarr(n):
    """Fetch NWIS streamflow for one gage and write it into the zarr store.

    The gage is ``gage_ids_str[n]``. Its record for ``dates`` is retrieved with
    ``nwis.get_streamflow``, interpolated onto ``time_base``, renamed
    (``station_id`` -> ``gage_id``, ``discharge`` -> ``streamflow``) and written
    to ``file_chanobs`` in the region ``gage_id = n`` / ``time = 0..nt``.

    This is a best-effort step: if fetching, renaming or writing fails, the
    failure is logged with its traceback and the gage is skipped, so its slot
    in the store is not written by this call.

    Parameters
    ----------
    n : int
        Position of the gage in ``gage_ids_str`` (and along the store's
        ``gage_id`` axis).

    Returns
    -------
    None
    """
    import logging

    site_id = gage_ids_str[n]
    try:
        ds_site = nwis.get_streamflow(site_id, dates, to_xarray=True).interp(time=time_base)
        ds_site = ds_site.rename_dims({'station_id': 'gage_id'}).rename(
            {'station_id': 'gage_id', 'discharge': 'streamflow'}
        )
        ds_site.to_zarr(file_chanobs, region={'time': slice(0, nt), 'gage_id': slice(n, n + 1)})
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and record the failure
        # instead of discarding it silently.
        logging.getLogger(__name__).warning(
            "ind2zarr: skipping gage %s (index %d)", site_id, n, exc_info=True
        )

In [None]:
# Obtain the client/cluster pair for the configured resource.
client, cluster = configure_cluster(resource)

# Run ind2zarr once per gage index as delayed tasks, allowing up to 10 retries.
_ = dask.compute(
    *(dask.delayed(ind2zarr)(gage_idx) for gage_idx in range(len(gage_ids_str))),
    retries=10,
)

# Consolidate the store's metadata after the region writes.
_ = consolidate_metadata(file_chanobs)
file_chanobs

# Re-open the finished store lazily and display it.
dst = xr.open_dataset(
    file_chanobs,
    engine='zarr',
    chunks={},
    backend_kwargs={'consolidated': True},
)
dst