In [1]:
import pathlib
import tqdm
import pandas as pd
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import xarray
import time
import matplotlib.cm as cm
import scipy

In [37]:
# NOTEBOOK SETUP.
# CHOOSE OPTIONS, PATHS, DATE RANGES, ETC. HERE IN THIS CELL

# Switches for what to calculate.
#   True  -> run the calculation; the result is saved to a pkl file.
#   False -> load the previously saved pkl file instead.
calculate_metric_dictionary = False
calculate_flow_categories = False
calculate_metric_matrix = False

# Validation period (written month/day/year), expanded to one timestamp per day.
val_start = "10/01/1989"
val_end = "09/30/1999"
date_range = pd.date_range(val_start, val_end)

# Local directory holding all of the data (not included in the GitHub repo).
data_dir = "./data/"

In [44]:
# Which attribute table to load:
#   True  -> the comma-separated table set up to do regression
#   False -> the slightly more extensive semicolon-separated table
use_regression_attributes = True

if use_regression_attributes:
    openthis = data_dir + 'camels_attributes_v2.0/camels_attributes_v2.0_Regression.csv'
    attributes = pd.read_csv(openthis, sep=',', index_col='gauge_id')
else:
    openthis = data_dir + 'camels_attributes_v2.0/camels_attributes_v2.0.csv'
    attributes = pd.read_csv(openthis, sep=';', index_col='gauge_id')

# Catchment attributes and hydrologic signatures that are not useful
# (they are bad for the regression analysis).
drop_these = ['high_prec_timing', 'root_depth_50',
              'root_depth_99', 'zero_q_freq', 'water_frac', 'organic_frac']

# Add the basin ID as an 8-character string, left-padded with zeros if necessary.
basin_id_str = [str(gauge_id).zfill(8) for gauge_id in attributes.index.values]
attributes['basin_id_str'] = basin_id_str

# Remove the unwanted columns, then show the resulting table size.
attributes = attributes.drop(columns=drop_these)
attributes.shape

(671, 45)

In [45]:
# Get the hydrologic units for each basin.
# The first 24 lines of the site file are skipped before the header row is read.
usgs_sites = pd.read_csv(data_dir + 'usgs_site_info.csv', skiprows=24, index_col='site_no')

# Key the site table by integer site number so it can be looked up with
# int(basin_id) below. The index is relabelled in place of the old labels;
# reindex() is not used because it selects rows by label and would fill every
# label that does not already match with NaN instead of converting it.
usgs_idx_int = [int(site_no) for site_no in usgs_sites.index.values]
usgs_sites.index = pd.Index(usgs_idx_int, name=usgs_sites.index.name)

# For every basin, zero-pad its 'huc_cd' value to 8 digits and keep the
# first two characters.
basin_hydro_unit = [
    '{:08d}'.format(usgs_sites.loc[int(basin_id), 'huc_cd'])[0:2]
    for basin_id in attributes.basin_id_str.values
]
attributes['basin_hydro_unit'] = basin_hydro_unit

In [25]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ``data_dir + filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by this project or that you otherwise trust.
    """
    with open(data_dir + filename, "rb") as fb:
        return pkl.load(fb)


# Previously saved results and time series, one pkl file each.
ensemble_metrics = _load_pickle("ensemble_metrics.pkl")
individual_run_metrics = _load_pickle("individual_run_metrics.pkl")
observations = _load_pickle("observations.pkl")
simulations = _load_pickle("simulations.pkl")

In [34]:
# Show the column labels of the simulations table as an array.
simulations.columns.to_numpy()

array([('01022500', 'base_model'), ('01022500', 'base_model_states'),
       ('01022500', 'base_model_inputs'), ...,
       ('14400000', 'lagged_streamflow_states'),
       ('14400000', 'lagged_streamflow_inputs'),
       ('14400000', 'lagged_streamflow_both')], dtype=object)

In [32]:
def nse(y, y_hat):
    """Nash-Sutcliffe efficiency of ``y_hat`` measured against ``y``.

    Computes ``1 - sum((y - y_hat)**2) / sum((y - mean(y))**2)`` using only
    the positions where both ``y`` and ``y_hat`` are non-NaN.

    Parameters
    ----------
    y : numpy array or pandas Series
        Reference values (must support boolean-mask indexing).
    y_hat : numpy array or pandas Series
        Values being scored against ``y``.

    Returns
    -------
    float
        The efficiency score (1 means a perfect match). NaN is returned when
        the score is undefined: no position has both values present, or the
        retained ``y`` values have zero spread around their mean.
    """
    valid = (~np.isnan(y)) & (~np.isnan(y_hat))
    obs, sim = y[valid], y_hat[valid]

    # Nothing left to compare: avoid the empty-mean warning and the 0/0 division.
    if len(obs) == 0:
        return np.nan

    denominator = np.nansum((obs - np.nanmean(obs))**2)
    # Constant reference values make the ratio a division by zero.
    if denominator == 0:
        return np.nan

    return 1 - np.nansum((obs - sim)**2) / denominator

In [35]:
# Spot-check: score the 'base_model' simulation column of basin '01022500'
# against that basin's observations.
nse(observations['01022500'], simulations[('01022500', 'base_model')])

0.8835914484219056