### Running bulk of multimodel testing

This section is equivalent to the testing code present in the multimodel wrapper.


In [10]:
from config.read_configurations import config_hbv as hbvArgs
from config.read_configurations import config_prms as prmsArgs
from config.read_configurations import config_sacsma as sacsmaArgs
from config.read_configurations import config_sacsma_snow as sacsmaSnowArgs
from config.read_configurations import config_hbv_hydrodl as hbvhyArgs_d


import torch
import os
import platform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats
# from post import plot

from core.utils.randomseed_config import randomseed_config
from core.utils.master import create_output_dirs
from MODELS.loss_functions.get_loss_function import get_lossFun
from MODELS.test_dp_HBV_dynamic import test_dp_hbv
from core.data_processing.data_loading import loadData
from core.data_processing.normalization import transNorm
from core.utils.randomseed_config import randomseed_config
from core.data_processing.model import (
    take_sample_test,
    converting_flow_from_ft3_per_sec_to_mm_per_day
)

import warnings
warnings.filterwarnings("ignore")



##-----## Multi-model Parameters ##-----##
##--------------------------------------##
# Setting dictionaries to separately manage each differentiable model's attributes.
# `models` maps a model name to its instance; every entry starts as None.
models = {'hbvhy_dynamic': None, 'SACSMA_w_snow':None, 'marrmot_PRMS':None}  # 'hbv':None, 'hbvhy': None, 'SACSMA_w_snow':None, 'SACSMA':None,
# `args_list` maps the same model names to the configuration objects imported above.
args_list = {'hbvhy_dynamic': hbvhyArgs_d, 'SACSMA_w_snow':sacsmaSnowArgs, 'marrmot_PRMS':prmsArgs}   # 'hbvhy': hbvhyArgs, 'hbv' : hbvArgs, 'SACSMA_w_snow':None, 'SACSMA': sacsmaArgs,
# Strategy used to merge per-model statistics into one ensemble value.
ENSEMBLE_TYPE = 'max'  # 'median', 'avg', 'max', 'softmax'

# Set path to `hydro_multimodel_results` directory.
# The location (and which cached arrays get loaded) depends on the host OS.
if platform.system() == 'Darwin':
    # For mac os
    OUT_DIR = '/Users/leoglonz/Desktop/water/data/model_runs/hydro_multimodel_results'
    # Some operations are not yet working with MPS, so we might need to set some environment variables to use CPU fall instead
    # %env PYTORCH_ENABLE_MPS_FALLBACK=1

    # Load test observations and predictions from a prior run.
    # The .npy files hold pickled objects; `.item()` unwraps the 0-d object array.
    pred_path = os.path.join(OUT_DIR, "multimodels/671_sites_dp/output/preds_671_HBV_SACSMASnow_PRMS_dynamic.npy")
    obs_path = os.path.join(OUT_DIR, "multimodels/671_sites_dp/output/obs_671_HBV_SACSMASnow_PRMS_dynamic.npy")
    preds = np.load(pred_path, allow_pickle=True).item()
    obs = np.load(obs_path, allow_pickle=True).item()

    # Names consumed by the evaluation cell further down.
    model_output = preds
    y_obs = obs

elif platform.system() == 'Windows':
    # For windows
    OUT_DIR = 'D:\\data\\model_runs\\hydro_multimodel_results\\'

    # Load test predictions from a prior run.
    # NOTE(review): unlike the Darwin branch, only `preds` is loaded here and
    # neither `model_output` nor `y_obs` is defined in this branch.
    path = os.path.join(OUT_DIR, "multimodels\\671_sites_dp\\output\\preds_671_HBV_SACSMASnow_PRMS.npy")
    preds = np.load(path, allow_pickle=True).item()

elif platform.system() == 'Linux':
    # For Colab
    # NOTE(review): this branch only sets OUT_DIR; no cached arrays are loaded.
    OUT_DIR = '/content/drive/MyDrive/Colab/data/model_runs/hydro_multimodel_results'

    # Load test predictions from a prior run on colab.
    # path = os.path.join(OUT_DIR, "multimodels\\671_sites_dp\\output\\preds_671_HBV_SACSMASnow_PRMS_dynamic.npy")
    # preds = np.load(path, allow_pickle=True).item()

else:
    raise ValueError('Unsupported operating system.')

In [5]:
def test_differentiable_model(args, diff_model):
    """
    Run `diff_model` over the test period and gather what is needed for
    statistical analysis.

    The test-period data are loaded, the NN inputs are normalized and
    combined into `inputs_NN_scaled`, and the model is evaluated in
    consecutive basin batches.

    Side effects:
        * writes the raw `x_NN` array to `<args["out_dir"]>/x.npy`;
        * overwrites `args["batch_size"]` with `args["no_basins"]`. When
          rerunning in a Jupyter environment, re-import `args` first, as a
          second overwrite attempt is reported to throw an error.

    Returns:
        (list_out_diff_model, y_obs): a list with one dict of detached CPU
        tensors per batch, and the observations (warm-up period removed) as
        returned by `converting_flow_from_ft3_per_sec_to_mm_per_day`.
    """
    warm_up = args["warm_up"]
    nmul = args["nmul"]  # read from the config but not used below
    diff_model.eval()

    # Load the data for the test time range.
    data = loadData(args, trange=args["t_test"])
    # Saved with the overlap in the beginning.
    np.save(os.path.join(args["out_dir"], "x.npy"), data["x_NN"])

    # Normalize the forcings and the attributes; the attributes are
    # repeated along the leading axis so both can be concatenated on axis 2.
    forcings_scaled = transNorm(args, data["x_NN"], varLst=args["varT_NN"], toNorm=True)
    attrs_scaled = transNorm(args, data["c_NN"], varLst=args["varC_NN"], toNorm=True)
    attrs_scaled = np.repeat(
        np.expand_dims(attrs_scaled, 0), forcings_scaled.shape[0], axis=0
    )
    data["inputs_NN_scaled"] = np.concatenate((forcings_scaled, attrs_scaled), axis=2)
    del forcings_scaled, data["x_NN"]

    # Convert every remaining numpy array to a float torch tensor.
    for name in list(data.keys()):
        data[name] = torch.from_numpy(data[name]).float()

    # args_mod = args.copy()
    args["batch_size"] = args["no_basins"]
    _, ngrid, _ = data["inputs_NN_scaled"].shape

    # Start and end basin indices of each batch.
    batch_size = args["batch_size"]
    starts = np.arange(0, ngrid, batch_size)
    ends = np.append(starts[1:], ngrid)

    list_out_diff_model = []
    for start, end in tqdm(list(zip(starts, ends)), unit='Batch'):
        batch = take_sample_test(args, data, start, end)
        batch_out = diff_model(batch)
        # Move every output tensor to the CPU and detach it from the graph.
        list_out_diff_model.append(
            {key: tensor.cpu().detach() for key, tensor in batch_out.items()}
        )

    # Drop the warm-up period from the observations before conversion.
    y_obs = converting_flow_from_ft3_per_sec_to_mm_per_day(
        args,
        data["c_NN"],
        data["obs"][warm_up:, :, :],
    )

    return list_out_diff_model, y_obs

In [13]:
######## NOTE: As of now, testing for this cudnn_rnn model cannot be run with mps or cpu on MAC m-series architecture ##########


# loss_funcs = dict()
# model_output = dict()
# y_obs = dict()

# for mod in models:
#     mod = str(mod)

#     if mod in ['SACSMA', 'SACSMA_w_snow', 'marrmot_PRMS', 'hbv']:
#         randomseed_config(seed=args_list[mod]["randomseed"][0])
#         # Creating output directories and adding them to args.
#         args_list[mod] = create_output_dirs(args_list[mod])
#         args = args_list[mod]

#         loss_funcs[mod] = get_lossFun(args_list[mod])

#         modelFile = os.path.join(args["out_dir"], "model_Ep" + str(args['EPOCHS']) + ".pt")
#         models[mod] = torch.load(modelFile)     # Append instanced models.

#         print("Collecting predictions, observations for %s in batches of %i." %(mod, args['no_basins']))
#         model_output[mod], y_obs[mod] = test_differentiable_model(args=args,
#                                                                   diff_model=models[mod])
#     elif mod in ['hbvhy', 'hbvhy_dynamic']:
#         print("Collecting predictions, observations for HBV (HydroDL).")
#         model_output[mod], y_obs[mod] = test_dp_hbv()
#     else:
#         raise ValueError(f"Unsupported model type in `models`.")


Collecting predictions, observations for HBV (HydroDL).
daymet tmean was used!
Time to read usgs streamflow:  12.833029747009277
Time to read usgs streamflow:  11.846182584762573
daymet tmean was used!
Time to read usgs streamflow:  11.661510944366455
Time to read usgs streamflow:  11.792118072509766
daymet tmean was used!
Time to read usgs streamflow:  11.829334497451782
Time to read usgs streamflow:  12.348673582077026
read usgs streamflow 15.64434266090393
read master file /content/drive/MyDrive/Colab/data/model_runs/rnnStreamflow/CAMELSDemo/dPLHBV/ALL/TDTestforc/TD1_13/daymet/BuffOpt0/RMSE_para0.25/111111/Fold1/T_19801001_19951001_BS_100_HS_256_RHO_365_NF_13_Buff_365_Mul_16/master.json
Using device  cpu


100%|██████████| 23/23 [06:45<00:00, 17.63s/Batch]


Collecting predictions, observations for SACSMA_w_snow in batches of 25.


100%|██████████| 27/27 [08:24<00:00, 18.69s/Batch]


Collecting predictions, observations for marrmot_PRMS in batches of 25.


100%|██████████| 27/27 [04:59<00:00, 11.09s/Batch]


In [14]:
# Cache the collected model output and observations as .npy files so that we
# don't waste time having to recollect data every time we start up this notebook.

# Build the directory from separate components: a literal "\\" is only a path
# separator on Windows and would become part of a single directory name on
# Linux/macOS.
path = os.path.join(OUT_DIR, "multimodels", "671_sites_dp", "output")
os.makedirs(path, exist_ok=True)

# NOTE(review): these file names ("..._HBVdynamic_SACSMASnow_PRMS.npy") differ
# from the ones loaded in the configuration cell
# ("..._HBV_SACSMASnow_PRMS_dynamic.npy") -- confirm which naming is intended.
np.save(os.path.join(path, "preds_671_HBVdynamic_SACSMASnow_PRMS.npy"), model_output)
np.save(os.path.join(path, "obs_671_HBVdynamic_SACSMASnow_PRMS.npy"), y_obs)


In [13]:
def calFDC(data):
    """
    Compute a 100-point flow duration curve (FDC) for every row of `data`.

    Args:
        data: 2-D array of shape (Ngrid, Nday); NaN entries are ignored.

    Returns:
        Array of shape (Ngrid, 100). Each row holds the row's non-NaN values
        sorted from large to small and sampled at 100 evenly spaced
        positions. A row with no valid data yields all zeros.
    """
    Ngrid, Nday = data.shape
    FDC100 = np.full([Ngrid, 100], np.nan)
    # Fractional positions of the 100 sample points (loop invariant).
    quantile_frac = np.arange(100) / 100
    for ii in range(Ngrid):
        row = data[ii, :]
        valid = row[~np.isnan(row)]
        # deal with no data case for some gages
        if len(valid) == 0:
            valid = np.zeros(Nday)
        # sort from large to small
        flow_desc = np.sort(valid)[::-1]
        # select 100 quantile points; `ind` always has exactly 100 entries,
        # so no separate length check is needed.
        ind = (quantile_frac * len(valid)).astype(int)
        FDC100[ii, :] = flow_desc[ind]

    return FDC100


def statError(pred, target):
    """
    Compute per-row error statistics between predictions and targets.

    Args:
        pred: 2-D array of shape (ngrid, nt) with predicted values.
        target: array of the same shape with the reference values.

    Returns:
        dict mapping metric name to an array of length ngrid. Keys: Bias,
        RMSE, ubRMSE, Corr, CorrSp, R2, NSE, FLV, FHV, PBias, PBiasother,
        KGE, KGE12, fdcRMSE, lowRMSE, highRMSE, midRMSE.

    Notes:
        * RuntimeWarnings (e.g. all-NaN rows, division by zero) are silenced
          while the metrics are computed; the affected entries come out as
          NaN/inf instead.
        * R2 and NSE are computed with the same expression (1 - SSRes/SST),
          so the two arrays are identical.
        * The low/high/other flow metrics compare the independently sorted
          prediction and target values, not time-paired values.
    """
    ngrid, nt = pred.shape
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        # Bias
        Bias = np.nanmean(pred - target, axis=1)
        # RMSE
        RMSE = np.sqrt(np.nanmean((pred - target)**2, axis=1))
        # ubRMSE: RMSE of the anomalies (each row's own mean removed)
        predMean = np.tile(np.nanmean(pred, axis=1), (nt, 1)).transpose()
        targetMean = np.tile(np.nanmean(target, axis=1), (nt, 1)).transpose()
        predAnom = pred - predMean
        targetAnom = target - targetMean
        ubRMSE = np.sqrt(np.nanmean((predAnom - targetAnom)**2, axis=1))
        # FDC metric
        predFDC = calFDC(pred)
        targetFDC = calFDC(target)
        FDCRMSE = np.sqrt(np.nanmean((predFDC - targetFDC) ** 2, axis=1))
        # rho R2 NSE and the flow-regime metrics, filled row by row below;
        # rows without usable data keep NaN.
        Corr = np.full(ngrid, np.nan)
        CorrSp = np.full(ngrid, np.nan)
        R2 = np.full(ngrid, np.nan)
        NSE = np.full(ngrid, np.nan)
        PBiaslow = np.full(ngrid, np.nan)
        PBiashigh = np.full(ngrid, np.nan)
        PBias = np.full(ngrid, np.nan)
        PBiasother = np.full(ngrid, np.nan)
        KGE = np.full(ngrid, np.nan)
        KGE12 = np.full(ngrid, np.nan)
        RMSElow = np.full(ngrid, np.nan)
        RMSEhigh = np.full(ngrid, np.nan)
        RMSEother = np.full(ngrid, np.nan)
        for k in range(0, ngrid):
            x = pred[k, :]
            y = target[k, :]
            # Keep only the time steps where both series are valid.
            ind = np.where(np.logical_and(~np.isnan(x), ~np.isnan(y)))[0]
            if ind.shape[0] > 0:
                xx = x[ind]
                yy = y[ind]
                # percent bias
                PBias[k] = np.sum(xx - yy) / np.sum(yy) * 100

                # FHV: bias of the peak flows (top 2 %)
                # FLV: bias of the low flows (bottom 30 %); computed here on
                # the raw values, no log transform is applied.
                pred_sort = np.sort(xx)
                target_sort = np.sort(yy)
                indexlow = round(0.3 * len(pred_sort))
                indexhigh = round(0.98 * len(pred_sort))
                lowpred = pred_sort[:indexlow]
                highpred = pred_sort[indexhigh:]
                otherpred = pred_sort[indexlow:indexhigh]
                lowtarget = target_sort[:indexlow]
                hightarget = target_sort[indexhigh:]
                othertarget = target_sort[indexlow:indexhigh]
                PBiaslow[k] = np.sum(lowpred - lowtarget) / np.sum(lowtarget) * 100
                PBiashigh[k] = np.sum(highpred - hightarget) / np.sum(hightarget) * 100
                PBiasother[k] = np.sum(otherpred - othertarget) / np.sum(othertarget) * 100
                RMSElow[k] = np.sqrt(np.nanmean((lowpred - lowtarget)**2))
                RMSEhigh[k] = np.sqrt(np.nanmean((highpred - hightarget)**2))
                RMSEother[k] = np.sqrt(np.nanmean((otherpred - othertarget)**2))

                if ind.shape[0] > 1:
                    # Theoretically at least two points for correlation
                    Corr[k] = scipy.stats.pearsonr(xx, yy)[0]
                    CorrSp[k] = scipy.stats.spearmanr(xx, yy)[0]
                    yymean = yy.mean()
                    yystd = np.std(yy)
                    xxmean = xx.mean()
                    xxstd = np.std(xx)
                    KGE[k] = 1 - np.sqrt((Corr[k]-1)**2 + (xxstd/yystd-1)**2 + (xxmean/yymean-1)**2)
                    KGE12[k] = 1 - np.sqrt((Corr[k] - 1) ** 2 + ((xxstd*yymean)/ (yystd*xxmean) - 1) ** 2 + (xxmean / yymean - 1) ** 2)
                    SST = np.sum((yy-yymean)**2)
                    SSRes = np.sum((yy-xx)**2)
                    R2[k] = 1-SSRes/SST
                    NSE[k] = 1-SSRes/SST

    outDict = dict(Bias=Bias, RMSE=RMSE, ubRMSE=ubRMSE, Corr=Corr, CorrSp=CorrSp, R2=R2, NSE=NSE,
                   FLV=PBiaslow, FHV=PBiashigh, PBias=PBias, PBiasother=PBiasother, KGE=KGE, KGE12=KGE12, fdcRMSE=FDCRMSE,
                   lowRMSE=RMSElow, highRMSE=RMSEhigh, midRMSE=RMSEother)

    return outDict

In [16]:
class hydroEnsemble(torch.nn.Module):
    """
    LSTM-based selector over the values produced by several hydrologic models.

    An LSTM followed by a linear layer and a softmax yields one weight per
    model for every row of the input; `forward` returns, for each row, the
    value of the model with the largest weight.

    In future, consider just passing the models you want to ensemble explicitly.

    Args:
        num_models: number of models being ensembled (LSTM input size and
            number of output weights).
        hidden_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
    """
    def __init__(self, num_models, hidden_size, num_layers):
        super().__init__()

        self.lstm = torch.nn.LSTM(num_models, hidden_size, num_layers, batch_first=True)
        # One output weight per model.
        self.fc = torch.nn.Linear(hidden_size, num_models)

    def forward(self, x):
        """
        Select, for every row of `x`, the value of the highest-weighted model.

        Args:
            x: 2-D float tensor of shape (n_rows, num_models). The rows are
               fed to the LSTM as one sequence of length n_rows.

        Returns:
            1-D tensor of length n_rows with the selected value per row.
        """
        # Setting randomseed for deterministic output.
        # NOTE(review): the layers are already initialised in __init__, so
        # seeding here presumably does not make their weights reproducible
        # -- confirm against `randomseed_config`.
        randomseed_config(0)

        # Add a leading batch dimension: (1, n_rows, num_models).
        x_exp = x.unsqueeze(0)

        # LSTM layer
        lstm_out, _ = self.lstm(x_exp)

        # Fully connected layer
        fc_out = self.fc(lstm_out)

        # Softmax over the model axis. Only the leading batch axis is
        # squeezed so that an input with a single row keeps its 2-D shape.
        weights = torch.nn.functional.softmax(fc_out, dim=2).squeeze(0)

        # Take the model with the max weight in every row and return the
        # corresponding value. argmax picks exactly one model per row, so
        # tied weights no longer add several model values together and a
        # NaN in a non-selected model no longer leaks into the result.
        best = torch.argmax(weights, dim=1, keepdim=True)
        preds = torch.gather(x, 1, best).squeeze(1)

        return preds


In [14]:
def calculate_metrics_multi(args_list, model_outputs, y_obs_list, ensemble_type='max', out_dir=None):
    """
    Calculate stats for a multimodel ensemble.

    For every model in `args_list` the per-basin statistics are computed with
    `statError`. The per-basin values of every statistic are then merged
    across models according to `ensemble_type` and summarised by their
    median, standard deviation and mean.

    Args:
        args_list: dict mapping model name to its configuration. Supported
            names are 'SACSMA', 'SACSMA_w_snow', 'marrmot_PRMS', 'hbv'
            (batched outputs) and 'hbvhy', 'hbvhy_dynamic' (precomputed
            prediction array).
        model_outputs: dict mapping model name to its predictions. For the
            batched models this is a list of dicts of tensors (one per
            batch); for the hbvhy models an array indexed as [:, :, 0].
        y_obs_list: dict mapping model name to its observations.
        ensemble_type: 'max', 'avg', 'median' or 'softmax'; only consulted
            when more than one model is given.
        out_dir: optional root directory; when given, intermediate arrays
            and the summary table are written below it.

    Returns:
        (stats_list, mdstd): the per-model list of `statError` dicts and a
        DataFrame indexed by statistic with columns median/STD/mean.

    Raises:
        ValueError: if `args_list` is empty, contains an unsupported model
            name, or `ensemble_type` is not recognised.

    NOTE(review): the merge is applied uniformly to every statistic, so
    'max' also takes the largest value of error metrics such as RMSE --
    confirm this is intended.
    """
    if not args_list:
        raise ValueError("`args_list` must contain at least one model.")

    stats_list = dict()

    for mod in args_list:
        args = args_list[mod]
        mod_out = model_outputs[mod]
        y_obs = y_obs_list[mod]

        if mod in ['SACSMA', 'SACSMA_w_snow', 'marrmot_PRMS', 'hbv']:
            # Note for hydrodl HBV, calculations have already been done,
            # so skip this step.

            # Saving data
            if out_dir:
                # Join the components separately so the path is valid on
                # every OS (a literal "\\" is only a separator on Windows).
                path = os.path.join(out_dir, "models", "671_sites_dp", mod)
                os.makedirs(path, exist_ok=True)

                # Test data (obs and model results).
                for key in mod_out[0].keys():
                    # 3-D outputs are concatenated along axis 1, all others
                    # along axis 0.
                    dim = 1 if len(mod_out[0][key].shape) == 3 else 0
                    concatenated_tensor = torch.cat([d[key] for d in mod_out], dim=dim)
                    file_name = key + ".npy"
                    np.save(os.path.join(path, file_name), concatenated_tensor.numpy())
                    # np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), concatenated_tensor.numpy())

                # Reading and flow observations.
                print(args['target'])
                for var in args["target"]:
                    item_obs = y_obs[:, :, args["target"].index(var)]
                    file_name = var + ".npy"
                    np.save(os.path.join(path, file_name), item_obs)
                    # np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), item_obs)

            ###################### calculations here ######################
            predLst = list()
            obsLst = list()
            flow_sim = torch.cat([d["flow_sim"] for d in mod_out], dim=1)
            flow_obs = y_obs[:, :, args["target"].index("00060_Mean")]
            predLst.append(flow_sim.numpy())
            obsLst.append(np.expand_dims(flow_obs, 2))

            # if args["temp_model_name"] != "None":
            #     temp_sim = torch.cat([d["temp_sim"] for d in mod_out], dim=1)
            #     temp_obs = y_obs[:, :, args["target"].index("00010_Mean")]
            #     predLst.append(temp_sim.numpy())
            #     obsLst.append(np.expand_dims(temp_obs, 2))

            # we need to swap axes here to have [basin, days], and remove redundant
            # dimensions with np.squeeze().
            stats_list[mod] = [
                statError(np.swapaxes(x.squeeze(), 1, 0), np.swapaxes(y.squeeze(), 1, 0))
                for (x, y) in zip(predLst, obsLst)
            ]
        elif mod in ['hbvhy', 'hbvhy_dynamic']:
            stats_list[mod] = [statError(mod_out[:,:,0], y_obs.squeeze())]
        else:
            raise ValueError("Unsupported model type in `models`.")

    model_names = list(args_list)
    multi_model = len(model_names) > 1

    # Calculating final statistics for the whole set of basins.
    # The statistics of the last model serve as the template for the set of
    # targets and metric names.
    name_list = ["flow", "temp"]
    for st, name in zip(stats_list[model_names[-1]], name_list):
        rows = []
        for key in st.keys():
            # st contains the statistics on a model run like NSE and KGE.
            # Find the best result (e.g., the max, avg, median) and merge from each model.
            if multi_model:
                # One column per model; each row holds the values of `key`
                # for one basin.
                temp = np.stack(
                    [stats_list[m][0][key] for m in model_names], axis=1
                )
                if ensemble_type == 'max':
                    temp = np.amax(temp, axis=1)
                elif ensemble_type == 'avg':
                    temp = np.mean(temp, axis=1)
                elif ensemble_type == 'median':
                    temp = np.median(temp, axis=1)
                elif ensemble_type == 'softmax':
                    # # Softmax gets relative contributions of each model.
                    # weights = torch.nn.functional.softmax(torch.from_numpy(temp), dim=1)
                    # temp = np.sum(temp * weights.numpy(), axis=1)

                    # Instantiate weighting lstm with softmax.
                    lstm = hydroEnsemble(num_models=len(model_names), hidden_size=192, num_layers=3)
                    # Forward pass through the model; convert back to numpy
                    # for the summary statistics below.
                    temp = lstm(torch.tensor(temp, dtype=torch.float)).detach().numpy()
                else:
                    raise ValueError("Invalid model ensemble type specified.")
            else:
                # Single model: use its per-basin values directly.
                temp = stats_list[model_names[0]][0][key]

            rows.append([np.nanmedian(temp), np.nanstd(temp), np.nanmean(temp)])

        # mdstd displays the statistics for each error measure in stats_list.
        mdstd = pd.DataFrame(
            np.array(rows).reshape(len(st), 3),
            index=st.keys(),
            columns=["median", "STD", "mean"],
        )
        # Save the data stats from the training run:
        if out_dir and multi_model:
            path = os.path.join(out_dir, "multimodels", "671_sites_dp", "n_" + ensemble_type)
            os.makedirs(path, exist_ok=True)

            mdstd.to_csv((os.path.join(path, "mdstd_" + name + ensemble_type +".csv")))
        elif out_dir:
            # `args_list` is a dict keyed by model name, so the single
            # model's name (not index 0) identifies its output directory.
            path = os.path.join(out_dir, "models", "671_sites_dp", model_names[0])
            os.makedirs(path, exist_ok=True)

            mdstd.to_csv((os.path.join(path, "mdstd_" + name +".csv")))

    # Show boxplots of the results
    # plt.rcParams["font.size"] = 14
    # keyLst = ["Bias", "RMSE", "ubRMSE", "NSE", "Corr"]
    # dataBox = list()
    # for iS in range(len(keyLst)):
    #     statStr = keyLst[iS]
    #     temp = list()
    #     # for k in range(len(st)):
    #     data = st[statStr]
    #     data = data[~np.isnan(data)]
    #     temp.append(data)
    #     dataBox.append(temp)
    # labelname = [
    #     "Hybrid differentiable model"
    # ]  # ['STA:316,batch158', 'STA:156,batch156', 'STA:1032,batch516']   # ['LSTM-34 Basin']

    # xlabel = ["Bias ($\mathregular{deg}$C)", "RMSE", "ubRMSE", "NSE", "Corr"]
    # fig = plot.plotBoxFig(
    #     dataBox, xlabel, label2=labelname, sharey=False, figsize=(16, 8)
    # )
    # fig.patch.set_facecolor("white")
    # boxPlotName = "PGML"
    # fig.suptitle(boxPlotName, fontsize=12)
    # plt.rcParams["font.size"] = 12
    # # plt.savefig(
    # #     os.path.join(args["out_dir"], args["testing_dir"], "Box_" + name + ".png")
    # # )  # , dpi=500
    # # fig.show()
    # plt.close()

    torch.cuda.empty_cache()
    print("Testing ended")

    return stats_list, mdstd

In [15]:
# models = {'SACSMA':None, 'marrmot_PRMS':None}  # 'hbv':None
# Model name -> configuration mapping evaluated in this cell.
args_list = {'hbvhy_dynamic': hbvhyArgs_d, 'SACSMA_w_snow':sacsmaSnowArgs, 'marrmot_PRMS':prmsArgs}   # 'hbvhy': hbvhyArgs, 'hbv' : hbvArgs, 'SACSMA_w_snow':None, 'SACSMA': sacsmaArgs,
ENSEMBLE_TYPE = 'avg'  # 'median', 'avg', 'max', 'softmax'
# `out_dir` is not passed, so its default (None) applies and nothing is saved.
stats_list, mtstd = calculate_metrics_multi(args_list, model_outputs=model_output, y_obs_list=y_obs, ensemble_type=ENSEMBLE_TYPE)

# Display the "median" column of the summary table (one value per statistic).
mtstd['median']

Testing ended


Bias           0.023276
RMSE           1.154326
ubRMSE         1.136360
Corr           0.858483
CorrSp         0.847346
R2             0.707425
NSE            0.707425
FLV           45.150305
FHV           -6.803808
PBias          2.927847
PBiasother     5.308928
KGE            0.743327
KGE12          0.737234
fdcRMSE        1.024379
lowRMSE        0.075245
highRMSE       2.522294
midRMSE        0.237547
Name: median, dtype: float64

----

### Getting HBV Model Data

----


In [None]:
import sys
sys.path.append('../../')
from hydroDL import master, utils
from hydroDL.data import camels
from hydroDL.master import loadModel
from hydroDL.model import train
from hydroDL.post import plot, stat

import os
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import random
import json
import datetime as dt


## fix the random seeds
# The same seed is applied to Python's `random`, NumPy and PyTorch (CPU and CUDA).
randomseed = 111111
random.seed(randomseed)
torch.manual_seed(randomseed)
np.random.seed(randomseed)
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## GPU setting
# NOTE(review): presumably requires a CUDA-capable runtime -- confirm before
# running on a CPU-only machine.
testgpuid = 0
torch.cuda.set_device(testgpuid)

## setting options, keep the same as your training
PUOpt = 0  # 0 for All; 1 for PUB; 2 for PUR;
buffOptOri = 0  # original buffOpt, must be same as what you set for training
buffOpt = 0  # control load training data 0: do nothing; 1: repeat first year; 2: load one more year
forType = 'daymet'

## Hyperparameters, keep the same as your training setup
# Several of these values are embedded in the experiment directory name used
# to locate the trained model.
BATCH_SIZE = 100
RHO = 365
HIDDENSIZE = 256
Ttrain = [19801001, 19951001]  # Training period
# Ttrain = [19891001, 19991001]  # PUB/PUR period
Tinv = [19801001, 19951001] # dPL Inversion period
# Tinv = [19891001, 19991001]  # PUB/PUR period
Nfea = 12 # number of HBV parameters
BUFFTIME = 365
routing = True
Nmul = 16
comprout = False
compwts = False
pcorr = None

Ttest = [19951001, 20101001]  # testing period
TtestLst = utils.time.tRange2Array(Ttest)
TtestLoad = [19951001, 20101001]

testbatch = 50  # forward number of "testbatch" basins each time to save GPU memory. You can set this even smaller to save more.
testepoch = 50

testseed = 111111

In [None]:
# Define root directory of database and saved output dir
# Modify this based on your own location of CAMELS dataset and saved models
# Raw strings keep the Windows backslashes literal without relying on
# unrecognised escape sequences such as "\d" or "\m".
rootDatabase = os.path.join(os.path.sep, r'D:\data', 'Camels')  # CAMELS dataset root directory
camels.initcamels(rootDatabase)  # initialize three camels module-scope variables in camels.py: dirDB, gageDict, statDict

rootOut = os.path.join(os.path.sep, r'D:\data\model_runs', 'rnnStreamflow')  # Model output root directory

# CAMELS basin info
gageinfo = camels.gageDict
hucinfo = gageinfo['huc']
gageid = gageinfo['id']
gageidLst = gageid.tolist()

# same as training, load data based on ALL, PUB, PUR scenarios
# `tarIDLst` ends up as a list of basin-ID lists, one per test fold.
if PUOpt == 0: # for All the basins
    puN = 'ALL'
    tarIDLst = [gageidLst]

elif PUOpt == 1: # for PUB
    puN = 'PUB'
    # load the subset ID
    # splitPath saves the basin ID of random groups
    splitPath = 'PUBsplitLst.txt'
    with open(splitPath, 'r') as fp:
        testIDLst=json.load(fp)
    tarIDLst = testIDLst

elif PUOpt == 2: # for PUR
    puN = 'PUR'
    # Divide CAMELS dataset into 7 PUR regions
    # get the id list of each region
    regionID = list()
    regionNum = list()
    regionDivide = [ [1,2], [3,6], [4,5,7], [9,10], [8,11,12,13], [14,15,16,18], [17] ] # seven regions
    for tempcomb in regionDivide:
        # Collect the IDs of every basin whose HUC belongs to this region.
        tempregid = list()
        for ih in tempcomb:
            tempid = gageid[hucinfo==ih].tolist()
            tempregid = tempregid + tempid
        regionID.append(tempregid)
        regionNum.append(len(tempregid))
    tarIDLst = regionID     # List of all basin ID's in the study (671 for full camels).

# define the matrix to save results
# Predictions hold 5 variables per basin and day, observations hold 1.
predtestALL = np.full([len(gageid), len(TtestLst), 5], np.nan)
obstestALL = np.full([len(gageid), len(TtestLst), 1], np.nan)

# this testsave_path should be consistent with where you save your model
testsave_path = 'CAMELSDemo/dPLHBV/' + puN + '/Testforc/' + forType + '/BuffOpt' + str(buffOptOri) +\
                '/RMSE_para0.25/'+str(testseed)

## load data and test the model
nstart = 0  # row offset of the next fold inside the result matrices
logtestIDLst = []

In [None]:
# Determine the train/test basin IDs and indices for each fold.
# NOTE(review): as exported here only this bookkeeping is inside the loop;
# the cells below run once with the values left by the final iteration.
for ifold in range(1, len(tarIDLst)+1):
    testfold = ifold
    TestLS = tarIDLst[testfold - 1]
    # Positions of the test basins within the full gage list.
    TestInd = [gageidLst.index(j) for j in TestLS]

    # Every gage is used for the training/inversion side.
    TrainLS = gageidLst
    TrainInd = [gageidLst.index(j) for j in TrainLS]

    gageDic = {'TrainID':TrainLS, 'TestID':TestLS}

    nbasin = len(TestLS) # number of basins for testing


In [None]:
# Rebuild the experiment directory name from the training hyperparameters.
foldstr = 'Fold' + str(testfold)
exp_info = 'T_'+str(Ttrain[0])+'_'+str(Ttrain[1])+'_BS_'+str(BATCH_SIZE)+'_HS_'+str(HIDDENSIZE)\
            +'_RHO_'+str(RHO)+'_NF_'+str(Nfea)+'_Buff_'+str(BUFFTIME)+'_Mul_'+str(Nmul)
# the final path to test with the trained model saved in
testout = os.path.join(rootOut, testsave_path, foldstr, exp_info)
# Load the model saved at epoch `testepoch`.
testmodel = loadModel(testout, epoch=testepoch)
testmodel

In [None]:
# Periods actually loaded for training and inversion.
TtrainLoad = Ttrain
TinvLoad = Tinv

# Forcing variables for the HBV input (varF) and for the inversion input (varFInv).
varF = ['prcp', 'tmean']
varFInv = ['prcp', 'tmean']


# Static basin attributes requested from the CAMELS dataset.
attrnewLst = [ 'p_mean','pet_mean','p_seasonality','frac_snow','aridity','high_prec_freq','high_prec_dur',
                   'low_prec_freq','low_prec_dur', 'elev_mean', 'slope_mean', 'area_gages2', 'frac_forest', 'lai_max',
                   'lai_diff', 'gvf_max', 'gvf_diff', 'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50',
                   'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity',
                   'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 'geol_1st_class', 'glim_1st_class_frac',
                   'geol_2nd_class', 'glim_2nd_class_frac', 'carbonate_rocks_frac', 'geol_porostiy', 'geol_permeability']

# Un-normalized training-period forcings for the training basins.
dfTrain = camels.DataframeCamels(tRange=TtrainLoad, subset=TrainLS, forType=forType)
forcUN = dfTrain.getDataTs(varLst=varF, doNorm=False, rmNan=False)

In [None]:
# Un-normalized inversion-period forcings and static attributes for the training basins.
dfInv = camels.DataframeCamels(tRange=TinvLoad, subset=TrainLS, forType=forType)
forcInvUN = dfInv.getDataTs(varLst=varFInv, doNorm=False, rmNan=False)
attrsUN = dfInv.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)

In [None]:
# Un-normalized test-period forcings, observations and attributes for the test basins.
dfTest = camels.DataframeCamels(tRange=TtestLoad, subset=TestLS, forType=forType)
forcTestUN = dfTest.getDataTs(varLst=varF, doNorm=False, rmNan=False)
obsTestUN = dfTest.getDataObs(doNorm=False, rmNan=False, basinnorm=False)
attrsTestUN = dfTest.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)

In [None]:
# Quick sanity check of the loaded sizes: first and second dimension of the
# observations plus the number of requested attributes.
len(obsTestUN), len(attrnewLst), len(obsTestUN[0])

In [None]:
# Scale the observations by basin area.
# NOTE(review): the constants suggest a ft3/s -> mm/day conversion
# (0.0283168 m3 per ft3, 86400 s per day, km2 -> m2, m -> mm) -- confirm.
areas = gageinfo['area'][TestInd] # unit km2
temparea = np.tile(areas[:, None, None], (1, obsTestUN.shape[1],1))
obsTestUN = (obsTestUN * 0.0283168 * 3600 * 24) / (temparea * (10 ** 6)) * 10**3

# Read the PET series ('PEVAP') of every gage from the per-forcing directory.
varLstNL = ['PEVAP']
usgsIdLst = gageid
# The PET files cover a different period depending on the forcing product.
if forType == 'maurer':
    tPETRange = [19800101, 20090101]
else:
    tPETRange = [19800101, 20150101]
tPETLst = utils.time.tRange2Array(tPETRange)
PETDir = rootDatabase + '/pet_harg/' + forType + '/'
ntime = len(tPETLst)
PETfull = np.empty([len(usgsIdLst), ntime, len(varLstNL)])
for k in range(len(usgsIdLst)):
    dataTemp = camels.readcsvGage(PETDir, usgsIdLst[k], varLstNL, ntime)
    PETfull[k, :, :] = dataTemp

# Cut the PET series down to the training, inversion and test periods by
# intersecting the date arrays, then select the relevant basins.
TtrainLst = utils.time.tRange2Array(TtrainLoad)
TinvLst = utils.time.tRange2Array(TinvLoad)
TtestLoadLst = utils.time.tRange2Array(TtestLoad)
C, ind1, ind2 = np.intersect1d(TtrainLst, tPETLst, return_indices=True)
PETUN = PETfull[:, ind2, :]
PETUN = PETUN[TrainInd, :, :] # select basins
C, ind1, ind2inv = np.intersect1d(TinvLst, tPETLst, return_indices=True)
PETInvUN = PETfull[:, ind2inv, :]
PETInvUN = PETInvUN[TrainInd, :, :]
C, ind1, ind2test = np.intersect1d(TtestLoadLst, tPETLst, return_indices=True)
PETTestUN = PETfull[:, ind2test, :]
PETTestUN = PETTestUN[TestInd, :, :]

# process data, do normalization and remove nan
series_inv = np.concatenate([forcInvUN, PETInvUN], axis=2)
seriesvarLst = varFInv + ['pet']
# load the saved statistics
statFile = os.path.join(testout, 'statDict.json')
with open(statFile, 'r') as fp:
    statDict = json.load(fp)

# normalize
# NaNs remaining after normalization are replaced by 0.0.
attr_norm = camels.transNormbyDic(attrsUN, attrnewLst, statDict, toNorm=True)
attr_norm[np.isnan(attr_norm)] = 0.0
series_norm = camels.transNormbyDic(series_inv, seriesvarLst, statDict, toNorm=True)
series_norm[np.isnan(series_norm)] = 0.0

attrtest_norm = camels.transNormbyDic(attrsTestUN, attrnewLst, statDict, toNorm=True)
attrtest_norm[np.isnan(attrtest_norm)] = 0.0
seriestest_inv = np.concatenate([forcTestUN, PETTestUN], axis=2)
seriestest_norm = camels.transNormbyDic(seriestest_inv, seriesvarLst, statDict, toNorm=True)
seriestest_norm[np.isnan(seriestest_norm)] = 0.0

# prepare the inputs
zTrain = series_norm
xTrain = np.concatenate([forcUN, PETUN], axis=2) # HBV forcing
xTrain[np.isnan(xTrain)] = 0.0

In [None]:
# Optionally prepend the first BUFFTIME steps of the training inputs to themselves.
if buffOpt == 1: # repeat the first year for buff
    zTrainIn = np.concatenate([zTrain[:,0:BUFFTIME,:], zTrain], axis=1)
    xTrainIn = np.concatenate([xTrain[:,0:BUFFTIME,:], xTrain], axis=1) # Bufftime for the first year
    # yTrainIn = np.concatenate([obsUN[:,0:BUFFTIME,:], obsUN], axis=1)
else: # no repeat, original data
    zTrainIn = zTrain
    xTrainIn = xTrain
    # yTrainIn = obsUN

forcTuple = (xTrainIn, zTrainIn)
attrs = attr_norm

## Prepare the testing data and forward the trained model for testing
# TestBuff = 365 # Use 365 days forcing to warm up the model for testing
TestBuff = xTrain.shape[1]  # Use the whole training period to warm up the model for testing
# TestBuff = len(TtestLoadLst) - len(TtestLst)  # use the redundantly loaded data to warm up

# prepare file name to save the testing predictions
# One file path per target variable listed in `targLst`.
filePathLst = master.master.namePred(
        testout, Ttest, 'All_Buff'+str(TestBuff), epoch=testepoch, targLst=['Qr', 'Q0', 'Q1', 'Q2', 'ET'])

In [None]:
# prepare the inputs for TESTING
if PUOpt == 0: # for ALL basins, temporal generalization test
    zTest = series_norm  # dPL inversion
    xTest = np.concatenate([forcTestUN, PETTestUN], axis=2)  # HBV forcing
    # forcings to warm up the model. Here use the forcing of training period to warm up
    xTestBuff = xTrain[:, -TestBuff:, :]
    xTest = np.concatenate([xTestBuff, xTest], axis=1)
    obs = obsTestUN[:, 0:, :]  # starts with 0 when not loading more data before testing period

else:  # for PUB and PUR cases, different testing basins. Load more forcings to warm up.
    zTest = seriestest_norm[:, 0:TestBuff, :]  # Use the warm-up period forcing as the gA input in zTest
    # zTest = seriestest_norm
    xTest = np.concatenate([forcTestUN, PETTestUN], axis=2)  # HBV forcing
    obs = obsTestUN[:, TestBuff:, :]  # exclude loaded obs in warming up period (first TestBuff days) for evaluation

# Use days of TestBuff to initialize the model
testmodel.inittime=TestBuff

# Final inputs to the test model
xTest[np.isnan(xTest)] = 0.0
attrtest = attrtest_norm
# Repeat the static attributes along the time axis so they can be
# concatenated with the forcing series.
cTemp = np.repeat(
    np.reshape(attrtest, [attrtest.shape[0], 1, attrtest.shape[-1]]), zTest.shape[1], axis=1)
zTest = np.concatenate([zTest, cTemp], 2) # Add attributes to historical forcings as the inversion part
testTuple = (xTest, zTest) # xTest: input forcings to HBV; zTest: inputs to gA LSTM to learn parameters

# forward the model and save results
train.testModel(
    testmodel, testTuple, c=None, batchSize=testbatch, filePathLst=filePathLst)

# read out the saved forward predictions
dataPred = np.ndarray([obs.shape[0], obs.shape[1], len(filePathLst)])
for k, filePath in enumerate(filePathLst):
    # `np.float` was removed from NumPy (1.24); the builtin `float` is the
    # exact equivalent.
    dataPred[:, :, k] = pd.read_csv(
        filePath, dtype=float, header=None).values
# save the predictions to the big matrix
predtestALL[nstart:nstart+nbasin, :, :] = dataPred
obstestALL[nstart:nstart+nbasin, :, :] = obs
nstart = nstart + nbasin
logtestIDLst = logtestIDLst + TestLS

In [None]:
# Inspect the predictions stored for the first basin and their length.
predtestALL[0], len(predtestALL[0])

In [None]:
## post processing
# calculate evaluation metrics
# Only the first prediction variable (index 0) is compared with the observations.
evaDict = [stat.statError(predtestALL[:,:,0], obstestALL.squeeze())]  # Q0: the streamflow

In [None]:
# Number of entries in the NSE array of the first (only) metrics dict.
len(evaDict[0]['NSE'])

In [None]:
from test_dp_HBV import test_dp_hbv

# Bind the two return values to distinct names; assigning both to
# `predtestALL` discarded the first one.
# NOTE(review): the (predictions, observations) order is taken from the
# commented-out call `model_output[mod], y_obs[mod] = test_dp_hbv()` earlier
# in this notebook -- confirm against `test_dp_hbv`.
predtestALL, obstestALL = test_dp_hbv()