In [50]:
from config.read_configurations import config_hbv as hbvArgs
from config.read_configurations import config_prms as prmsArgs
from config.read_configurations import config_sacsma as sacsmaArgs
from config.read_configurations import config_hbv_hydrodl as hbvhyArgs



import torch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats

from post import plot
from test_dp_HBV import test_dp_hbv

from core.utils.randomseed_config import randomseed_config
from core.utils.master import create_output_dirs
from MODELS.loss_functions.get_loss_function import get_lossFun
from core.data_processing.data_loading import loadData
from core.data_processing.normalization import transNorm
from core.data_processing.model import (
    take_sample_test,
    converting_flow_from_ft3_per_sec_to_mm_per_day
)

import warnings
warnings.filterwarnings("ignore")



##-----## Multi-model Parameters ##-----##
# Setting dictionaries to separately manage each diff model's attributes.
models = {'hbv': None, 'SACSMA':None, 'marrmot_PRMS':None}  # 'hbv':None
args_list = {'SACSMA': sacsmaArgs, 'marrmot_PRMS':prmsArgs,'hbv': hbvArgs}

ENSEMBLE_TYPE = 'median'  # 'avg', 'max', 'softmax'
OUT_DIR = 'D:\\data\\model_runs\\hydro_multimodel_results\\'


In [52]:
def test_differentiable_model(args, diff_model):
    """
    This function collects and outputs the model predictions and the corresponding
    observations needed to run statistical analyses.
    
    If rerunning testing in a Jupyter environment, you will need to re-import args
    as `batch_size` is overwritten in this function and will throw an error if the
    overwrite is attempted a second time.
    """
    warm_up = args["warm_up"]
    nmul = args["nmul"]
    diff_model.eval()
    # read data for test time range
    dataset_dictionary = loadData(args, trange=args["t_test"])
    np.save(os.path.join(args["out_dir"], "x.npy"), dataset_dictionary["x_NN"])  # saves with the overlap in the beginning
    # normalizing
    x_NN_scaled = transNorm(args, dataset_dictionary["x_NN"], varLst=args["varT_NN"], toNorm=True)
    c_NN_scaled = transNorm(args, dataset_dictionary["c_NN"], varLst=args["varC_NN"], toNorm=True)
    c_NN_scaled = np.repeat(np.expand_dims(c_NN_scaled, 0), x_NN_scaled.shape[0], axis=0)
    dataset_dictionary["inputs_NN_scaled"] = np.concatenate((x_NN_scaled, c_NN_scaled), axis=2)
    del x_NN_scaled, dataset_dictionary["x_NN"]
    # converting the numpy arrays to torch tensors:
    for key in dataset_dictionary.keys():
        dataset_dictionary[key] = torch.from_numpy(dataset_dictionary[key]).float()

    # args_mod = args.copy()
    args["batch_size"] = args["no_basins"] 
    nt, ngrid, nx = dataset_dictionary["inputs_NN_scaled"].shape

    # Making lists of the start and end indices of the basins for each batch.
    batch_size = args["batch_size"]
    iS = np.arange(0, ngrid, batch_size)    # Start index list.
    iE = np.append(iS[1:], ngrid)   # End.
    
    list_out_diff_model = []
    for i in tqdm(range(0, len(iS)), unit='Batch'):
        dataset_dictionary_sample = take_sample_test(args, dataset_dictionary, iS[i], iE[i])

        out_diff_model = diff_model(dataset_dictionary_sample)
        # Convert all tensors in the dictionary to CPU
        out_diff_model_cpu = {key: tensor.cpu().detach() for key, tensor in out_diff_model.items()}
        # out_diff_model_cpu = tuple(outs.cpu().detach() for outs in out_diff_model)
        list_out_diff_model.append(out_diff_model_cpu)

    # getting rid of warm-up period in observation dataset and making the dimension similar to
    # converting numpy to tensor
    # y_obs = torch.tensor(np.swapaxes(y_obs[:, warm_up:, :], 0, 1), dtype=torch.float32)
    # c_hydro_model = torch.tensor(c_hydro_model, dtype=torch.float32)
    y_obs = converting_flow_from_ft3_per_sec_to_mm_per_day(args, 
                                                           dataset_dictionary["c_NN"],
                                                           dataset_dictionary["obs"][warm_up:, :, :])
        
    return list_out_diff_model, y_obs

In [53]:
loss_funcs = dict()
model_output = dict()
y_obs = dict()

for mod in models:
    mod = str(mod)

    if mod in ['SACSMA', 'marrmot_PRMS', 'farshid_HBV']:
        randomseed_config(seed=args_list[mod]["randomseed"][0])
        # Creating output directories and adding them to args.
        args_list[mod] = create_output_dirs(args_list[mod])
        args = args_list[mod]

        loss_funcs[mod] = get_lossFun(args_list[mod])

        modelFile = os.path.join(args["out_dir"], "model_Ep" + str(args['EPOCHS']) + ".pt")        
        models[mod] = torch.load(modelFile)     # Append instanced models.

        print("Collecting predictions, observations for %s in batches of %i." %(mod, args['no_basins']))
        model_output[mod], y_obs[mod] = test_differentiable_model(args=args, 
                                                                  diff_model=models[mod])
    elif mod == 'hbv':
        print("Collecting predictions, observations for HBV.")
        model_output[mod], y_obs[mod] = test_dp_hbv()
    else:
        raise ValueError(f"Unsupported model type in `models`.")


Collecting predictions, observations for HBV.
daymet tmean was used!
Time to read usgs streamflow:  22.34215998649597
Time to read usgs streamflow:  22.323699474334717
daymet tmean was used!
Time to read usgs streamflow:  22.293091773986816
Time to read usgs streamflow:  22.706169366836548
daymet tmean was used!
Time to read usgs streamflow:  22.093195915222168
Time to read usgs streamflow:  22.04894709587097
read usgs streamflow 18.233071327209473
read master file D:\data\model_runs\rnnStreamflow\CAMELSDemo/dPLHBV/ALL/Testforc/daymet/BuffOpt0/RMSE_para0.25/111111\Fold1\T_19801001_19951001_BS_100_HS_256_RHO_365_NF_12_Buff_365_Mul_16\master.json
batch 0
batch 1
batch 2
batch 3
batch 4
batch 5
batch 6
batch 7
batch 8
batch 9
batch 10
batch 11
batch 12
batch 13


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\\\data\\\\model_runs\\\\PGML_STemp_results\\\\models\\\\PRMS_SNTEMP\\\\671_sites_dp\\LSTM_SACSMA_E50_R365_B25_H256_tr1980_1995_n2_0\\model_Ep50.pt'

In [None]:
def calFDC(data):
    # data = Ngrid * Nday
    Ngrid, Nday = data.shape
    FDC100 = np.full([Ngrid, 100], np.nan)
    for ii in range(Ngrid):
        tempdata0 = data[ii, :]
        tempdata = tempdata0[~np.isnan(tempdata0)]
        # deal with no data case for some gages
        if len(tempdata)==0:
            tempdata = np.full(Nday, 0)
        # sort from large to small
        temp_sort = np.sort(tempdata)[::-1]
        # select 100 quantile points
        Nlen = len(tempdata)
        ind = (np.arange(100)/100*Nlen).astype(int)
        FDCflow = temp_sort[ind]
        if len(FDCflow) != 100:
            raise Exception('unknown assimilation variable')
        else:
            FDC100[ii, :] = FDCflow

    return FDC100


def statError(pred, target):
    ngrid, nt = pred.shape
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
    # Bias
        Bias = np.nanmean(pred - target, axis=1)
        # RMSE
        RMSE = np.sqrt(np.nanmean((pred - target)**2, axis=1))
        # ubRMSE
        predMean = np.tile(np.nanmean(pred, axis=1), (nt, 1)).transpose()
        targetMean = np.tile(np.nanmean(target, axis=1), (nt, 1)).transpose()
        predAnom = pred - predMean
        targetAnom = target - targetMean
        ubRMSE = np.sqrt(np.nanmean((predAnom - targetAnom)**2, axis=1))
        # FDC metric
        predFDC = calFDC(pred)
        targetFDC = calFDC(target)
        FDCRMSE = np.sqrt(np.nanmean((predFDC - targetFDC) ** 2, axis=1))
    # rho R2 NSE
        Corr = np.full(ngrid, np.nan)
        CorrSp = np.full(ngrid, np.nan)
        R2 = np.full(ngrid, np.nan)
        NSE = np.full(ngrid, np.nan)
        PBiaslow = np.full(ngrid, np.nan)
        PBiashigh = np.full(ngrid, np.nan)
        PBias = np.full(ngrid, np.nan)
        PBiasother = np.full(ngrid, np.nan)
        KGE = np.full(ngrid, np.nan)
        KGE12 = np.full(ngrid, np.nan)
        RMSElow = np.full(ngrid, np.nan)
        RMSEhigh = np.full(ngrid, np.nan)
        RMSEother = np.full(ngrid, np.nan)
        for k in range(0, ngrid):
            x = pred[k, :]
            y = target[k, :]
            ind = np.where(np.logical_and(~np.isnan(x), ~np.isnan(y)))[0]
            if ind.shape[0] > 0:
                xx = x[ind]
                yy = y[ind]
                # percent bias
                PBias[k] = np.sum(xx - yy) / np.sum(yy) * 100

                # FHV the peak flows bias 2%
                # FLV the low flows bias bottom 30%, log space
                pred_sort = np.sort(xx)
                target_sort = np.sort(yy)
                indexlow = round(0.3 * len(pred_sort))
                indexhigh = round(0.98 * len(pred_sort))
                lowpred = pred_sort[:indexlow]
                highpred = pred_sort[indexhigh:]
                otherpred = pred_sort[indexlow:indexhigh]
                lowtarget = target_sort[:indexlow]
                hightarget = target_sort[indexhigh:]
                othertarget = target_sort[indexlow:indexhigh]
                PBiaslow[k] = np.sum(lowpred - lowtarget) / np.sum(lowtarget) * 100
                PBiashigh[k] = np.sum(highpred - hightarget) / np.sum(hightarget) * 100
                PBiasother[k] = np.sum(otherpred - othertarget) / np.sum(othertarget) * 100
                RMSElow[k] = np.sqrt(np.nanmean((lowpred - lowtarget)**2))
                RMSEhigh[k] = np.sqrt(np.nanmean((highpred - hightarget)**2))
                RMSEother[k] = np.sqrt(np.nanmean((otherpred - othertarget)**2))

                if ind.shape[0] > 1:
                    # Theoretically at least two points for correlation
                    Corr[k] = scipy.stats.pearsonr(xx, yy)[0]
                    CorrSp[k] = scipy.stats.spearmanr(xx, yy)[0]
                    yymean = yy.mean()
                    yystd = np.std(yy)
                    xxmean = xx.mean()
                    xxstd = np.std(xx)
                    KGE[k] = 1 - np.sqrt((Corr[k]-1)**2 + (xxstd/yystd-1)**2 + (xxmean/yymean-1)**2)
                    KGE12[k] = 1 - np.sqrt((Corr[k] - 1) ** 2 + ((xxstd*yymean)/ (yystd*xxmean) - 1) ** 2 + (xxmean / yymean - 1) ** 2)
                    SST = np.sum((yy-yymean)**2)
                    SSReg = np.sum((xx-yymean)**2)
                    SSRes = np.sum((yy-xx)**2)
                    R2[k] = 1-SSRes/SST
                    NSE[k] = 1-SSRes/SST

    outDict = dict(Bias=Bias, RMSE=RMSE, ubRMSE=ubRMSE, Corr=Corr, CorrSp=CorrSp, R2=R2, NSE=NSE,
                   FLV=PBiaslow, FHV=PBiashigh, PBias=PBias, PBiasother=PBiasother, KGE=KGE, KGE12=KGE12, fdcRMSE=FDCRMSE,
                   lowRMSE=RMSElow, highRMSE=RMSEhigh, midRMSE=RMSEother)
    
    return outDict

In [None]:
def calculate_metrics_multi(args_list, model_outputs, y_obs_list, ensemble_type='max', out_dir=None):
    """
    Calculate stats for a multimodel ensemble.
    """
    stats_list = dict()

    for mod in args_list:
        args = args_list[mod]
        mod_out = model_outputs[mod]
        y_obs = y_obs_list[mod]

        if mod in ['SACSMA', 'marrmot_PRMS', 'farshid_hbv']:
            # Note for hydrodl HBV, calculations have already been done,
            # so skip this step.
            
            # Saving data            
            if out_dir:
                path = os.path.join(out_dir,"models\\671_sites_dp\\" + mod + "\\")
                if not os.path.exists(path):
                    os.makedirs(path, exist_ok=True)

                # Test data (obs and model results).
                for key in mod_out[0].keys():
                    if len(mod_out[0][key].shape) == 3:
                        dim = 1
                    else:
                        dim = 0
                    concatenated_tensor = torch.cat([d[key] for d in mod_out], dim=dim)
                    file_name = key + ".npy"
                    np.save(os.path.join(path, file_name), concatenated_tensor.numpy())
                    # np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), concatenated_tensor.numpy())

                # Reading and flow observations.
                print(args['target'])
                for var in args["target"]:
                    item_obs = y_obs[:, :, args["target"].index(var)]
                    file_name = var + ".npy"
                    np.save(os.path.join(path, file_name), item_obs)
                    # np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), item_obs)


            ###################### calculations here ######################
            predLst = list()
            obsLst = list()
            flow_sim = torch.cat([d["flow_sim"] for d in mod_out], dim=1)
            flow_obs = y_obs[:, :, args["target"].index("00060_Mean")]
            predLst.append(flow_sim.numpy())
            obsLst.append(np.expand_dims(flow_obs, 2))

            # if args["temp_model_name"] != "None":
            #     temp_sim = torch.cat([d["temp_sim"] for d in mod_out], dim=1)
            #     temp_obs = y_obs[:, :, args["target"].index("00010_Mean")]
            #     predLst.append(temp_sim.numpy())
            #     obsLst.append(np.expand_dims(temp_obs, 2))

            # we need to swap axes here to have [basin, days], and remove redundant 
            # dimensions with np.squeeze().
            stats_list[mod] = [
                statError(np.swapaxes(x.squeeze(), 1, 0), np.swapaxes(y.squeeze(), 1, 0))
                for (x, y) in zip(predLst, obsLst)
            ]
        elif mod == 'hbv':
            stats_list[mod] = [statError(mod_out[:,:,0], y_obs.squeeze())]
        else:
            raise ValueError(f"Unsupported model type in `models`.")

    # Calculating final statistics for the whole set of basins.
    name_list = ["flow", "temp"]
    for st, name in zip(stats_list[mod], name_list):
        count = 0
        mdstd = np.zeros([len(st), 3])
        for key in st.keys():
            # Find the best result (e.g., the max, avg, median) and merge from each model.
            for i, mod in enumerate(args_list):
                if i == 0:
                    temp = stats_list[mod][0][key]
                    continue
                elif i == 1:
                    temp = np.stack((temp, stats_list[mod][0][key]), axis=1)
                else:
                    temp = np.hstack((temp, stats_list[mod][0][key].reshape(-1,1)))

            if len(args_list) > 1:
                if ensemble_type == 'max':
                    # print(temp, key)
                    temp = np.amax(temp, axis=1)
                    # print(temp, key)
                elif ensemble_type == 'avg':
                    temp = np.mean(temp, axis=1)
                elif ensemble_type == 'median':
                    temp = np.median(temp, axis=1)
                else:
                    raise ValueError("Invalid model ensemble type specified.")
                
            median = np.nanmedian(temp)  # abs(i)
            std = np.nanstd(temp)  # abs(i)
            mean = np.nanmean(temp)  # abs(i)
            k = np.array([[median, std, mean]])
            mdstd[count] = k
            count = count + 1
            
        # mdstd displays the statistics for each error measure in stats_list.
        mdstd = pd.DataFrame(
            mdstd, index=st.keys(), columns=["median", "STD", "mean"]
        )
        # Save the data stats from the training run:
        if out_dir and len(args_list) > 1:
            path = os.path.join(out_dir, "multimodels\\671_sites_dp\\n_" + ensemble_type + "\\")
            if not os.path.exists(path):
                os.makedirs(path, exist_ok=True)
                    
            mdstd.to_csv((os.path.join(path, "mdstd_" + name + ensemble_type +".csv")))
        elif out_dir:
            path = os.path.join(out_dir, "models\\671_sites_dp\\" + args_list[0] + "\\")
            if not os.path.exists(path):
                os.makedirs(path, exist_ok=True)

            mdstd.to_csv((os.path.join(path, "mdstd_" + name +".csv")))
        else: continue

     # Show boxplots of the results
    plt.rcParams["font.size"] = 14
    keyLst = ["Bias", "RMSE", "ubRMSE", "NSE", "Corr"]
    dataBox = list()
    for iS in range(len(keyLst)):
        statStr = keyLst[iS]
        temp = list()
        # for k in range(len(st)):
        data = st[statStr]
        data = data[~np.isnan(data)]
        temp.append(data)
        dataBox.append(temp)
    labelname = [
        "Hybrid differentiable model"
    ]  # ['STA:316,batch158', 'STA:156,batch156', 'STA:1032,batch516']   # ['LSTM-34 Basin']

    xlabel = ["Bias ($\mathregular{deg}$C)", "RMSE", "ubRMSE", "NSE", "Corr"]
    fig = plot.plotBoxFig(
        dataBox, xlabel, label2=labelname, sharey=False, figsize=(16, 8)
    )
    fig.patch.set_facecolor("white")
    boxPlotName = "PGML"
    fig.suptitle(boxPlotName, fontsize=12)
    plt.rcParams["font.size"] = 12
    # plt.savefig(
    #     os.path.join(args["out_dir"], args["testing_dir"], "Box_" + name + ".png")
    # )  # , dpi=500
    # fig.show()
    plt.close()

    torch.cuda.empty_cache()
    # print("Testing ended")

    return stats_list, mdstd

In [None]:
# models = {'SACSMA':None, 'marrmot_PRMS':None}  # 'hbv':None
# args_list = {'marrmot_PRMS':prmsArgs,'hbv': hbvArgs}

stats_list, mtstd = calculate_metrics_multi(args_list, model_outputs=model_output, y_obs_list=y_obs, ensemble_type=ENSEMBLE_TYPE, out_dir=OUT_DIR)


['00060_Mean']
['00060_Mean']


In [31]:
mtstd['median'], "SAC"

(Bias           0.023638
 RMSE           1.375866
 ubRMSE         1.364707
 Corr           0.766663
 CorrSp         0.789618
 R2             0.531261
 NSE            0.531261
 FLV           82.221192
 FHV          -14.811704
 PBias          2.451530
 PBiasother     4.070650
 KGE            0.633087
 KGE12          0.620712
 fdcRMSE        1.362580
 lowRMSE        0.105923
 highRMSE       3.322313
 midRMSE        0.311551
 Name: median, dtype: float64,
 'SAC')

In [33]:
mtstd['median'], "PRMS"

(Bias          -0.012955
 RMSE           1.187929
 ubRMSE         1.181253
 Corr           0.845166
 CorrSp         0.808476
 R2             0.684968
 NSE            0.684968
 FLV           16.443141
 FHV           -8.544982
 PBias         -1.679518
 PBiasother    -0.010780
 KGE            0.750555
 KGE12          0.762107
 fdcRMSE        0.967566
 lowRMSE        0.056080
 highRMSE       2.605308
 midRMSE        0.207460
 Name: median, dtype: float64,
 'PRMS')

In [38]:
mtstd['median'], 'HBV'

(Bias           0.030976
 RMSE           1.186985
 ubRMSE         1.171042
 Corr           0.856491
 CorrSp         0.854078
 R2             0.713660
 NSE            0.713660
 FLV           51.890851
 FHV           -8.908644
 PBias          3.470816
 PBiasother     6.365034
 KGE            0.734564
 KGE12          0.728974
 fdcRMSE        1.057993
 lowRMSE        0.076760
 highRMSE       2.656486
 midRMSE        0.228451
 Name: median, dtype: float64,
 'HBV')

In [42]:
mtstd['median'], 'all'

(Bias            0.091692
 RMSE            1.415431
 ubRMSE          1.406244
 Corr            0.861052
 CorrSp          0.859163
 R2              0.726301
 NSE             0.726301
 FLV           130.182993
 FHV            -2.363760
 PBias          10.018780
 PBiasother     12.789425
 KGE             0.778676
 KGE12           0.784603
 fdcRMSE         1.661377
 lowRMSE         0.171308
 highRMSE        4.083405
 midRMSE         0.357401
 Name: median, dtype: float64,
 'all')

In [24]:
mtstd['median'], 'avg all'

(Bias           0.013683
 RMSE           1.184225
 ubRMSE         1.174306
 Corr           0.850140
 CorrSp         0.832750
 R2             0.698096
 NSE            0.698096
 FLV           41.489862
 FHV           -8.064903
 PBias          1.489761
 PBiasother     3.476490
 KGE            0.742377
 KGE12          0.744687
 fdcRMSE        1.058350
 lowRMSE        0.073452
 highRMSE       2.618986
 midRMSE        0.221791
 Name: median, dtype: float64,
 'avg all')

In [13]:
model_output['hbv'][0]

array([[1.3000539e-02, 0.0000000e+00, 7.9583000e-03, 2.9776424e-01,
        1.7031918e+00],
       [5.8530558e-02, 0.0000000e+00, 1.3215601e-03, 2.7569354e-01,
        1.6276169e+00],
       [1.1220434e-01, 0.0000000e+00, 0.0000000e+00, 2.5547340e-01,
        1.3423388e+00],
       ...,
       [1.3872437e+00, 0.0000000e+00, 8.0913790e-01, 1.4040515e+00,
        1.8904784e+00],
       [1.5643985e+00, 0.0000000e+00, 1.0355383e+00, 1.5936201e+00,
        1.9796518e+00],
       [1.8524745e+00, 8.7666190e-02, 2.0063870e+00, 1.7806802e+00,
        2.0639358e+00]])

In [11]:
model_output['marrmot_PRMS'][0].keys()

dict_keys(['flow_sim', 'srflow', 'ssflow', 'gwflow', 'sink', 'PET_hydro', 'AET_hydro', 'BFI_sim'])

In [16]:
args["testing_dir"]

'ts1995_2010'

In [15]:
for key in model_output['marrmot_PRMS'][0].keys():
    if len(model_output['marrmot_PRMS'][0][key].shape) == 3:
        dim = 1
    else:
        dim = 0
    concatenated_tensor = torch.cat([d[key] for d in model_output], dim=dim)
    file_name = key + ".npy"
    np.save(os.path.join(SAVE_PATH, args["testing_dir"], file_name), concatenated_tensor.numpy())

TypeError: string indices must be integers

In [None]:
def save_outputs(args, list_out_diff_model, y_obs, calculate_metrics=True):
    for key in list_out_diff_model[0].keys():
        if len(list_out_diff_model[0][key].shape) == 3:
            dim = 1
        else:
            dim = 0
        concatenated_tensor = torch.cat([d[key] for d in list_out_diff_model], dim=dim)
        file_name = key + ".npy"
        np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), concatenated_tensor.numpy())

    # Reading flow observation
    for var in args["target"]:
        item_obs = y_obs[:, :, args["target"].index(var)]
        file_name = var + ".npy"
        np.save(os.path.join(args["out_dir"], args["testing_dir"], file_name), item_obs)




















    if calculate_metrics == True:
        predLst = list()
        obsLst = list()
        flow_sim = torch.cat([d["flow_sim"] for d in list_out_diff_model], dim=1)
        flow_obs = y_obs[:, :, args["target"].index("00060_Mean")]
        predLst.append(flow_sim.numpy())
        obsLst.append(np.expand_dims(flow_obs, 2))
        if args["temp_model_name"] != "None":
            temp_sim = torch.cat([d["temp_sim"] for d in list_out_diff_model], dim=1)
            temp_obs = y_obs[:, :, args["target"].index("00010_Mean")]
            predLst.append(temp_sim.numpy())
            obsLst.append(np.expand_dims(temp_obs, 2))
        # we need to swap axes here to have [basin, days]
        statDictLst = [
            stat.statError(np.swapaxes(x.squeeze(), 1, 0), np.swapaxes(y.squeeze(), 1, 0))
            for (x, y) in zip(predLst, obsLst)
        ]
        ### save this file
        # median and STD calculation
        name_list = ["flow", "temp"]
        for st, name in zip(statDictLst, name_list):
            count = 0
            mdstd = np.zeros([len(st), 3])
            for key in st.keys():
                median = np.nanmedian(st[key])  # abs(i)
                STD = np.nanstd(st[key])  # abs(i)
                mean = np.nanmean(st[key])  # abs(i)
                k = np.array([[median, STD, mean]])
                mdstd[count] = k
                count = count + 1
            mdstd = pd.DataFrame(
                mdstd, index=st.keys(), columns=["median", "STD", "mean"]
            )
            mdstd.to_csv((os.path.join(args["out_dir"], args["testing_dir"], "mdstd_" + name + ".csv")))

            # Show boxplots of the results
            plt.rcParams["font.size"] = 14
            keyLst = ["Bias", "RMSE", "ubRMSE", "NSE", "Corr"]
            dataBox = list()
            for iS in range(len(keyLst)):
                statStr = keyLst[iS]
                temp = list()
                # for k in range(len(st)):
                data = st[statStr]
                data = data[~np.isnan(data)]
                temp.append(data)
                dataBox.append(temp)
            labelname = [
                "Hybrid differentiable model"
            ]  # ['STA:316,batch158', 'STA:156,batch156', 'STA:1032,batch516']   # ['LSTM-34 Basin']

            xlabel = ["Bias ($\mathregular{deg}$C)", "RMSE", "ubRMSE", "NSE", "Corr"]
            fig = plot.plotBoxFig(
                dataBox, xlabel, label2=labelname, sharey=False, figsize=(16, 8)
            )
            fig.patch.set_facecolor("white")
            boxPlotName = "PGML"
            fig.suptitle(boxPlotName, fontsize=12)
            plt.rcParams["font.size"] = 12
            plt.savefig(
                os.path.join(args["out_dir"], args["testing_dir"], "Box_" + name + ".png")
            )  # , dpi=500
            # fig.show()
            plt.close()


In [211]:
stats_list['SACSMA'][0].keys()

dict_keys(['Bias', 'RMSE', 'ubRMSE', 'Corr', 'CorrSp', 'R2', 'NSE', 'FLV', 'FHV', 'PBias', 'PBiasother', 'KGE', 'KGE12', 'fdcRMSE', 'lowRMSE', 'highRMSE', 'midRMSE'])

In [205]:
array1 = np.array([[1, 2],
                   [3, 4],
                   [5, 6]])

# Create a 1x3 array
a2 = np.array([7, 8, 9])

a = np.stack((a2, a2*2), axis=1)
# print(a)

b = np.hstack((a, a2.reshape(-1,1)))


b, np.amax(b, axis=1)




# def calc_nse(pred, target):
#     """
#     Currently returns the overall nse per basin.

#     Note: modify this to allow per day per basin as well.
#     """
#     # ngrid: number of basins
#     # nt: number of timesteps (in days usually)
#     ngrid, nt = pred.shape
#     NSE = np.full(ngrid, np.nan)

#     print(len(pred[670,:]), len(pred))
#     for k in range(0, ngrid):
#         x = pred[k, :]
#         y = target[k, :]
#         ind = np.where(np.logical_and(~np.isnan(x), ~np.isnan(y)))[0]
#         if ind.shape[0] > 0:
#             xx = x[ind]
#             yy = y[ind]

#             if ind.shape[0] > 1:
#                 yymean = yy.mean()
            
#                 SST = np.sum((yy-yymean)**2)
#                 SSRes = np.sum((yy-xx)**2)
#                 NSE[k] = 1-SSRes/SST

#     return NSE



# for i, (x,y) in enumerate(zip(preds, obs)):
#     # print(i)
#     # print(x.shape)
#     nse = calc_nse(np.swapaxes(x.squeeze(), 1, 0), np.swapaxes(y.squeeze(), 1, 0))

(array([[ 7, 14,  7],
        [ 8, 16,  8],
        [ 9, 18,  9]]),
 array([14, 16, 18]))

----

### Getting HBV Model Data

----


In [1]:
import sys
sys.path.append('../../')
from hydroDL import master, utils
from hydroDL.data import camels
from hydroDL.master import loadModel
from hydroDL.model import train
from hydroDL.post import plot, stat

import os
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import random
import json
import datetime as dt


## fix the random seeds
randomseed = 111111
random.seed(randomseed)
torch.manual_seed(randomseed)
np.random.seed(randomseed)
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## GPU setting
testgpuid = 0
torch.cuda.set_device(testgpuid)

## setting options, keep the same as your training
PUOpt = 0  # 0 for All; 1 for PUB; 2 for PUR;
buffOptOri = 0  # original buffOpt, must be same as what you set for training
buffOpt = 0  # control load training data 0: do nothing; 1: repeat first year; 2: load one more year
forType = 'daymet'

## Hyperparameters, keep the same as your training setup
BATCH_SIZE = 100
RHO = 365
HIDDENSIZE = 256
Ttrain = [19801001, 19951001]  # Training period
# Ttrain = [19891001, 19991001]  # PUB/PUR period
Tinv = [19801001, 19951001] # dPL Inversion period
# Tinv = [19891001, 19991001]  # PUB/PUR period
Nfea = 12 # number of HBV parameters
BUFFTIME = 365
routing = True
Nmul = 16
comprout = False
compwts = False
pcorr = None

Ttest = [19951001, 20101001]  # testing period
TtestLst = utils.time.tRange2Array(Ttest)
TtestLoad = [19951001, 20101001]  

testbatch = 50  # forward number of "testbatch" basins each time to save GPU memory. You can set this even smaller to save more.
testepoch = 50

testseed = 111111

loading package hydroDL


In [2]:
# Define root directory of database and saved output dir
# Modify this based on your own location of CAMELS dataset and saved models
rootDatabase = os.path.join(os.path.sep, 'D:\data', 'Camels')  # CAMELS dataset root directory
camels.initcamels(rootDatabase)  # initialize three camels module-scope variables in camels.py: dirDB, gageDict, statDict

rootOut = os.path.join(os.path.sep, 'D:\data\model_runs', 'rnnStreamflow')  # Model output root directory

# CAMLES basin info
gageinfo = camels.gageDict
hucinfo = gageinfo['huc']
gageid = gageinfo['id']
gageidLst = gageid.tolist()

# same as training, load data based on ALL, PUB, PUR scenarios
if PUOpt == 0: # for All the basins
    puN = 'ALL'
    tarIDLst = [gageidLst]

elif PUOpt == 1: # for PUB
    puN = 'PUB'
    # load the subset ID
    # splitPath saves the basin ID of random groups
    splitPath = 'PUBsplitLst.txt'
    with open(splitPath, 'r') as fp:
        testIDLst=json.load(fp)
    tarIDLst = testIDLst

elif PUOpt == 2: # for PUR
    puN = 'PUR'
    # Divide CAMELS dataset into 7 PUR regions
    # get the id list of each region
    regionID = list()
    regionNum = list()
    regionDivide = [ [1,2], [3,6], [4,5,7], [9,10], [8,11,12,13], [14,15,16,18], [17] ] # seven regions
    for ii in range(len(regionDivide)):
        tempcomb = regionDivide[ii]
        tempregid = list()
        for ih in tempcomb:
            tempid = gageid[hucinfo==ih].tolist()
            tempregid = tempregid + tempid
        regionID.append(tempregid)
        regionNum.append(len(tempregid))
    tarIDLst = regionID     # List of all basin ID's in the study (671 for full camels).

# define the matrix to save results
predtestALL = np.full([len(gageid), len(TtestLst), 5], np.nan)
obstestALL = np.full([len(gageid), len(TtestLst), 1], np.nan)

# this testsave_path should be consistent with where you save your model
testsave_path = 'CAMELSDemo/dPLHBV/' + puN + '/Testforc/' + forType + '/BuffOpt' + str(buffOptOri) +\
                '/RMSE_para0.25/'+str(testseed)

## load data and test the model
nstart = 0
logtestIDLst = []

In [19]:
for ifold in range(1, len(tarIDLst)+1):
    testfold = ifold
    TestLS = tarIDLst[testfold - 1]
    TestInd = [gageidLst.index(j) for j in TestLS]
   
    TrainLS = gageidLst
    TrainInd = [gageidLst.index(j) for j in TrainLS]

    gageDic = {'TrainID':TrainLS, 'TestID':TestLS}

    nbasin = len(TestLS) # number of basins for testing


NameError: name 'tarIDLst' is not defined

In [5]:
foldstr = 'Fold' + str(testfold)
exp_info = 'T_'+str(Ttrain[0])+'_'+str(Ttrain[1])+'_BS_'+str(BATCH_SIZE)+'_HS_'+str(HIDDENSIZE)\
            +'_RHO_'+str(RHO)+'_NF_'+str(Nfea)+'_Buff_'+str(BUFFTIME)+'_Mul_'+str(Nmul)
# the final path to test with the trained model saved in
testout = os.path.join(rootOut, testsave_path, foldstr, exp_info)
testmodel = loadModel(testout, epoch=testepoch)
testmodel

MultiInv_HBVModel(
  (lstminv): CudnnLstmModel(
    (linearIn): Linear(in_features=38, out_features=256, bias=True)
    (lstm): CudnnLstm()
    (linearOut): Linear(in_features=256, out_features=194, bias=True)
  )
  (HBV): HBVMul()
)

In [6]:
TtrainLoad = Ttrain
TinvLoad = Tinv

varF = ['prcp', 'tmean']
varFInv = ['prcp', 'tmean']


attrnewLst = [ 'p_mean','pet_mean','p_seasonality','frac_snow','aridity','high_prec_freq','high_prec_dur',
                   'low_prec_freq','low_prec_dur', 'elev_mean', 'slope_mean', 'area_gages2', 'frac_forest', 'lai_max',
                   'lai_diff', 'gvf_max', 'gvf_diff', 'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50',
                   'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity',
                   'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 'geol_1st_class', 'glim_1st_class_frac',
                   'geol_2nd_class', 'glim_2nd_class_frac', 'carbonate_rocks_frac', 'geol_porostiy', 'geol_permeability']

dfTrain = camels.DataframeCamels(tRange=TtrainLoad, subset=TrainLS, forType=forType)
forcUN = dfTrain.getDataTs(varLst=varF, doNorm=False, rmNan=False)

daymet tmean was used!
Time to read usgs streamflow:  26.232120752334595
Time to read usgs streamflow:  20.120174407958984


In [8]:
dfInv = camels.DataframeCamels(tRange=TinvLoad, subset=TrainLS, forType=forType)
forcInvUN = dfInv.getDataTs(varLst=varFInv, doNorm=False, rmNan=False)
attrsUN = dfInv.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)

daymet tmean was used!
Time to read usgs streamflow:  20.086344957351685
Time to read usgs streamflow:  20.02952527999878


In [9]:
dfTest = camels.DataframeCamels(tRange=TtestLoad, subset=TestLS, forType=forType)
forcTestUN = dfTest.getDataTs(varLst=varF, doNorm=False, rmNan=False)
obsTestUN = dfTest.getDataObs(doNorm=False, rmNan=False, basinnorm=False)
attrsTestUN = dfTest.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)

daymet tmean was used!
Time to read usgs streamflow:  20.367339611053467
Time to read usgs streamflow:  20.11677098274231
read usgs streamflow 21.04769468307495


In [11]:
len(obsTestUN), len(attrnewLst), len(obsTestUN[0])

(671, 35, 5479)

In [15]:
areas = gageinfo['area'][TestInd] # unit km2
temparea = np.tile(areas[:, None, None], (1, obsTestUN.shape[1],1))
obsTestUN = (obsTestUN * 0.0283168 * 3600 * 24) / (temparea * (10 ** 6)) * 10**3 

varLstNL = ['PEVAP']
usgsIdLst = gageid
if forType == 'maurer':
    tPETRange = [19800101, 20090101]
else:
    tPETRange = [19800101, 20150101]
tPETLst = utils.time.tRange2Array(tPETRange)
PETDir = rootDatabase + '/pet_harg/' + forType + '/'
ntime = len(tPETLst)
PETfull = np.empty([len(usgsIdLst), ntime, len(varLstNL)])
for k in range(len(usgsIdLst)):
    dataTemp = camels.readcsvGage(PETDir, usgsIdLst[k], varLstNL, ntime)
    PETfull[k, :, :] = dataTemp

TtrainLst = utils.time.tRange2Array(TtrainLoad)
TinvLst = utils.time.tRange2Array(TinvLoad)
TtestLoadLst = utils.time.tRange2Array(TtestLoad)
C, ind1, ind2 = np.intersect1d(TtrainLst, tPETLst, return_indices=True)
PETUN = PETfull[:, ind2, :]
PETUN = PETUN[TrainInd, :, :] # select basins
C, ind1, ind2inv = np.intersect1d(TinvLst, tPETLst, return_indices=True)
PETInvUN = PETfull[:, ind2inv, :]
PETInvUN = PETInvUN[TrainInd, :, :]
C, ind1, ind2test = np.intersect1d(TtestLoadLst, tPETLst, return_indices=True)
PETTestUN = PETfull[:, ind2test, :]
PETTestUN = PETTestUN[TestInd, :, :]

# process data, do normalization and remove nan
series_inv = np.concatenate([forcInvUN, PETInvUN], axis=2)
seriesvarLst = varFInv + ['pet']
# load the saved statistics
statFile = os.path.join(testout, 'statDict.json')
with open(statFile, 'r') as fp:
    statDict = json.load(fp)

# normalize
attr_norm = camels.transNormbyDic(attrsUN, attrnewLst, statDict, toNorm=True)
attr_norm[np.isnan(attr_norm)] = 0.0
series_norm = camels.transNormbyDic(series_inv, seriesvarLst, statDict, toNorm=True)
series_norm[np.isnan(series_norm)] = 0.0

attrtest_norm = camels.transNormbyDic(attrsTestUN, attrnewLst, statDict, toNorm=True)
attrtest_norm[np.isnan(attrtest_norm)] = 0.0
seriestest_inv = np.concatenate([forcTestUN, PETTestUN], axis=2)
seriestest_norm = camels.transNormbyDic(seriestest_inv, seriesvarLst, statDict, toNorm=True)
seriestest_norm[np.isnan(seriestest_norm)] = 0.0

# prepare the inputs
zTrain = series_norm
xTrain = np.concatenate([forcUN, PETUN], axis=2) # HBV forcing
xTrain[np.isnan(xTrain)] = 0.0

In [16]:
if buffOpt == 1: # repeat the first year for buff
    zTrainIn = np.concatenate([zTrain[:,0:BUFFTIME,:], zTrain], axis=1)
    xTrainIn = np.concatenate([xTrain[:,0:BUFFTIME,:], xTrain], axis=1) # Bufftime for the first year
    # yTrainIn = np.concatenate([obsUN[:,0:BUFFTIME,:], obsUN], axis=1)
else: # no repeat, original data
    zTrainIn = zTrain
    xTrainIn = xTrain
    # yTrainIn = obsUN

forcTuple = (xTrainIn, zTrainIn)
attrs = attr_norm

## Prepare the testing data and forward the trained model for testing
# TestBuff = 365 # Use 365 days forcing to warm up the model for testing
TestBuff = xTrain.shape[1]  # Use the whole training period to warm up the model for testing
# TestBuff = len(TtestLoadLst) - len(TtestLst)  # use the redundantly loaded data to warm up

# prepare file name to save the testing predictions
filePathLst = master.master.namePred(
        testout, Ttest, 'All_Buff'+str(TestBuff), epoch=testepoch, targLst=['Qr', 'Q0', 'Q1', 'Q2', 'ET'])

read master file D:\data\model_runs\rnnStreamflow\CAMELSDemo/dPLHBV/ALL/Testforc/daymet/BuffOpt0/RMSE_para0.25/111111\Fold1\T_19801001_19951001_BS_100_HS_256_RHO_365_NF_12_Buff_365_Mul_16\master.json


In [18]:
# prepare the inputs for TESTING
if PUOpt == 0: # for ALL basins, temporal generalization test
    zTest = series_norm  # dPL inversion
    xTest = np.concatenate([forcTestUN, PETTestUN], axis=2)  # HBV forcing
    # forcings to warm up the model. Here use the forcing of training period to warm up
    xTestBuff = xTrain[:, -TestBuff:, :]
    xTest = np.concatenate([xTestBuff, xTest], axis=1)
    obs = obsTestUN[:, 0:, :]  # starts with 0 when not loading more data before testing period

else:  # for PUB and PUR cases, different testing basins. Load more forcings to warm up.
    zTest = seriestest_norm[:, 0:TestBuff, :]  # Use the warm-up period forcing as the gA input in zTest
    # zTest = seriestest_norm
    xTest = np.concatenate([forcTestUN, PETTestUN], axis=2)  # HBV forcing
    obs = obsTestUN[:, TestBuff:, :]  # exclude loaded obs in warming up period (first TestBuff days) for evaluation

# Use days of TestBuff to initialize the model
testmodel.inittime=TestBuff

# Final inputs to the test model
xTest[np.isnan(xTest)] = 0.0
attrtest = attrtest_norm
cTemp = np.repeat(
    np.reshape(attrtest, [attrtest.shape[0], 1, attrtest.shape[-1]]), zTest.shape[1], axis=1)
zTest = np.concatenate([zTest, cTemp], 2) # Add attributes to historical forcings as the inversion part
testTuple = (xTest, zTest) # xTest: input forcings to HBV; zTest: inputs to gA LSTM to learn parameters

# forward the model and save results
train.testModel(
    testmodel, testTuple, c=None, batchSize=testbatch, filePathLst=filePathLst)

# read out the saved forward predictions
dataPred = np.ndarray([obs.shape[0], obs.shape[1], len(filePathLst)])
for k in range(len(filePathLst)):
    filePath = filePathLst[k]
    dataPred[:, :, k] = pd.read_csv(
        filePath, dtype=np.float, header=None).values
# save the predictions to the big matrix
predtestALL[nstart:nstart+nbasin, :, :] = dataPred
obstestALL[nstart:nstart+nbasin, :, :] = obs
nstart = nstart + nbasin
logtestIDLst = logtestIDLst + TestLS

batch 0


  output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(


batch 1
batch 2
batch 3
batch 4
batch 5
batch 6
batch 7
batch 8
batch 9
batch 10
batch 11
batch 12
batch 13


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  filePath, dtype=np.float, header=None).values


In [24]:
predtestALL[0], len(predtestALL[0])

(array([[1.3000539e-02, 0.0000000e+00, 7.9583000e-03, 2.9776424e-01,
         1.7031918e+00],
        [5.8530558e-02, 0.0000000e+00, 1.3215601e-03, 2.7569354e-01,
         1.6276169e+00],
        [1.1220434e-01, 0.0000000e+00, 0.0000000e+00, 2.5547340e-01,
         1.3423388e+00],
        ...,
        [1.3872437e+00, 0.0000000e+00, 8.0913790e-01, 1.4040515e+00,
         1.8904784e+00],
        [1.5643985e+00, 0.0000000e+00, 1.0355383e+00, 1.5936201e+00,
         1.9796518e+00],
        [1.8524745e+00, 8.7666190e-02, 2.0063870e+00, 1.7806802e+00,
         2.0639358e+00]]),
 5479)

In [25]:
## post processing
# calculate evaluation metrics
evaDict = [stat.statError(predtestALL[:,:,0], obstestALL.squeeze())]  # Q0: the streamflow

  PBiaslow[k] = np.sum(lowpred - lowtarget) / np.sum(lowtarget) * 100


In [30]:
len(evaDict[0]['NSE'])

671

In [1]:
from test_dp_HBV import test_dp_hbv

predtestALL, predtestALL = test_dp_hbv()

loading package hydroDL
daymet tmean was used!
Time to read usgs streamflow:  20.30001163482666
Time to read usgs streamflow:  20.10430121421814
daymet tmean was used!
Time to read usgs streamflow:  20.163031578063965
Time to read usgs streamflow:  20.467963218688965
daymet tmean was used!
Time to read usgs streamflow:  20.458739519119263
Time to read usgs streamflow:  20.346842050552368
read usgs streamflow 16.54717493057251
read master file D:\data\model_runs\rnnStreamflow\CAMELSDemo/dPLHBV/ALL/Testforc/daymet/BuffOpt0/RMSE_para0.25/111111\Fold1\T_19801001_19951001_BS_100_HS_256_RHO_365_NF_12_Buff_365_Mul_16\master.json
batch 0


  output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(


batch 1
batch 2
batch 3
batch 4
batch 5
batch 6
batch 7
batch 8
batch 9
batch 10
batch 11
batch 12
batch 13
