### Adapting ensemble code of Kamlesh 2023
*Requires cuda

---

In [1]:
from config.read_configurations import config_hbv as hbvArgs
from config.read_configurations import config_prms as prmsArgs
from config.read_configurations import config_sacsma as sacsmaArgs
from config.read_configurations import config_sacsma_snow as sacsmaSnowArgs
from config.read_configurations import config_hbv_hydrodl as hbvhyArgs_d

import torch
import os
import platform
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats
# from post import plot

from core.utils.randomseed_config import randomseed_config
from core.utils.master import create_output_dirs
from MODELS.loss_functions.get_loss_function import get_lossFun
from MODELS.test_dp_HBV_dynamic import test_dp_hbv
from core.data_processing.data_loading import loadData
from core.data_processing.normalization import transNorm
from core.data_processing.model import (
    take_sample_test,
    converting_flow_from_ft3_per_sec_to_mm_per_day
)

import warnings
warnings.filterwarnings("ignore")


## GPU setting
# which GPU to use when having multiple
# traingpuid = 0
# torch.cuda.set_device(traingpuid)



# fix the random seeds for reproducibility
def randomseed_config(seed):
    if seed == None:  # args['randomseed'] is None:
        # generate random seed
        randomseed = int(np.random.uniform(low=0, high=1e6))
        print("random seed updated!")
    else:
        print("Setting seed 0.")
        # randomseed = args['randomseed']
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # torch.use_deterministic_algorithms(True)



randomseed_config(0)



# Set path to `hydro_multimodel_results` directory.
if platform.system() == 'Darwin':
    # For mac os
    out_dir = '/Users/leoglonz/Desktop/water/data/model_runs/hydro_multimodel_results'
    # Some operations are not yet working with MPS, so we might need to set some environment variables to use CPU fall instead
    # %env PYTORCH_ENABLE_MPS_FALLBACK=1

elif platform.system() == 'Windows':
    # For windows
    out_dir = 'D:\\data\\model_runs\\hydro_multimodel_results\\'

elif platform.system() == 'Linux':
    # For Colab
    out_dir = '/content/drive/MyDrive/Colab/data/model_runs/hydro_multimodel_results'

else:
    raise ValueError('Unsupported operating system.')


##-----## Multi-model Parameters ##-----##
##--------------------------------------##
# Setting dictionaries to separately manage each diff model's attributes.
models = {'dPLHBV_dyn': None,'SACSMA_snow':None, 'marrmot_PRMS':None}  # 'HBV':None, 'hbvhy': None, 'SACSMA_snow':None, 'SACSMA':None,
args_list = {'dPLHBV_dyn': hbvhyArgs_d,'SACSMA_snow':sacsmaSnowArgs, 'marrmot_PRMS':prmsArgs}   # 'hbvhy': hbvhyArgs, 'HBV' : hbvArgs, 'SACSMA_snow':None, 'SACSMA': sacsmaArgs,
ENSEMBLE_TYPE = 'max'  # 'median', 'avg', 'max', 'softmax'

# Load test observations and predictions from a prior run.
pred_path = os.path.join(out_dir, 'multimodels', '671_sites_dp', 'hydro_preds_obs', 'preds_671_dPLHBVd_SACSMASnow_PRMS.npy')
obs_path = os.path.join(out_dir, 'multimodels', '671_sites_dp', 'hydro_preds_obs', 'obs_671_dPLHBVd_SACSMASnow_PRMS.npy')
preds = np.load(pred_path, allow_pickle=True).item()
obs = np.load(obs_path, allow_pickle=True).item()

model_output = preds
y_obs = obs

# Initialize
flow_preds = []
flow_obs = None
obs_trig = False

# Concatenate individual model predictions, and observation data.
for i, mod in enumerate(args_list):
    args = args_list[mod]
    mod_out = model_output[mod]
    y_ob = y_obs[mod]

    print(mod)

    if mod in ['HBV', 'SACSMA', 'SACSMA_snow', 'marrmot_PRMS']:
        # Hydro models are tested in batches, so we concatenate them and select
        # the desired flow.
        # Note: modified HBV already has this preparation done during testing.

        # Get flow predictions and swap axes to get shape [basins, days]
        pred = np.swapaxes(torch.cat([d["flow_sim"] for d in mod_out], dim=1).squeeze().numpy(), 0, 1)

        if obs_trig == False:
            # dPLHBV uses GAGES while the other hydro models use CAMELS data. This means small
            # e-5 variation in observation data between the two. This is averaged if both models
            # are used, but to avoid double-counting data from multiply hydro models, use a trigger.
            obs = np.swapaxes(y_ob[:, :, args["target"].index("00060_Mean")].numpy(), 0, 1)
            obs_trig = True
            dup = False
        else:
            dup = True

    elif mod in ['dPLHBV_dyn']:
        pred = mod_out[:,:,0][:,365:] # Set dim2 = 0 to get streamflow Qr
        obs = y_ob.squeeze()[:,365:]
        dup = False

    else:
        raise ValueError(f"Unsupported model type in `models`.")

    if i == 0:
        tmp_pred = pred
        tmp_obs = obs
    elif i == 1:
        tmp_pred = np.stack((tmp_pred, pred), axis=2)
        if not dup:
            # Avoid double-counting GAGES obs.
            tmp_obs = np.stack((tmp_obs, obs), axis=2)
    else:
        # Combine outputs of >3 models.
        tmp_pred = np.concatenate((tmp_pred,np.expand_dims(pred, 2)), axis=2)
        if not dup:
            # Avoid double-counting GAGES obs.
            tmp_obs = np.concatenate((tmp_obs,np.expand_dims(obs, 2)), axis=2)

preds = tmp_pred
obs = tmp_obs

# Merge observation data.
if len(obs.shape) == 3:
    comp_obs = np.mean(obs, axis = 2)
elif len(obs.shape) == 2:
    comp_obs = obs
else:
    raise ValueError("Error reading prediction data: incorrect formatting.")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


loading package hydroDL
Setting seed 0.
dPLHBV_dyn
SACSMA_snow
marrmot_PRMS


In [2]:
# LSTM model dependencies:
##########################


#### DESTINATION: train.py


import time
from hydroDL.model import rnn, cnn, crit



def randomIndex(ngrid, nt, dimSubset, bufftime=0):
    batchSize, rho = dimSubset
    iGrid = np.random.randint(0, ngrid, [batchSize])
    iT = np.random.randint(0+bufftime, nt - rho, [batchSize])
    return iGrid, iT


def selectSubset(x, iGrid, iT, rho, *, c=None, tupleOut=False, LCopt=False, bufftime=0):
    nx = x.shape[-1]
    nt = x.shape[1]
    if x.shape[0] == len(iGrid):   #hack
        iGrid = np.arange(0,len(iGrid))  # hack
    if nt <= rho:
        iT.fill(0)

    batchSize = iGrid.shape[0]
    if iT is not None:
        # batchSize = iGrid.shape[0]
        xTensor = torch.zeros([rho+bufftime, batchSize, nx], requires_grad=False)
        for k in range(batchSize):
            temp = x[iGrid[k]:iGrid[k] + 1, np.arange(iT[k]-bufftime, iT[k] + rho), :]
            xTensor[:, k:k + 1, :] = torch.from_numpy(np.swapaxes(temp, 1, 0))
    else:
        if LCopt is True:
            # used for local calibration kernel: FDC, SMAP...
            if len(x.shape) == 2:
                # Used for local calibration kernel as FDC
                # x = Ngrid * Ntime
                xTensor = torch.from_numpy(x[iGrid, :]).float()
            elif len(x.shape) == 3:
                # used for LC-SMAP x=Ngrid*Ntime*Nvar
                xTensor = torch.from_numpy(np.swapaxes(x[iGrid, :, :], 1, 2)).float()
        else:
            # Used for rho equal to the whole length of time series
            xTensor = torch.from_numpy(np.swapaxes(x[iGrid, :, :], 1, 0)).float()
            rho = xTensor.shape[0]
    if c is not None:
        nc = c.shape[-1]
        temp = np.repeat(
            np.reshape(c[iGrid, :], [batchSize, 1, nc]), rho+bufftime, axis=1)
        cTensor = torch.from_numpy(np.swapaxes(temp, 1, 0)).float()

        if (tupleOut):
            if torch.cuda.is_available():
                xTensor = xTensor.cuda()
                cTensor = cTensor.cuda()
            out = (xTensor, cTensor)
        else:
            out = torch.cat((xTensor, cTensor), 2)
    else:
        out = xTensor

    if torch.cuda.is_available() and type(out) is not tuple:
        out = out.cuda()
    return out


In [3]:
# LSTM model:
##########################


#### DESTINATION: rnn.py

import math
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
import torch.autograd as autograd

from hydroDL.model.dropout import DropMask, createMask



class CudnnLstm(torch.nn.Module):
    def __init__(self, *, inputSize, hiddenSize, dr=0.5, drMethod='drW',
                 gpu=0):
        super(CudnnLstm, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr
        self.w_ih = Parameter(torch.Tensor(hiddenSize * 4, inputSize))
        self.w_hh = Parameter(torch.Tensor(hiddenSize * 4, hiddenSize))
        self.b_ih = Parameter(torch.Tensor(hiddenSize * 4))
        self.b_hh = Parameter(torch.Tensor(hiddenSize * 4))
        self._all_weights = [['w_ih', 'w_hh', 'b_ih', 'b_hh']]
        self.cuda()

        self.reset_mask()
        self.reset_parameters()

    def _apply(self, fn):
        ret = super(CudnnLstm, self)._apply(fn)
        return ret

    def __setstate__(self, d):
        super(CudnnLstm, self).__setstate__(d)
        self.__dict__.setdefault('_data_ptrs', [])
        if 'all_weights' in d:
            self._all_weights = d['all_weights']
        if isinstance(self._all_weights[0][0], str):
            return
        self._all_weights = [['w_ih', 'w_hh', 'b_ih', 'b_hh']]

    def reset_mask(self):
        self.maskW_ih = createMask(self.w_ih, self.dr)
        self.maskW_hh = createMask(self.w_hh, self.dr)

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hiddenSize)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)


    def forward(self, input, hx=None, cx=None, doDropMC=False, dropoutFalse=False):
        # dropoutFalse: it will ensure doDrop is false, unless doDropMC is true
        if dropoutFalse and (not doDropMC):
            doDrop = False
        elif self.dr > 0 and (doDropMC is True or self.training is True):
            doDrop = True
        else:
            doDrop = False

        batchSize = input.size(1)

        if hx is None:
            hx = input.new_zeros(
                1, batchSize, self.hiddenSize, requires_grad=False)
        if cx is None:
            cx = input.new_zeros(
                1, batchSize, self.hiddenSize, requires_grad=False)

        # cuDNN backend - disabled flat weight
        # handle = torch.backends.cudnn.get_handle()
        if doDrop is True:
            self.reset_mask()
            weight = [
                DropMask.apply(self.w_ih, self.maskW_ih, True),
                DropMask.apply(self.w_hh, self.maskW_hh, True), self.b_ih,
                self.b_hh
            ]
        else:
            weight = [self.w_ih, self.w_hh, self.b_ih, self.b_hh]

        # output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
            # input, weight, 4, None, hx, cx, torch.backends.cudnn.CUDNN_LSTM,
            # self.hiddenSize, 1, False, 0, self.training, False, (), None)
        if torch.__version__ < "1.8":
            output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
                input, weight, 4, None, hx, cx, 2,  # 2 means LSTM
                self.hiddenSize, 1, False, 0, self.training, False, (), None)
        else:
            output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
                input, weight, 4, None, hx, cx, 2,  # 2 means LSTM
                self.hiddenSize, 0, 1, False, 0, self.training, False, (), None)
        return output, (hy, cy)

    @property
    def all_weights(self):
        return [[getattr(self, weight) for weight in weights]
                for weights in self._all_weights]


class CudnnLstmModel(torch.nn.Module):
    def __init__(self, *, nx, ny, hiddenSize, dr=0.5):
        super(CudnnLstmModel, self).__init__()
        self.nx = nx
        self.ny = ny
        self.hiddenSize = hiddenSize
        self.ct = 0
        self.nLayer = 1
        self.linearIn = torch.nn.Linear(nx, hiddenSize).cuda()
        self.lstm = CudnnLstm(
            inputSize=hiddenSize, hiddenSize=hiddenSize, dr=dr)
        self.linearOut = torch.nn.Linear(hiddenSize, ny)
        self.gpu = 1
        # self.drtest = torch.nn.Dropout(p=0.4)

    def forward(self, x, doDropMC=False, dropoutFalse=False):
        x0 = F.relu(self.linearIn(x))
        outLSTM, (hn, cn) = self.lstm(x0, doDropMC=doDropMC, dropoutFalse=dropoutFalse)
        # outLSTMdr = self.drtest(outLSTM)
        out = self.linearOut(outLSTM)
        return out


class RangeBoundLoss(nn.Module):
    """limit parameters from going out of range"""
    def __init__(self, lb, ub):
        super(RangeBoundLoss, self).__init__()
        self.lb = torch.tensor(lb).cuda()
        self.ub = torch.tensor(ub).cuda()
        # self.factor = torch.tensor(factor).cuda()

    def forward(self, params, factor):
        factor = torch.tensor(factor).cuda()
        loss = 0
        for i in range(len(params)):
            lb = self.lb[i]
            ub = self.ub[i]
            upper_bound_loss = factor * torch.relu(params[i] - ub).mean()
            lower_bound_loss = factor * torch.relu(lb - params[i]).mean()
            loss = loss + upper_bound_loss + lower_bound_loss
        return loss
    

In [4]:
# Wrapper:
##########################


#### DESTINATION: rnn.py

# Modified prcp_weights

class EnsembleWeights(torch.nn.Module):
    def __init__(self, *, ninv, hiddeninv, drinv=0.5, prcp_datatypes=1):
        super(EnsembleWeights, self).__init__()
        self.ninv = ninv
        self.prcp_datatypes = prcp_datatypes

        self.ntp = prcp_datatypes*3
        self.hiddeninv = hiddeninv

        self.lstminv = CudnnLstmModel(
            nx=ninv, ny=self.ntp, hiddenSize=hiddeninv, dr=drinv).cuda()

        # Adjust the range for acceptable sum of weights for loss.
        # Potentially worth testing different combinations.
        lb_prcp = [0.95]
        ub_prcp = [1.05]
        self.RangeBoundLoss = RangeBoundLoss(lb=lb_prcp, ub=ub_prcp)

    def forward(self, x, prcp_loss_factor):
        # x.requires_grad = True

        wghts = self.lstminv(x)
        wghts_scaled = torch.sigmoid(wghts)

        # prcp_wghts_sum = torch.sum(wghts_scaled, dim=2)
        # prcp_wghts_sum = torch.sum(wghts_scaled[:,:,:3], dim=2)
        prcp_wghts_sum = torch.sum(wghts_scaled[:,:,:self.ntp], dim=2)


        # range_bound_loss_prcp = self.RangeBoundLoss([prcp_wghts_sum], factor=prcp_loss_factor)+self.RangeBoundLoss([temp_wghts_sum], factor=prcp_loss_factor)+self.RangeBoundLoss([pet_wghts_sum], factor=prcp_loss_factor)
        range_bound_loss_prcp = self.RangeBoundLoss([prcp_wghts_sum], factor=prcp_loss_factor)

        # range_bound_loss_prcp = 0

        # Use if the Dr. Shen requests gradient analysis.
        # grad_daymet = autograd.grad(outputs=wghts_scaled[:, :, 0], inputs=z, grad_outputs=torch.ones_like(wghts_scaled[:, :, 0]), retain_graph=True)[0]
        # grad_maurer = autograd.grad(outputs=wghts_scaled[:, :, 1], inputs=z, grad_outputs=torch.ones_like(wghts_scaled[:, :, 1]), retain_graph=True)[0]
        # grad_nldas = autograd.grad(outputs=wghts_scaled[:, :, 2], inputs=z, grad_outputs=torch.ones_like(wghts_scaled[:, :, 2]), retain_graph=True)[0]

        # return x_new, range_bound_loss_prcp, wghts_scaled, grad_daymet, grad_maurer, grad_nldas
        return range_bound_loss_prcp, wghts_scaled


In [5]:
from typing import ValuesView
from tqdm import tqdm


def trainEnsemble(model,
                  x,
                  y,
                  lossFun,
                  *,
                  nEpoch=500,
                  startEpoch=1,
                  miniBatch=[100, 30],
                  saveEpoch=100,
                  saveFolder=None,
                  mode='seq2seq',
                  bufftime=0,
                  prcp_loss_factor = 15,
                  smooth_loss_factor = 0,
                  ):
    """
    x- input;
    y - target;
    """

    if torch.cuda.is_available():
        lossFun = lossFun.cuda()
        model = model.cuda()

    batchSize, rho = miniBatch
    ngrid, nt, nx = x.shape

    if batchSize >= ngrid:
        # batchsize larger than total grids
        batchSize = ngrid

    nIterEp = int(
        np.ceil(np.log(0.01) / np.log(1 - batchSize * rho / ngrid / (nt-bufftime))))
    if hasattr(model, 'ctRm'):
        if model.ctRm is True:
            nIterEp = int(
                np.ceil(
                    np.log(0.01) / np.log(1 - batchSize *
                                          (rho - model.ct) / ngrid / (nt-bufftime))))

    optim = torch.optim.Adadelta(list(model.parameters()))
    model.zero_grad()

    if saveFolder != None:
        os.makedirs(saveFolder, exist_ok=True)
    #     runFile = os.path.join(saveFolder, 'run.csv')

    #     rf = open(runFile, 'w+')

    for iEpoch in range(startEpoch, nEpoch + 1):
        lossEp = 0
        loss_prcp_Ep = 0
        loss_sf_Ep = 0

        t0 = time.time()
        prog_str = "Epoch " + str(iEpoch) + "/" + str(nEpoch)

        for iIter in tqdm(range(0, nIterEp), desc=prog_str, leave=False):
            # training iterations
            iGrid, iT = randomIndex(ngrid, nt, [batchSize, rho], bufftime=bufftime)

            if type(model) == EnsembleWeights:
                # leave here to allow additional model types for future.
                xTrain = selectSubset(x, iGrid, iT, rho, bufftime=bufftime)
            else:
                raise ValueError("Model must be of type `Ensemble_weights")

            yTrain = selectSubset(y, iGrid, iT, rho)

            # calculate loss and weights `wt`.
            prcp_loss, prcp_wt = model(xTrain, prcp_loss_factor)
            yP = xTrain * prcp_wt


            ## temporary test for NSE loss
            if type(lossFun) in [crit.NSELossBatch, crit.NSESqrtLossBatch]:
                loss_sf = lossFun(yP, yTrain, iGrid)
                loss = loss_sf + prcp_loss
            else:
                loss_sf = lossFun(yP, yTrain)
                loss = loss_sf + prcp_loss

            loss.backward()
            optim.step()
            optim.zero_grad()
            lossEp = lossEp + loss.item()

            try:
                loss_prcp_Ep = loss_prcp_Ep + prcp_loss.item()
            except:
                pass

            loss_sf_Ep = loss_sf_Ep + loss_sf.item()

            # if iIter % 100 == 0:
            #     # print loss
            #     print('Iter {} of {}: Loss {:.3f}'.format(iIter, nIterEp, loss.item()))

        lossEp = lossEp / nIterEp
        loss_sf_Ep = loss_sf_Ep / nIterEp
        loss_prcp_Ep = loss_prcp_Ep / nIterEp

        logStr = 'Epoch {} Loss {:.3f}, Streamflow Loss {:.3f}, Precipitation Loss {:.3f}, time {:.2f}'.format(
            iEpoch, lossEp, loss_sf_Ep, loss_prcp_Ep,
            time.time() - t0)
        print(logStr)

        # save model and loss
        if saveFolder != None:
            runFile = os.path.join(saveFolder, 'run.csv')

            with open(runFile, 'w+') as rf:
              rf.write(logStr + '\n')

            if iEpoch % saveEpoch == 0:
                # save model
                modelFile = os.path.join(saveFolder,
                                         'model_Ep' + str(iEpoch) + '.pt')
                torch.save(model, modelFile)

    if saveFolder != None:
        rf.close()

    return model


---
### Testing Ensemble


\\

\\


---

In [6]:
EPOCHS = 100
BATCHSIZE = 100
LOSS_FACTOR = 15
SMOOTH_LOSS_FACTOR = 0
ALPHA = 0.25  # A weight for RMSE loss to balance low and peak flow.
LOSSFUNC = crit.RmseLossComb(alpha=ALPHA)

# model = EnsembleWeights(ninv=3, hiddeninv=256, prcp_datatypes=1)


In [7]:
# Setting save path for model.
rootdir = '/content/drive/MyDrive/Colab/data/model_runs/hydro_multimodel_results/multimodels/671_sites_dp/ensemble/'
save_path = rootdir + 'HBV_SAC_wSnow_PRMS' + '_E' + str(EPOCHS) + '_B' + str(BATCHSIZE) + '_L' + str(LOSS_FACTOR) + '_SmL' + str(SMOOTH_LOSS_FACTOR)

# Load previous model to continue training:
train_epoch = 0
# model = loadModel(save_path, epoch=train_epoch)

model = trainEnsemble(model,
                      preds,
                      obs,
                      LOSSFUNC,
                      nEpoch=EPOCHS,
                      startEpoch = train_epoch+1,
                      miniBatch=[BATCHSIZE, 30],
                      saveEpoch=1,
                      saveFolder=save_path,
                      mode='seq2seq',
                      bufftime=0,
                      prcp_loss_factor = LOSS_FACTOR,
                      smooth_loss_factor = SMOOTH_LOSS_FACTOR
                      )

NameError: name 'model' is not defined

In [9]:
import sys
sys.path.append('../../')
from hydroDL import master, utils
from hydroDL.data import camels
from hydroDL.master import default
from hydroDL.model import rnn, crit, train
from hydroDL.master import loadModel


import os
import numpy as np
import torch
from collections import OrderedDict
import random
import json
import datetime as dt

import sys
from datetime import datetime




# fix the random seeds for reproducibility
randomseed = 111111
random.seed(randomseed)
torch.manual_seed(randomseed)
np.random.seed(randomseed)
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## GPU setting
# which GPU to use when having multiple
# traingpuid = 6
# torch.cuda.set_device(traingpuid)


# my_list = json.loads(sys.argv[1])
## Setting training options here
PUOpt = 0
# PUOpt values and explanations:
# 0: train and test on ALL basins;
# 1 for PUB spatial test, randomly hold out basins;
# 2 for PUR spatial test, hold out a continuous region;
buffOpt = 0
# buffOpt defines the warm-up option for the first year of training forcing data
# 0: do nothing, the first year forcing would only be used to warm up the next year;
# 1: repeat first year forcing to warm up the first year;3.988196838920481
# 2: load one more year forcing to warm up the first year
TDOpt = True
# TDOpt, True as using dynamic parameters and False as using static parameters


multiforcing = True # set True if you want to use multiple forcings
if multiforcing == False:
    forType = 'nldas'
    # for Type defines which forcing in CAMELS to use: 'daymet', 'nldas', 'maurer'
else:
    # forType = ['daymet']
    forType = ['daymet', 'maurer_extended', 'nldas_extended']
    # forType = ['nldas_extended', 'maurer_extended']

#used only when multiforcing is True; else does not matter
prcp_loss_factor = 23
smooth_loss_factor = 0

## Set hyperparameters
EPOCH = 50 # total epoches to train the mode
BATCH_SIZE = 100
RHO = 365
HIDDENSIZE = 256
saveEPOCH = 10
Ttrain = [19801001, 19951001] # Training period
# Ttrain = [19891001, 19991001] # PUB/PUR period
Tinv = [19801001, 19951001] # Inversion period for historical forcings
# Tinv = [19891001, 19991001] # PUB/PUR period
Nfea = 12 # number of HBV parameters. 12:original HBV; 13:includes the added dynamic ET para when setting ETMod=True
BUFFTIME = 365 # for each training sample, to use BUFFTIME days to warm up the states.
routing = True # Whether to use the routing module for simulated runoff
Nmul = 16 # Multi-component model. How many parallel HBV components to use. 1 means the original HBV.
comprout = False # True is doing routing for each component
compwts = False # True is using weighted average for components; False is the simple mean
pcorr = None # or a list to give the range of precip correction

# Convert the date strings to datetime objects
dateTrain1 = datetime.strptime(str(Ttrain[0]), '%Y%m%d')
dateTrain2 = datetime.strptime(str(Ttrain[1]), '%Y%m%d')
delta_train = dateTrain2 - dateTrain1
num_days_train = delta_train.days


if TDOpt is True:
    # Below options are only for running models with dynamic parameters
    tdRep = [1, 13] # When using dynamic parameters, this list defines which parameters to set as dynamic
    tdRepS = [str(ix) for ix in tdRep]
    # ETMod: if True, use the added shape parameter (index 13) for ET. Default as False.
    # Must set below ETMod as True and Nfea=13 when including 13 index in above tdRep list for dynamic parameters
    # If 13 not in tdRep list, set below ETMod=False and Nfea=12 to use the original HBV without ET shape para
    ETMod = True
    Nfea = 13 # should be 13 when setting ETMod=True. 12 when ETMod=False
    dydrop = 0.0 # dropout possibility for those dynamic parameters: 0.0 always dynamic; 1.0 always static
    staind = -1 # which time step to use from the learned para time series for those static parameters
    TDN = '/TDTestforc/'+'TD'+"_".join(tdRepS) +'/'
else:
    TDN = '/Testforc/'

# Define root directory of database and output
# Modify these based on your own location of CAMELS dataset
# Following the data download instruction in README file, you should organize the folders like
# 'your/path/to/Camels/basin_timeseries_v1p2_metForcing_obsFlow' and 'your/path/to/Camels/camels_attributes_v2.0'
# Then 'rootDatabase' here should be 'your/path/to/Camels';
# 'rootOut' is the root dir where you save the trained model
rootDatabase = os.path.join(os.path.sep, 'scratch', 'Camels')  # CAMELS dataset root directory
# rootDatabase = os.path.join(os.path.sep, 'data', 'kas7897', 'dPLHBVrelease')  # CAMELS dataset root directory
camels.initcamels(rootDatabase)  # initialize camels module-scope variables in camels.py (dirDB, gageDict) to read basin info

# rootOut = os.path.join(os.path.sep, 'data', 'rnnStreamflow')  # Model output root directory
rootOut = os.path.join(os.path.sep, 'data', 'kas7897', 'dPLHBVrelease', 'output')  # Model output root directory

## set up different data loadings for ALL, PUB, PUR
testfoldInd = 1
# Which fold to hold out for PUB (10 folds, from 1 to 10) and PUR (7 folds, from 1 to 7).
# It doesn't matter when training on ALL basins (setting PUOpt=0), could always set testfoldInd=1 for this case.

# load CAMELS basin information
gageinfo = camels.gageDict
hucinfo = gageinfo['huc']
gageid = gageinfo['id']
gageidLst = gageid.tolist()

if PUOpt == 0: # training on all basins without spatial hold-out
    puN = 'ALL'
    TrainLS = gageidLst # all basins
    TrainInd = [gageidLst.index(j) for j in TrainLS]
    TestLS = gageidLst
    TestInd = [gageidLst.index(j) for j in TestLS]
    gageDic = {'TrainID':TrainLS, 'TestID':TestLS}

elif PUOpt == 1: # random hold out basins. hold out the fold set by testfoldInd
    puN = 'PUB'
    # load the PUB basin groups
    # randomly divide CAMELS basins into 10 groups and this file contains the basin ID for each group
    # located in splitPath
    splitPath = 'PUBsplitLst.txt'
    with open(splitPath, 'r') as fp:
        testIDLst=json.load(fp)
    # Generate training ID lists excluding the hold out fold
    TestLS = testIDLst[testfoldInd - 1]
    TestInd = [gageidLst.index(j) for j in TestLS]
    TrainLS = list(set(gageid.tolist()) - set(TestLS))
    TrainInd = [gageidLst.index(j) for j in TrainLS]
    gageDic = {'TrainID':TrainLS, 'TestID':TestLS}

elif PUOpt == 2:
    puN = 'PUR'
    # Divide CAMELS dataset into 7 continous PUR regions, as shown in Feng et al, 2021 GRL; 2022 HESSD
    # get the id list of each PUR region, save to list
    regionID = list()
    regionNum = list()
    # seven regions including different HUCs
    regionDivide = [ [1,2], [3,6], [4,5,7], [9,10], [8,11,12,13], [14,15,16,18], [17] ]
    for ii in range(len(regionDivide)):
        tempcomb = regionDivide[ii]
        tempregid = list()
        for ih in tempcomb:
            tempid = gageid[hucinfo==ih].tolist()
            tempregid = tempregid + tempid
        regionID.append(tempregid)
        regionNum.append(len(tempregid))

    iexp = testfoldInd - 1  #index
    TestLS = regionID[iexp] # basin ID list for testing, hold out for training
    TestInd = [gageidLst.index(j) for j in TestLS]
    TrainLS = list(set(gageid.tolist()) - set(TestLS)) # basin ID for training
    TrainInd = [gageidLst.index(j) for j in TrainLS]
    gageDic = {'TrainID': TrainLS, 'TestID': TestLS}


# apply buffOPt to solve the warm-up for the first year
if buffOpt ==2: # load more BUFFTIME data for the first year
    sd = utils.time.t2dt(Ttrain[0]) - dt.timedelta(days=BUFFTIME)
    sdint = int(sd.strftime("%Y%m%d"))
    TtrainLoad = [sdint, Ttrain[1]]
    TinvLoad = [sdint, Ttrain[1]]
else:
    TtrainLoad = Ttrain
    TinvLoad = Tinv

## prepare input data
## load camels dataset
# if forType == 'daymet' or forType==['daymet', 'maurer_extended', 'nldas_extended']:
#     varF = ['prcp', 'tmean']
#     varFInv = ['prcp', 'tmean']
# else:
#     varF = ['prcp', 'tmax'] # For CAMELS maurer and nldas forcings, tmax is actually tmean
#     varFInv = ['prcp', 'tmax']

# the attributes used to learn parameters
attrnewLst = [ 'p_mean','pet_mean','p_seasonality','frac_snow','aridity','high_prec_freq','high_prec_dur',
               'low_prec_freq','low_prec_dur', 'elev_mean', 'slope_mean', 'area_gages2', 'frac_forest', 'lai_max',
               'lai_diff', 'gvf_max', 'gvf_diff', 'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50',
               'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity',
               'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 'geol_1st_class', 'glim_1st_class_frac',
               'geol_2nd_class', 'glim_2nd_class_frac', 'carbonate_rocks_frac', 'geol_porostiy', 'geol_permeability']
attrWghts = ['p_mean','pet_mean','p_seasonality','frac_snow','aridity','high_prec_freq','high_prec_dur',
               'low_prec_freq','low_prec_dur', 'elev_mean', 'slope_mean', 'area_gages2', 'frac_forest', 'lai_max',
               'lai_diff', 'gvf_max', 'gvf_diff']

optData = default.optDataCamels # a default dictionary for logging, updated below
# Update the training period and variables

# if forType==['daymet', 'maurer_extended', 'nldas_extended']:
if type(forType) == list:
    #for all forcings
    # forcUN = np.empty([len(TrainInd), num_days_train, len(forType)*2])
    # forcInvUN = np.empty([len(TrainInd), num_days_train, len(forType)*2])

    #for multiple prcp only
    forcUN = np.empty([len(TrainInd), num_days_train, len(forType) + 1])
    forcInvUN = np.empty([len(TrainInd), num_days_train, len(forType) +1])
    # counter = 0
    for i in range(len(forType)):
        if forType[i] == 'daymet':
            varF = ['prcp', 'tmean']
            varFInv = ['prcp', 'tmean']
        else:
            varF = ['prcp', 'tmax']  # For CAMELS maurer and nldas forcings, tmax is actually tmean
            varFInv = ['prcp', 'tmax']

        if 'daymet' in forType:
            optData = default.update(optData, tRange=TtrainLoad, varT=varFInv, varC=attrnewLst, subset=TrainLS,
                                     forType='daymet')
        elif 'nldas' in forType:
            optData = default.update(optData, tRange=TtrainLoad, varT=varFInv, varC=attrnewLst, subset=TrainLS,
                                     forType='nldas')
        elif 'nldas_extended' in forType:
            optData = default.update(optData, tRange=TtrainLoad, varT=varFInv, varC=attrnewLst, subset=TrainLS,
                                     forType='nldas_extended')
        else:
            optData = default.update(optData, tRange=TtrainLoad, varT=varFInv, varC=attrnewLst, subset=TrainLS,
                                     forType=forType[0])

        dfTrain = camels.DataframeCamels(tRange=TtrainLoad, subset=TrainLS, forType=forType[i])
        forcUN_type = dfTrain.getDataTs(varLst=varF, doNorm=False, rmNan=False)

        dfInv = camels.DataframeCamels(tRange=TinvLoad, subset=TrainLS, forType=forType[i])
        forcInvUN_type = dfInv.getDataTs(varLst=varFInv, doNorm=False, rmNan=False)

        forcUN[:, :, i] = forcUN_type[:, :, 0]
        forcInvUN[:, :, i] = forcInvUN_type[:, :, 0]
        forcUN[:, :, -1] = forcUN_type[:, :, 1]
        forcInvUN[:, :, -1] = forcInvUN_type[:, :, 1]
        if forType[i] == 'daymet':
            daymet_temp = forcUN_type[:, :, 1]
            daymetInV_temp = forcInvUN_type[:, :, 1]
        if forType[i] == 'nldas' or forType[i] == 'nldas_extended':
            nldas_temp = forcUN_type[:, :, 1]
            nldasInV_temp = forcInvUN_type[:, :, 1]

        #for all forcings
        # forcUN[:,:,i] = forcUN_type[:,:,0]
        # forcUN[:,:,i+3] = forcUN_type[:,:,1]
        # forcInvUN[:,:,i] = forcInvUN_type[:,:,0]
        # forcInvUN[:,:,i+3] = forcInvUN_type[:,:,1]


    obsUN = dfTrain.getDataObs(doNorm=False, rmNan=False, basinnorm=False)
    attrsUN = dfInv.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)
    attrs_wghtsUN = dfInv.getDataConst(varLst=attrWghts, doNorm=False, rmNan=False)

    if 'daymet' in forType:
        forcUN[:, :, -1] = daymet_temp
        forcInvUN[:, :, -1] = daymetInV_temp
    elif 'nldas' in forType or 'nldas_extended' in forType:
        forcUN[:, :, -1] = nldas_temp
        forcInvUN[:, :, -1] = nldasInV_temp



else:
    if forType == 'daymet':
        varF = ['prcp', 'tmean']
        varFInv = ['prcp', 'tmean']
    else:
        varF = ['prcp', 'tmax']  # For CAMELS maurer and nldas forcings, tmax is actually tmean
        varFInv = ['prcp', 'tmax']
    optData = default.update(optData, tRange=TtrainLoad, varT=varFInv, varC=attrnewLst, subset=TrainLS, forType=forType)
    dfTrain = camels.DataframeCamels(tRange=TtrainLoad, subset=TrainLS, forType=forType)
    forcUN = dfTrain.getDataTs(varLst=varF, doNorm=False, rmNan=False)
    obsUN = dfTrain.getDataObs(doNorm=False, rmNan=False, basinnorm=False)
    # for dPL inversion data, inputs of gA
    dfInv = camels.DataframeCamels(tRange=TinvLoad, subset=TrainLS, forType=forType)
    forcInvUN = dfInv.getDataTs(varLst=varFInv, doNorm=False, rmNan=False)
    attrsUN = dfInv.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)


# for HBV model training inputs


# dfInv = camels.DataframeCamels(tRange=TinvLoad, subset=TrainLS, forType=forType)
# forcInvUN = dfInv.getDataTs(varLst=varFInv, doNorm=False, rmNan=False)
# attrsUN = dfInv.getDataConst(varLst=attrnewLst, doNorm=False, rmNan=False)

# Unit transformation, discharge obs from ft3/s to mm/day
areas = gageinfo['area'][TrainInd] # unit km2
temparea = np.tile(areas[:, None, None], (1, obsUN.shape[1],1))
obsUN = (obsUN * 0.0283168 * 3600 * 24) / (temparea * (10 ** 6)) * 10**3 # transform to mm/day

# load potential ET calculated by hargreaves method
varLstNL = ['PEVAP']
usgsIdLst = gageid

#for multiple PETs
# PETUN = np.empty([len(usgsIdLst), num_days_train, len(forType)])
# PETInvUN = np.empty([len(usgsIdLst), num_days_train, len(forType)])
# if type(forType) == list:
#     for i in range(len(forType)):
#         if forType[i] == 'nldas_extended' or forType[i] == 'nldas':
#             PETDir = rootDatabase + '/pet_harg/' + 'nldas' + '/'
#             tPETRange = [19800101, 20150101]
#             tPETLst = utils.time.tRange2Array(tPETRange)
#         if forType[i] == 'maurer_extended' or forType[i] == 'maurer':
#             PETDir = rootDatabase + '/pet_harg/' + 'maurer' + '/'
#             tPETRange = [19800101, 20090101]
#             tPETLst = utils.time.tRange2Array(tPETRange)
#         if forType[i] == 'daymet':
#             PETDir = rootDatabase + '/pet_harg/' + 'daymet' + '/'
#             tPETRange = [19800101, 20150101]
#             tPETLst = utils.time.tRange2Array(tPETRange)
#         ntime = len(tPETLst)
#         PETfull = np.empty([len(usgsIdLst), ntime, len(varLstNL)])
#         for k in range(len(usgsIdLst)):
#             dataTemp = camels.readcsvGage(PETDir, usgsIdLst[k], varLstNL, ntime)
#             PETfull[k, :, :] = dataTemp
#         TtrainLst = utils.time.tRange2Array(TtrainLoad)
#         TinvLst = utils.time.tRange2Array(TinvLoad)
#         C, ind1, ind2 = np.intersect1d(TtrainLst, tPETLst, return_indices=True)
#         PETUN_type = PETfull[:, ind2, :]
#         PETUN_type = PETUN_type[TrainInd, :, :] # select basins
#         PETUN[:,:,i]  = PETUN_type[:,:,0]
#         C, ind1, ind2inv = np.intersect1d(TinvLst, tPETLst, return_indices=True)
#         PETInvUN_type = PETfull[:, ind2inv, :]
#         PETInvUN_type = PETInvUN_type[TrainInd, :, :]
#         PETInvUN[:,:,i] = PETInvUN_type[:,:,0]

if forType == 'maurer' or forType=='maurer_extended' or forType == ['maurer'] or forType == ['maurer_extended']:
    tPETRange = [19800101, 20090101]
else:
    tPETRange = [19800101, 20150101]
tPETLst = utils.time.tRange2Array(tPETRange)
# Modify this as the directory where you put PET
if type(forType) == list:
    if forType[0]=='nldas_extended':
        PETDir = rootDatabase + '/pet_harg/' + 'nldas' + '/'
    elif forType[0]=='maurer_extended':
        PETDir = rootDatabase + '/pet_harg/' + 'maurer' + '/'
    else:
        PETDir = rootDatabase + '/pet_harg/' + forType[0] + '/'
else:
    PETDir = rootDatabase + '/pet_harg/' + forType + '/'

ntime = len(tPETLst)
PETfull = np.empty([len(usgsIdLst), ntime, len(varLstNL)])
for k in range(len(usgsIdLst)):
    dataTemp = camels.readcsvGage(PETDir, usgsIdLst[k], varLstNL, ntime)
    PETfull[k, :, :] = dataTemp

TtrainLst = utils.time.tRange2Array(TtrainLoad)
TinvLst = utils.time.tRange2Array(TinvLoad)
C, ind1, ind2 = np.intersect1d(TtrainLst, tPETLst, return_indices=True)
PETUN = PETfull[:, ind2, :]
PETUN = PETUN[TrainInd, :, :] # select basins
C, ind1, ind2inv = np.intersect1d(TinvLst, tPETLst, return_indices=True)
PETInvUN = PETfull[:, ind2inv, :]
PETInvUN = PETInvUN[TrainInd, :, :]

# process data, do normalization and remove nan
series_inv = np.concatenate([forcInvUN, PETInvUN], axis=2)
series_inv_hbv = series_inv[:,:,(0,-2,-1)]
seriesvarLst = varFInv + ['pet']
# calculate statistics for normalization and saved to a dictionary
statDict_hbv = camels.getStatDic(attrLst=attrnewLst, attrdata=attrsUN, seriesLst=seriesvarLst, seriesdata=series_inv_hbv)
statDict_wghts = camels.getStatDic(attrLst=attrWghts, attrdata=attrs_wghtsUN, seriesLst=['prcp_daymet', 'prcp_maurer', 'prcp_nldas', 'tmax', 'pet'], seriesdata=series_inv)
# normalize data
attr_norm = camels.transNormbyDic(attrsUN, attrnewLst, statDict_hbv, toNorm=True)
attrWghts_norm = camels.transNormbyDic(attrs_wghtsUN, attrWghts, statDict_hbv, toNorm=True)
attr_norm[np.isnan(attr_norm)] = 0.0
attrWghts_norm[np.isnan(attrWghts_norm)] = 0.0
series_norm_hbv = camels.transNormbyDic(series_inv_hbv, seriesvarLst, statDict_hbv, toNorm=True)
series_Wghts_norm = camels.transNormbyDic(series_inv, ['prcp_daymet', 'prcp_maurer', 'prcp_nldas', 'tmax', 'pet'], statDict_wghts, toNorm=True)
series_norm_hbv[np.isnan(series_norm_hbv)] = 0.0
series_Wghts_norm[np.isnan(series_Wghts_norm)] = 0.0

# prepare the inputs
zTrain_hbv = series_norm_hbv # used as the inputs for dPL inversion gA along with attributes
zTrain_wghts = series_Wghts_norm # used as the inputs for dPL inversion gA along with attributes
xTrain_wghts = np.concatenate([forcUN, PETUN], axis=2) # used as HBV forcing
xTrain_hbv = xTrain_wghts[:,:,(0,-2,-1)] # used as HBV forcing
xTrain_wghts[np.isnan(xTrain_wghts)] = 0.0
xTrain_hbv[np.isnan(xTrain_hbv)] = 0.0

if buffOpt == 1: # repeat the first year warm up the first year itself
    zTrainIn_hbv = np.concatenate([zTrain_hbv[:,0:BUFFTIME,:], zTrain_hbv], axis=1)
    zTrainIn_wghts = np.concatenate([zTrain_wghts[:,0:BUFFTIME,:], zTrain_wghts], axis=1)
    xTrainIn_wghts = np.concatenate([xTrain_wghts[:,0:BUFFTIME,:], xTrain_wghts], axis=1) # repeat forcing to warm up the first year
    xTrainIn_hbv = np.concatenate([xTrain_hbv[:,0:BUFFTIME,:], xTrain_hbv], axis=1) # repeat 
else: # no repeat, original data, the first year data would only be used as warmup for the next following year
    zTrainIn_hbv = zTrain_hbv
    zTrainIn_wghts = zTrain_wghts
    xTrainIn_wghts = xTrain_wghts
    xTrainIn_hbv = xTrain_hbv

forcTuple_hbv = (xTrainIn_hbv, zTrainIn_hbv)
forcTuple_wghts = (xTrainIn_wghts, zTrainIn_wghts)
attrs = attr_norm
attrs_wghts = attrWghts_norm

## Train the model
# define loss function
alpha = 0.25 # a weight for RMSE loss to balance low and peak flow
optLoss = default.update(default.optLossComb, name='hydroDL.model.crit.RmseLossComb', weight=alpha)
lossFun = crit.RmseLossComb(alpha=alpha)

# define training options
optTrain = default.update(default.optTrainCamels, miniBatch=[BATCH_SIZE, RHO], nEpoch=EPOCH, saveEpoch=saveEPOCH)
# define output folder to save model results
exp_name = 'CAMELSDemo'
if forType==['daymet', 'maurer_extended', 'nldas_extended']:
    exp_disp = 'LSTM-dPLHBV/' + puN + TDN + 'allprcp_withloss' + str(prcp_loss_factor) + 'smooth' + str(smooth_loss_factor) + '/BuffOpt'+str(buffOpt)+'/RMSE_para'+str(alpha)+'/' + str(randomseed) + \
           '/Fold' + str(testfoldInd)
elif forType==['daymet', 'maurer', 'nldas']:
    exp_disp = 'dPLHBV/' + puN + TDN + 'all_withloss' + str(prcp_loss_factor) + 'smooth' + str(smooth_loss_factor) + '/BuffOpt'+str(buffOpt)+'/RMSE_para'+str(alpha)+'/' + str(randomseed) + \
           '/Fold' + str(testfoldInd)
elif type(forType)==list:
    forType_string = '|'.join(forType)
    exp_disp = 'dPLHBV/' + puN + TDN + forType_string + 'withloss' + str(prcp_loss_factor)+ 'smooth' + str(smooth_loss_factor) + '/BuffOpt'+str(buffOpt)+'/RMSE_para'+str(alpha)+'/' + str(randomseed) + \
           '/Fold' + str(testfoldInd)
else:
    exp_disp = 'dPLHBV/' + puN + TDN + forType + '/BuffOpt'+str(buffOpt)+'/RMSE_para'+str(alpha)+'/' + str(randomseed) + \
            '/Fold' + str(testfoldInd)
exp_info = 'T_'+str(Ttrain[0])+'_'+str(Ttrain[1])+'_BS_'+str(BATCH_SIZE)+'_HS_'+str(HIDDENSIZE)\
           +'_RHO_'+str(RHO)+'_NF_'+str(Nfea)+'_Buff_'+str(BUFFTIME)+'_Mul_'+str(Nmul)
save_path = os.path.join(exp_name, exp_disp)
out = os.path.join(rootOut, save_path, exp_info) # output folder to save results
# define and load model
Ninv = zTrain_wghts.shape[-1] + attrs_wghts.shape[-1]



##################### MODEL ##########################
model = prcp_weights(ninv=Ninv, hiddeninv=HIDDENSIZE, prcp_datatypes=len(forType))



# dict only for logging
optModel = OrderedDict(name='LSTM-dPLHBV', nx=Ninv, nfea=Nfea, nmul=Nmul, hiddenSize=HIDDENSIZE, doReLU=True,
                        Tinv=Tinv, Trainbuff=BUFFTIME, routOpt=routing, comprout=comprout, compwts=compwts,
                        pcorr=pcorr, staind=staind, tdlst=tdRep, dydrop=dydrop,buffOpt=buffOpt, TDOpt=TDOpt, ETMod=ETMod)



# Wrap up all the training configurations to one dictionary in order to save into "out" folder as logging
masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain)
master.writeMasterFile(masterDict)
# log statistics for normalization
statFile_wghts = os.path.join(out, 'statDict_wghts.json')
with open(statFile_wghts, 'w') as fp:
    json.dump(statDict_wghts, fp, indent=4)
statFile_hbv = os.path.join(out, 'statDict_hbv.json')
with open(statFile_hbv, 'w') as fp:
    json.dump(statDict_hbv, fp, indent=4)

testout = "/data/kas7897/dPLHBVrelease/output/CAMELSDemo/dPLHBV/ALL" \
          "/TDTestforc/TD1_13/daymet/BuffOpt0/RMSE_para0.25/111111/Fold1" \
          "/T_19801001_19951001_BS_100_HS_256_RHO_365_NF_13_Buff_365_Mul_16/model_Ep50.pt"
loaded_hbv = torch.load(testout)

# Train the model
# trainedModel = train.train2Model(
#     model,
#     loaded_hbv,
#     forcTuple_wghts,
#     forcTuple_hbv,
#     yTrainIn,
#     attrs_wghts,
#     attrs,
#     lossFun,
#     nEpoch=EPOCH,
#     miniBatch=[BATCH_SIZE, RHO],
#     saveEpoch=saveEPOCH,
#     saveFolder=out,
#     bufftime=BUFFTIME,
#     multiforcing=multiforcing,
#     prcp_loss_factor=prcp_loss_factor)


FileNotFoundError: [Errno 2] No such file or directory: '/scratch/Camels/basin_timeseries_v1p2_metForcing_obsFlow/basin_dataset_public_v1p2/basin_metadata/gauge_information.txt'

In [6]:
attrnewLst = [ 'p_mean','pet_mean','p_seasonality','frac_snow','aridity','high_prec_freq','high_prec_dur',
               'low_prec_freq','low_prec_dur', 'elev_mean', 'slope_mean', 'area_gages2', 'frac_forest', 'lai_max',
               'lai_diff', 'gvf_max', 'gvf_diff', 'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50',
               'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity',
               'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 'geol_1st_class', 'glim_1st_class_frac',
               'geol_2nd_class', 'glim_2nd_class_frac', 'carbonate_rocks_frac', 'geol_porostiy', 'geol_permeability']

In [7]:
len(attrnewLst)

35

In [10]:
import logging
from datetime import datetime
from typing import Any, Optional

import numpy as np
import pandas as pd
from pydantic import BaseModel, ConfigDict, PrivateAttr
import torch
from omegaconf import DictConfig

log = logging.getLogger(__name__)


class Dates(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    daily_format: str = "%Y/%m/%d"
    hourly_format: str = "%Y/%m/%d %H:%M:%S"
    origin_start_date: str = "1980/01/01"
    start_time: str
    end_time: str
    rho: Optional[int] = None
    batch_daily_time_range: pd.DatetimeIndex = PrivateAttr
    batch_hourly_time_range: pd.DatetimeIndex = PrivateAttr
    daily_time_range: pd.DatetimeIndex = PrivateAttr
    hourly_indices: torch.Tensor = PrivateAttr
    hourly_time_range: pd.DatetimeIndex = PrivateAttr
    numerical_time_range: np.ndarray = PrivateAttr

    def __init__(self, cfg: DictConfig):
        super(Dates, self).__init__(
            start_time=cfg.dataset.time.start,
            end_time=cfg.dataset.time.end,
            rho=cfg.dataset.get("rho", None),
        )

    def model_post_init(self, __context: Any) -> None:
        self.daily_time_range = pd.date_range(
            datetime.strptime(self.start_time, self.daily_format),
            datetime.strptime(self.end_time, self.daily_format),
            freq="D",
            inclusive="both",
        )
        self.hourly_time_range = pd.date_range(
            start=self.daily_time_range[0],
            end=self.daily_time_range[-1],
            freq="h",
            inclusive="both",
        )
        self.batch_daily_time_range = self.daily_time_range
        self.set_batch_time(self.daily_time_range)

    def set_batch_time(self, daily_time_range: pd.DatetimeIndex):
        self.batch_hourly_time_range = pd.date_range(
            start=daily_time_range[0],
            end=daily_time_range[-1],
            freq="h",
            inclusive="both",
        )
        origin_start_date = datetime.strptime(self.origin_start_date, self.daily_format)
        origin_base_start_time = int(
            (daily_time_range[0].to_pydatetime() - origin_start_date).total_seconds()
            / 86400
        )
        origin_base_end_time = int(
            (daily_time_range[-1].to_pydatetime() - origin_start_date).total_seconds()
            / 86400
        )

        # The indices for the dates in your selected routing time range
        self.numerical_time_range = np.arange(
            origin_base_start_time, origin_base_end_time + 1, 1
        )

        common_elements = self.hourly_time_range.intersection(
            self.batch_hourly_time_range
        )
        self.hourly_indices = torch.tensor(
            [self.hourly_time_range.get_loc(time) for time in common_elements]
        )

    def calculate_time_period(self) -> None:
        if self.rho is not None:
            sample_size = len(self.daily_time_range)
            random_start = torch.randint(
                low=0, high=sample_size - self.rho, size=(1, 1)
            )[0][0].item()
            self.batch_daily_time_range = self.daily_time_range[
                random_start : (random_start + self.rho)
            ]
            self.set_batch_time(self.batch_daily_time_range)

ModuleNotFoundError: No module named 'omegaconf'