# Calibration analyses wflow_sbm

In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import hydroeval

from glob import glob
from pathlib import Path

## Set Paths

In [None]:
# Project locations; everything is derived from the project root directory
ROOT = Path("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/")

# Input data: wflow model runs and auxiliary data (CAMELS-GB timeseries)
MODELS = ROOT / "wflow" / "data"
AUXDATA = ROOT / "aux_data"
OBSDIR = AUXDATA / "CAMELS-GB" / "data" / "timeseries"

# Destination of the calibration-period results
OUTPUT = ROOT / "results" / "wflow_sbm" / "calibration_period"

## Set Config

In [None]:
# Collect the available wflow_sbm basin IDs: the last path component of
# every entry found directly under MODELS, in sorted order
basin_dirs = glob(f'{MODELS}/*')
basin_ids = sorted(basin_dir.split('/')[-1] for basin_dir in basin_dirs)

# Calibration period (the first year is dropped)
start_date = '2001-10-01'
end_date = '2007-09-30'

# ksathorfrac values that were simulated, grouped by step size
calibration_values = [
    1,
    # steps of 5
    5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
    55, 60, 65, 70, 75, 80, 85, 90, 95,
    # steps of 25
    100, 125, 150, 175, 200, 225, 250, 275,
    # steps of 50 (note: 500 is not part of the set)
    300, 350, 400, 450,
    550, 600, 650, 700, 750, 800, 850, 900, 950, 1000,
    # steps of 500 (note: 3500 is not part of the set)
    1500, 2000, 2500, 3000,
    4000, 4500, 5000,
    # coarse upper end
    7500, 10000,
]

## Define functions

In [None]:
def get_simulations(basin_id, calibration_values, start_date, end_date):
    """Collect the simulation output of one basin for all calibration values.

    For every calibration value the file
    ``MODELS/<basin_id>/ksathorfrac_<value>/output.csv`` is read with its
    ``time`` column as index, restricted to the rows between ``start_date``
    and ``end_date`` (both inclusive), and its ``Q_1`` column is renamed to
    ``ksathorfrac_<value>``.

    Returns:
        A dataframe holding the per-value dataframes side by side.

    Raises:
        IndexError: If the output file of a calibration value is not found.
        ValueError: If ``calibration_values`` is empty (nothing to concat).
    """

    def load_run(value):
        # glob returns an empty list for a missing run, so [0] fails loudly
        run_file = glob(f'{MODELS}/{basin_id}/ksathorfrac_{value}/output.csv')[0]
        run = pd.read_csv(run_file, parse_dates=True, index_col='time')

        # Keep the calibration period only (drops the first year)
        in_period = (run.index >= start_date) & (run.index <= end_date)
        run = run.loc[in_period]

        # Label the column after the calibration value it belongs to
        return run.rename(columns={'Q_1': f'ksathorfrac_{value}'})

    runs = [load_run(value) for value in calibration_values]

    # Place all runs next to each other, aligned on the time index
    return pd.concat(runs, axis=1, ignore_index=False)


def get_observations(basin_id, start_date, end_date):
    """Load the observation timeseries of one basin for the given period.

    The first file in ``OBSDIR`` matching ``*_<basin_id>_*.csv`` is read with
    its ``date`` column as index; only rows between ``start_date`` and
    ``end_date`` (both inclusive) are returned.

    Raises:
        IndexError: If no observation file matches the basin ID.
    """
    # Locate the observation file of this basin
    matches = glob(f'{OBSDIR}/*_{basin_id}_*.csv')
    observations = pd.read_csv(matches[0], parse_dates=True, index_col='date')

    # Keep the calibration period only (drops the first year)
    dates = observations.index
    in_period = (dates >= start_date) & (dates <= end_date)

    return observations.loc[in_period]
    
def calculate_objective_functions(basin_id, df_sim, df_obs, calibration_values):
    """Score every calibration value of a basin against the observations.

    For each calibration value the column ``ksathorfrac_<value>`` of
    ``df_sim`` is evaluated against ``df_obs.discharge_vol`` with the
    hydroeval objective functions ``nse``, ``kge``, ``kgeprime`` and
    ``kgenp``. All scores are rounded to 4 decimals.

    Returns:
        A dataframe with one row per calibration value and the columns
        ``basin_id``, ``ksathorfrac``, ``nse``, ``kge_2009``, ``kge_2012``,
        ``kge_np``, ``kge_np_r``, ``kge_np_alpha`` and ``kge_np_beta``.
    """
    # One list per output column, filled row by row
    scores = {
        'basin_id': [],
        'ksathorfrac': [],
        'nse': [],
        'kge_2009': [],
        'kge_2012': [],
        'kge_np': [],
        'kge_np_r': [],
        'kge_np_alpha': [],
        'kge_np_beta': [],
    }

    for value in calibration_values:
        simulated = df_sim[f'ksathorfrac_{value}']

        scores['basin_id'].append(basin_id)
        scores['ksathorfrac'].append(value)

        # Nash-Sutcliffe efficiency
        nse = hydroeval.evaluator(hydroeval.nse, simulated, df_obs.discharge_vol, axis=1)
        scores['nse'].append(np.round(nse[0], 4))

        # Kling-Gupta efficiency variants: only the first entry is kept
        kge_2009 = hydroeval.evaluator(hydroeval.kge, simulated, df_obs.discharge_vol, axis=1)
        scores['kge_2009'].append(np.round(kge_2009[0][0], 4))

        kge_2012 = hydroeval.evaluator(hydroeval.kgeprime, simulated, df_obs.discharge_vol, axis=1)
        scores['kge_2012'].append(np.round(kge_2012[0][0], 4))

        # Non-parametric KGE: entry 0 is stored as the score itself,
        # entries 1-3 as its r, alpha and beta columns
        kge_np = hydroeval.evaluator(hydroeval.kgenp, simulated, df_obs.discharge_vol, axis=1)
        for position, column in enumerate(('kge_np', 'kge_np_r', 'kge_np_alpha', 'kge_np_beta')):
            scores[column].append(np.round(kge_np[0][position], 4))

    return pd.DataFrame(scores)

# Check if output exists




In [None]:
basins = []
exists = []

for basin_id in basin_ids:
    # Only the ksathorfrac_5 run is inspected for each basin
    sim_file = Path(f'{MODELS}/{basin_id}/ksathorfrac_5/output.csv')

    # A run counts as completed when its output file is present and the
    # csv contains at least 3200 rows (the file is only read if it exists)
    completed = sim_file.is_file() and len(pd.read_csv(sim_file)) >= 3200

    basins.append(basin_id)
    exists.append(completed)

# One row per basin with its completion flag
df = pd.DataFrame({'basin_id': basins, 'completed': exists}).reset_index()

# Continue with the completed basins only
df = df[df['completed'].eq(True)]
basin_ids = df['basin_id'].to_list()

In [None]:
# Display the number of basin IDs currently held in basin_ids
len(basin_ids)

# Calculate objective functions

In [None]:
# Water years inside the calibration period: a water year runs from
# October 1st of `year` to September 30th of `year + 1`. The list does not
# depend on the basin, so it is built once.
years = list(range(int(start_date[:4]), int(end_date[:4])))

# For every basin: store the simulated and observed timeseries, score each
# calibration value per water year and store the scores averaged over the years
for basin_id in basin_ids:
    print(basin_id)

    df_sim = get_simulations(basin_id, calibration_values, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_sim.to_csv(f'{OUTPUT}/simulations/{basin_id}_wflow_calibration_simulations.csv')
    # NOTE(review): index=False writes the observations without their date
    # index, unlike the simulations above - confirm this is intended
    df_obs.to_csv(f'{OUTPUT}/observations/{basin_id}_wflow_calibration_observations.csv', index=False)

    objective_dfs = []
    for year in years:
        start_year = f'{year}-10-01'
        end_year = f'{year+1}-09-30'

        # Select the water year. Each frame is filtered on its OWN index:
        # reusing the simulation mask for the observations misaligns the two
        # (or raises) as soon as their indexes differ.
        sim_mask = (df_sim.index >= start_year) & (df_sim.index <= end_year)
        obs_mask = (df_obs.index >= start_year) & (df_obs.index <= end_year)
        df_sim_year = df_sim.loc[sim_mask]
        df_obs_year = df_obs.loc[obs_mask]

        # Calculate objective functions for this water year
        df_objective = calculate_objective_functions(basin_id, df_sim_year, df_obs_year, calibration_values)
        objective_dfs.append(df_objective)

    # Average the objective values over the water years. Every yearly frame
    # has one row per calibration value in the same order, so stacking them
    # and grouping on the row label averages each calibration value over
    # all years. The non-numeric basin_id column is left out of the mean and
    # re-added below; columns are sorted by name.
    df = (
        pd.concat(objective_dfs)
        .drop(columns='basin_id')
        .groupby(level=0)
        .mean()
        .sort_index(axis=1)
    )
    df = df.sort_values('kge_np', ascending=False)
    df['basin_id'] = [basin_id] * len(df)
    df.to_csv(f'{OUTPUT}/objective_functions/{basin_id}_wflow_calibration_objective_functions.csv', index=False)

# Create overview dataframe

## Select the lowest ksathorfrac value within 0.02 of the maximum KGE-NP

In [None]:
# Basin IDs that return nan values are left out of the overview. They are
# filtered out rather than removed one by one, so the cell can be re-run and
# does not fail when one of them is not in the list.
nan_basin_ids = {'18017', '18018', '54038', '76011'}
basin_ids = [basin_id for basin_id in basin_ids if basin_id not in nan_basin_ids]

# Columns of the overview, in output order
overview_columns = [
    'basin_id', 'ksathorfrac',
    'kge_np', 'kge_np_r', 'kge_np_alpha', 'kge_np_beta',
    'kge_2009', 'kge_2012', 'nse',
]

records = []
for basin_id in basin_ids:
    print(basin_id)

    # Read the objective functions of this basin (one row per ksathorfrac)
    objective_file = f"{OUTPUT}/objective_functions/{basin_id}_wflow_calibration_objective_functions.csv"
    df = pd.read_csv(objective_file)

    # Candidates: all rows with a KGE-NP within 0.02 of the maximum
    # (rows with a nan KGE-NP never satisfy the comparison)
    candidates = df.loc[df.kge_np >= df.kge_np.max() - 0.02]

    # Select the lowest ksathorfrac among the candidates. The selection must
    # be positional (iloc): a label-based `.loc[0]` returns the row labelled 0
    # regardless of the sort order, not the first row after sorting.
    best = candidates.sort_values(by='ksathorfrac').iloc[0]

    records.append({
        'basin_id': int(best['basin_id']),
        'ksathorfrac': int(best['ksathorfrac']),
        'kge_np': best['kge_np'],
        'kge_np_r': best['kge_np_r'],
        'kge_np_alpha': best['kge_np_alpha'],
        'kge_np_beta': best['kge_np_beta'],
        'kge_2009': best['kge_2009'],
        'kge_2012': best['kge_2012'],
        'nse': best['nse'],
    })

# Create output dataframe (one row per basin)
df_out = pd.DataFrame(records, columns=overview_columns)

# Write output
df_out.to_csv(f'{ROOT}/results/wflow_sbm/wflow_calibration_objective_function_overview.csv')