# Wflow_SBM Uncalibrated, evaluation period

# Select simulations that match flow categories that are based on observation percentiles
## Calculate objective functions per category

In [None]:
import hydroeval
import numpy as np
import pandas as pd


from pathlib import Path
from glob import glob

## Set Paths

In [None]:
# Set Paths
# Every location below is derived from the project root directory.
ROOT = Path("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/")
MODELS = ROOT / "wflow/data/"
AUXDATA = ROOT / "aux_data"
OBSDIR = AUXDATA / "CAMELS-GB/data/timeseries/"
OUTPUT = ROOT / "results/wflow_sbm/evaluation_period_uncalibrated/"

# Set uncertainty estimate file (kept as a plain string, not a Path)
uncertainty_file = f"{AUXDATA}/CAMELS-GB/data/CAMELS_GB_hydrometry_attributes.csv"

## Set Config

In [None]:
# Get available basin IDs
# The gauge_id column becomes the index; its values are used as the basin IDs.
df_ids = pd.read_csv(
    f"{AUXDATA}/CAMELS-GB/data/CAMELS_GB_topographic_attributes.csv",
    index_col="gauge_id",
)
basin_ids = df_ids.index.tolist()

# Period (drop first year)
start_date, end_date = "2008-10-01", "2015-09-30"

## Select available results

In [None]:
# Determine which basins have a usable evaluation run.
# A run counts as completed when its output.csv exists and contains at
# least one data row; only those basins are kept in `basin_ids`.
basins = []
exists = []

for basin_id in basin_ids:
    basins.append(basin_id)

    # Check if file exists
    sim_file = Path(f'{MODELS}/{basin_id}/evaluation_ksathorfac_100/output.csv')
    if not sim_file.is_file():
        exists.append(False)
        continue

    # Check if csv contains output; a single row is enough to decide,
    # so the rest of the file is not parsed.
    try:
        df_sim = pd.read_csv(sim_file, nrows=1)
    except pd.errors.EmptyDataError:
        # File without any content (not even a header): not completed.
        exists.append(False)
        continue

    exists.append(len(df_sim) > 0)

df = pd.DataFrame({'basin_id': basins, 'completed': exists})
df = df.reset_index()
# astype(bool) keeps the mask boolean even when no basin was scanned
df = df[df['completed'].astype(bool)]

basin_ids = df.basin_id.to_list()

## Define Functions

In [None]:
def calculate_objective_functions(basin_id, df_sim, df_obs):
    """Score a simulation against observations and return the scores as one row.

    The ``discharge_vol`` column of ``df_obs`` is joined with ``df_sim`` on
    the index, and rows containing a NaN in any column are dropped before
    scoring. NSE, KGE (2009), KGE' (2012) and the non-parametric KGE are
    computed with ``hydroeval.evaluator`` and rounded to 4 decimals.

    Parameters
    ----------
    basin_id
        Identifier stored in the ``basin_id`` column of the result.
    df_sim
        Frame that must provide an ``evaluation`` column once joined.
    df_obs
        Frame with a ``discharge_vol`` column.

    Returns
    -------
    pandas.DataFrame
        Single-row frame with the columns ``basin_id``, ``nse``,
        ``kge_2009``, ``kge_2012``, ``kge_np``, ``kge_np_r``,
        ``kge_np_alpha`` and ``kge_np_beta``.
    """
    # Combine obs and sim so that NaN rows are removed from both at once
    paired = df_obs.discharge_vol.to_frame().join(df_sim).dropna()
    simulated = paired['evaluation']
    observed = paired.discharge_vol

    def _evaluate(metric):
        # Same evaluator call for every metric; only the metric differs.
        return hydroeval.evaluator(metric, simulated, observed, axis=1)

    nse_result = _evaluate(hydroeval.nse)
    kge_2009_result = _evaluate(hydroeval.kge)
    kge_2012_result = _evaluate(hydroeval.kgeprime)
    kge_np_result = _evaluate(hydroeval.kgenp)

    # NOTE(review): the [0][k] indexing assumes the first entry of a KGE
    # result holds (value, r, alpha, beta) in that order - confirm against
    # the installed hydroeval version.
    kge_np_row = kge_np_result[0]

    return pd.DataFrame({
        'basin_id': [basin_id],
        'nse': [np.round(nse_result[0], 4)],
        'kge_2009': [np.round(kge_2009_result[0][0], 4)],
        'kge_2012': [np.round(kge_2012_result[0][0], 4)],
        'kge_np': [np.round(kge_np_row[0], 4)],
        'kge_np_r': [np.round(kge_np_row[1], 4)],
        'kge_np_alpha': [np.round(kge_np_row[2], 4)],
        'kge_np_beta': [np.round(kge_np_row[3], 4)],
    })

## Load uncertainty estimates and drop nan values

In [None]:
# load uncertainty file and drop nan
df_uncertainty = pd.read_csv(uncertainty_file, index_col='gauge_id')

# Keep only the rows in which all four q5/q95 uncertainty bound columns are filled
df_uncertainty = df_uncertainty.dropna(
    subset=[
        'q5_uncert_upper',
        'q5_uncert_lower',
        'q95_uncert_upper',
        'q95_uncert_lower',
    ]
)

## Calculate percentiles based on the observation timeseries
### Select sim based on obs percentiles
### Calculate objective functions per percentile

In [None]:
# Set flow categories based on percentiles
# Each category name maps to its (lower, upper) percentile pair.
flow_categories = {
    'low_flow': (5, 25),
    'mean_flow': (25, 75),
    'high_flow': (75, 95),
}

In [None]:
# For every basin and flow category: select the observations that fall
# between the category's percentiles, export them together with the matching
# simulated values, score every water year separately and store the mean
# of the yearly scores.

# Water years in the evaluation period; each one runs from 1 October of
# `year` up to and including 30 September of `year + 1`.
years = list(range(int(start_date[:4]), int(end_date[:4])))

# Make sure the export folders exist before anything is written to them
for subfolder in ('flow_categories', 'observations', 'objective_functions'):
    Path(f'{OUTPUT}/{subfolder}').mkdir(parents=True, exist_ok=True)

for basin_id in basin_ids:
    print(basin_id)
    # Get model simulation timeseries
    df_model = pd.read_csv(f"{OUTPUT}/simulations/{basin_id}_wflow_uncalibrated_evaluation_simulations.csv", index_col='time')

    # Get observation timeseries
    df_obs = pd.read_csv(f'{OBSDIR}/CAMELS_GB_hydromet_timeseries_{basin_id}_19701001-20150930.csv', index_col='date')

    # Select evaluation period
    # NOTE(review): the index is not parsed as dates, so this compares text;
    # presumably the files use ISO YYYY-MM-DD strings - confirm.
    mask = (df_obs.index >= start_date) & (df_obs.index <= end_date)
    df_obs = df_obs.loc[mask]

    # Drop NaN values observation timeseries
    df_obs = df_obs[df_obs['discharge_vol'].notna()]

    # Loop Flow Categories
    for category, (lower, upper) in flow_categories.items():

        # Calculate percentiles
        obs_perc_lower, obs_perc_upper = np.percentile(df_obs.discharge_vol, [lower, upper])

        # Select observations based on percentiles (both bounds inclusive)
        mask = (df_obs.discharge_vol >= obs_perc_lower) & (df_obs.discharge_vol <= obs_perc_upper)
        df_obs_selected = df_obs.loc[mask]

        # Select simulations that match observation based flow category
        df_sim_selected = df_obs_selected.join(df_model)
        df_sim_selected = df_sim_selected[['evaluation']]

        # Export selected simulations
        df_sim_selected.to_csv(f'{OUTPUT}/flow_categories/{basin_id}_wflow_uncalibrated_evaluation_simulations_{category}.csv')

        # Export selected observation
        df_obs_selected = df_obs_selected[['discharge_vol']]
        df_obs_selected.to_csv(f'{OUTPUT}/observations/{basin_id}_wflow_uncalibrated_evaluation_observations_{category}.csv')

        # Calculate objective function for each water year and take average
        objective_dfs = []
        for year in years:
            start_year = f'{year}-10-01'
            end_year = f'{year+1}-09-30'

            # Select water year
            mask_sim = (df_sim_selected.index >= start_year) & (df_sim_selected.index <= end_year)
            mask_obs = (df_obs_selected.index >= start_year) & (df_obs_selected.index <= end_year)

            df_sim_year = df_sim_selected.loc[mask_sim]
            df_obs_year = df_obs_selected.loc[mask_obs]

            # Calculate objective function
            df_objective = calculate_objective_functions(basin_id, df_sim_year, df_obs_year)
            objective_dfs.append(df_objective)

        # Merge water years objective values and take the mean value.
        # The yearly one-row frames are placed side by side, so every metric
        # name occurs once per year. The identifier column is left out of
        # the averaging and added back afterwards.
        df = pd.concat(objective_dfs, axis=1).drop(columns='basin_id')
        # DataFrame.groupby(axis=1) is deprecated; grouping the transposed
        # frame by label gives the same per-metric mean (columns come back
        # sorted by name).
        df = df.T.groupby(level=0).mean().T
        df.insert(0, 'basin_id', [basin_id] * len(df))
        df.to_csv(f'{OUTPUT}/objective_functions/{basin_id}_wflow_uncalibrated_evaluation_objective_functions_{category}.csv', index=False)

# Create overview files

In [None]:
# Combine the per-basin objective function files into one overview file
# per flow category.
for category in flow_categories:

    # The order in which glob returns matches depends on the file system;
    # sorting keeps the row order of the overview reproducible.
    files = sorted(glob(f'{OUTPUT}/objective_functions/*_wflow_uncalibrated_evaluation_objective_functions_{category}.csv'))

    if not files:
        # pd.concat would fail on an empty list with a less helpful message
        raise FileNotFoundError(
            f'No objective function files found for flow category {category!r} '
            f'in {OUTPUT}/objective_functions'
        )

    dataframes = [pd.read_csv(file) for file in files]

    # NOTE(review): the row labels of the inputs are written as an unnamed
    # first column (to_csv default); left unchanged so the layout of the
    # overview file stays the same for existing readers.
    df_out = pd.concat(dataframes)
    df_out.to_csv(f'{ROOT}/results/wflow_sbm/wflow_uncalibrated_evaluation_objective_functions_overview_{category}.csv')