# Test whether the model difference < observation uncertainty

In [None]:
import os
from glob import glob
from pathlib import Path

import numpy as np
import xarray as xr
import pandas as pd
from scipy import stats
import pylab as plot
import matplotlib.pyplot as plt
import seaborn as sns

# Set Paths

In [None]:
# Directory layout: every location is derived from the project root
ROOT = Path("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/")
AUXDATA = ROOT / "aux_data"
RESULTS = ROOT / "results"
OBSDIR = AUXDATA / "obs_flow_categories"

# Set Config

In [None]:
# Basin IDs listed in available_basin_ids_uncertainty.csv (indexed by 'basin_id')
df_basin_ids = pd.read_csv(f"{AUXDATA}/available_basin_ids_uncertainty.csv", index_col='basin_id')
basin_ids = df_basin_ids.index.to_list()

# Flow categories, each given as a (lower, upper) percentile pair
flow_categories = {
    'low_flow': (5, 25),
    'mean_flow': (25, 75),
    'high_flow': (75, 95),
}

# Keep only basins that also have a calibrated-vs-uncalibrated difference file.
# The basin ID is the integer before the first underscore of the file name.
files = glob(f"{RESULTS}/model_differences/wflow_calibrated_uncalibrated/*_model_simulation_difference_wflow_calibrated_uncalibrated_low_flow.csv")
ids = [int(file.rsplit('/', 1)[-1].partition('_')[0]) for file in files]
basin_ids = list(set(basin_ids).intersection(ids))

In [None]:
# Basin IDs listed in available_basin_ids_uncertainty.csv (indexed by 'basin_id')
df_basin_ids = pd.read_csv(f"{AUXDATA}/available_basin_ids_uncertainty.csv", index_col='basin_id')
basin_ids = df_basin_ids.index.to_list()

# Keep only basins that also have a wflow-vs-PCR-GLOBWB difference file.
# The basin ID is the integer before the first underscore of the file name.
files = glob(f'{RESULTS}/model_differences/wflow_pcr-globwb/*_model_simulation_difference_wflow_calibrated_pcr-globwb_low_flow.csv')
ids = [int(file.rsplit('/', 1)[-1].partition('_')[0]) for file in files]
basin_ids = list(set(basin_ids).intersection(ids))

## Wflow calibrated & uncalibrated

In [None]:
# Paired, one-sided t-test per basin and flow category: is the observation
# uncertainty significantly greater than the difference between the calibrated
# and uncalibrated wflow simulations?
basins = []
ksstats = []  # holds the t statistics; the name is kept so other cells keep working
pvalues = []
categories = []

difference_column = 'model_difference_wflow_calibrated_uncalibrated'

for basin_id in basin_ids:
    for category in flow_categories:
        # Load observation uncertainty in m3/s
        # NOTE(review): this reads from RESULTS although OBSDIR is defined under AUXDATA — confirm which location is intended.
        df_obs_uncertainty = pd.read_csv(f"{RESULTS}/obs_flow_categories/{category}_{basin_id}_observation_uncertainty_m3s.csv", index_col='date')
        df_model_difference = pd.read_csv(f"{RESULTS}/model_differences/wflow_calibrated_uncalibrated/{basin_id}_model_simulation_difference_wflow_calibrated_uncalibrated_{category}.csv", index_col='date')
        # Drop time steps without a model difference before pairing the samples
        df_model_difference = df_model_difference[df_model_difference[difference_column].notna()]
        # DataFrame.join is a left join by default: dates missing from the
        # uncertainty file become NaN, ttest_rel then returns NaN (default
        # nan_policy), and a NaN p-value never passes the `< 0.05` filter below.
        df = df_model_difference.join(df_obs_uncertainty)

        # alternative='greater': H1 is mean(observation uncertainty - model difference) > 0
        statistic, pvalue = stats.ttest_rel(df.observation_uncertainty.values, df[difference_column].values, alternative='greater')

        ksstats.append(statistic)
        pvalues.append(pvalue)
        basins.append(basin_id)
        categories.append(category)

df_stats = pd.DataFrame({
    'basin_id': basins,
    't_statistic': ksstats,
    'p_value': pvalues,
    'flow_category': categories,
})

# Keep only the significant basin/category combinations
df_stats = df_stats[df_stats.p_value < 0.05]
df_stats = df_stats.set_index('flow_category')

print('wflow_sbm calibrated & uncalibrated \n Number of catchments p < 0.05:') 
# Count index labels directly. `len(df_stats.loc[key])` returns the number of
# columns when exactly one row matches (a Series is returned) and raises
# KeyError when no row matches.
for flow_label, flow_key in (('low flow', 'low_flow'),
                             ('average flow', 'mean_flow'),
                             ('high flow', 'high_flow')):
    n_significant = int((df_stats.index == flow_key).sum())
    print(f'{flow_label}: {n_significant}')

## Wflow calibrated & PCR-GLOBWB

In [None]:
# Paired, one-sided t-test per basin and flow category: is the observation
# uncertainty significantly greater than the difference between the calibrated
# wflow simulation and the PCR-GLOBWB simulation?
basins = []
ksstats = []  # holds the t statistics; the name is kept so other cells keep working
pvalues = []
categories = []

difference_column = 'model_difference_wflow_calibrated_pcr-globwb'

for basin_id in basin_ids:
    for category in flow_categories:
        # Load observation uncertainty in m3/s
        # NOTE(review): this reads from RESULTS although OBSDIR is defined under AUXDATA — confirm which location is intended.
        df_obs_uncertainty = pd.read_csv(f"{RESULTS}/obs_flow_categories/{category}_{basin_id}_observation_uncertainty_m3s.csv", index_col='date')
        df_model_difference = pd.read_csv(f'{RESULTS}/model_differences/wflow_pcr-globwb/{basin_id}_model_simulation_difference_wflow_calibrated_pcr-globwb_{category}.csv')
        # The first column of the difference file is used as the index, whatever its header is
        df_model_difference = df_model_difference.set_index(df_model_difference.columns[0])
        # Drop time steps without a model difference before pairing the samples
        df_model_difference = df_model_difference[df_model_difference[difference_column].notna()]
        # DataFrame.join is a left join by default: dates missing from the
        # uncertainty file become NaN, ttest_rel then returns NaN (default
        # nan_policy), and a NaN p-value never passes the `< 0.05` filter below.
        # rsuffix renames any uncertainty column that clashes with a model column.
        df = df_model_difference.join(df_obs_uncertainty, rsuffix='_unc')

        # alternative='greater': H1 is mean(observation uncertainty - model difference) > 0
        statistic, pvalue = stats.ttest_rel(df.observation_uncertainty.values, df[difference_column].values, alternative='greater')

        ksstats.append(statistic)
        pvalues.append(pvalue)
        basins.append(basin_id)
        categories.append(category)

df_stats = pd.DataFrame({
    'basin_id': basins,
    't_statistic': ksstats,
    'p_value': pvalues,
    'flow_category': categories,
})

# Keep only the significant basin/category combinations
df_stats = df_stats[df_stats.p_value < 0.05]
df_stats = df_stats.set_index('flow_category')

print('wflow_sbm calibrated & PCR-GLOBWB \n Number of catchments p < 0.05:') 
# Count index labels directly. `len(df_stats.loc[key])` returns the number of
# columns when exactly one row matches (a Series is returned) and raises
# KeyError when no row matches.
for flow_label, flow_key in (('low flow', 'low_flow'),
                             ('average flow', 'mean_flow'),
                             ('high flow', 'high_flow')):
    n_significant = int((df_stats.index == flow_key).sum())
    print(f'{flow_label}: {n_significant}')