# Streamflow analyses PCR-GLOBWB

In [1]:
import os
import hydroeval
import xarray as xr
import pandas as pd
import numpy as np

from pathlib import Path
from glob import glob

# Set Paths

In [2]:
# Directory layout; every location below is derived from ROOT.
ROOT = Path('/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/')
MODELS = ROOT / 'pcr-globwb'
AUXDATA = ROOT / 'aux_data'
OBSDIR = AUXDATA / 'CAMELS-GB' / 'data' / 'timeseries'
OUTPUT = ROOT / 'results' / 'pcr-globwb' / 'evaluation'

# UK CloneMap Run
## Config

In [3]:
# Basin IDs are the names of the entries found directly under MODELS.
basin_dirs = glob(f'{MODELS}/*')
basin_ids = sorted(entry.split('/')[-1] for entry in basin_dirs)
# 'uk' holds the model run output read by the simulation loaders; it is not a basin.
basin_ids.remove('uk')

# Evaluation window (the first year is dropped)
start_date, end_date = '2008-01-01', '2015-09-30'

In [6]:
# Basins excluded from the analysis (the reason is not recorded in this notebook).
# list.remove raises ValueError if an ID is absent, e.g. when this cell is re-run.
for excluded_id in ('18017', '18018'):
    basin_ids.remove(excluded_id)

# Further exclusion candidates, currently kept in the analysis:
# basin_ids.remove('21006')
# basin_ids.remove('28117')
# basin_ids.remove('39127')
# basin_ids.remove('41013')
# basin_ids.remove('41023')
# basin_ids.remove('46005')
# basin_ids.remove('47018')
# basin_ids.remove('54003')
# basin_ids.remove('54028')
# basin_ids.remove('54034')
# basin_ids.remove('54038')
# basin_ids.remove('54060')
# basin_ids.remove('67010')
# basin_ids.remove('74006')
# basin_ids.remove('76011')
# basin_ids.remove('80136')
# basin_ids =  basin_ids[565:]

## Retrieve data functions

In [7]:
def get_station_location(basin_id):
    """Return the gauge coordinates of a basin as a ``(lat, lon)`` tuple.

    The values come from the ``gauge_lat`` and ``gauge_lon`` columns of the
    CAMELS-GB topographic attributes table, which is indexed by ``gauge_id``.
    ``basin_id`` is cast to ``int`` for the lookup; the table is re-read from
    disk on every call.
    """
    attributes = pd.read_csv(
        f"{AUXDATA}/CAMELS-GB/data/CAMELS_GB_topographic_attributes.csv",
        index_col='gauge_id',
    )
    station = attributes.loc[int(basin_id)]
    return (station.gauge_lat, station.gauge_lon)

def get_observations(basin_id, start_date, end_date):
    """Load the observation time series of a basin, limited to the study window.

    The first file in OBSDIR matching ``*_{basin_id}_*.csv`` is read with its
    ``date`` column as the index (an IndexError is raised when nothing
    matches). Rows are kept when ``start_date < date <= end_date``; note that
    ``start_date`` itself is excluded.
    """
    matches = glob(f'{OBSDIR}/*_{basin_id}_*.csv')
    observations = pd.read_csv(matches[0], parse_dates=True, index_col='date')

    in_window = (observations.index > start_date) & (observations.index <= end_date)
    return observations.loc[in_window]

def get_simulations(basin_id, start_date, end_date):
    """Return simulated discharge at the grid cell nearest to the basin's gauge.

    Reads the ``discharge`` variable of the 'uk' run output, selects the cell
    closest to the gauge coordinates and keeps the rows with
    ``start_date < time <= end_date`` (``start_date`` itself is excluded).
    The result is a DataFrame with a single column named ``sim``.
    """
    dataset = xr.open_dataset(f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc")
    station_lat, station_lon = get_station_location(basin_id)

    # Nearest-neighbour lookup of the gauge position on the model grid
    cell_series = dataset.discharge.sel(lat=station_lat, lon=station_lon, method='nearest')
    frame = cell_series.to_dataframe()

    in_window = (frame.index > start_date) & (frame.index <= end_date)
    return (
        frame.loc[in_window]
        .drop(columns=['lat', 'lon'])
        .rename(columns={'discharge': 'sim'})
    )

# Adjust station location to river network

In [8]:
def get_adjusted_station_location_simulations(basin_id, start_date, end_date):
    """Return simulated discharge from the cell near the gauge that best matches observations.

    A square window of +/- ``buffer`` degrees around the gauge is cut out of the
    'uk' run output. Within that window the cell whose simulated discharge is
    closest to the mean observed ``discharge_vol`` (observations limited to
    ``start_date < date <= end_date``) is selected, and its full series is
    returned for ``start_date < time <= end_date``.

    Returns a DataFrame indexed by ``time`` with a single column ``sim``.
    Also prints the number of candidate latitudes as a side effect.
    """

    # Get station_location
    station_lat, station_lon = get_station_location(basin_id)

    # Half-width of the search window in degrees (described by the author as a
    # "4 pixel buffer" -- TODO confirm against the model grid resolution)
    buffer = 0.0083333
    min_lat = station_lat-buffer
    max_lat = station_lat+buffer
    min_lon = station_lon-buffer
    max_lon = station_lon+buffer

    # Load simulation file
    sim_file = f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc"
    ds = xr.open_dataset(sim_file)
    # The latitude slice runs from max to min, which presumably matches a
    # descending latitude coordinate in the file -- TODO confirm
    da = ds.sel(lat=slice(max_lat,min_lat), lon=slice(min_lon,max_lon)).discharge

    # Load observation file
    df_obs = get_observations(basin_id, start_date, end_date)

    # Absolute difference between every simulated value in the window and the
    # mean observed discharge. No reduction over time is applied here and the
    # simulation is not yet restricted to the evaluation period.
    da = abs(da - df_obs.discharge_vol.mean())
    # Keep only the entries equal to the overall minimum difference. da.min()
    # is called without a dimension, so the minimum is taken over all time
    # steps and cells together.
    # NOTE(review): this picks the cell containing the single closest time
    # step; confirm whether comparing the temporal mean per cell was intended.
    da_max = da.where(da==da.min(), drop=True).squeeze()

    # Select the best-matching pixel (smallest difference to the observed
    # mean, despite the "da_max" name). count_nonzero on the latitude values
    # is used as the number of candidates; a value above 1 means a tie.
    print(np.count_nonzero(da_max.lat.values))
    if np.count_nonzero(da_max.lat.values) > 1:
         # Tie: take the first latitude and the first longitude.
         # NOTE(review): if the tied cells share one longitude, squeeze() may
         # leave lon as a 0-d value and [0] would fail -- confirm.
         df_sim = ds.discharge.sel(lat=da_max.lat.values[0], lon=da_max.lon.values[0]).to_dataframe()
    else:
        df_sim = ds.discharge.sel(lat=da_max.lat.values, lon=da_max.lon.values).to_dataframe()

    # Select calibration period (drop first year); start_date itself is excluded
    df_sim = df_sim.reset_index()
    df_sim = df_sim.set_index('time')
    mask = (df_sim.index > start_date) & (df_sim.index <= end_date)
    df_sim = df_sim.loc[mask]

    # Drop the coordinate columns and rename the discharge column to 'sim'
    df_sim = df_sim.drop(columns=['lat','lon'])
    df_sim = df_sim.rename(columns={'discharge': f'sim'})
    
    return df_sim

## Calculate objective functions

In [9]:
def calculate_objective_functions(basin_id, df_sim, df_obs):
    """Score simulated against observed discharge and return a one-row table.

    ``df_obs`` must provide a ``discharge_vol`` column and ``df_sim`` a ``sim``
    column. The simulations are left-joined onto the observation index before
    scoring. Each score is rounded to 4 decimals.

    Columns of the returned DataFrame: ``basin_id``, ``nse`` (hydroeval.nse),
    ``kge_2009`` (hydroeval.kge), ``kge_2012`` (hydroeval.kgeprime), ``kge_np``
    (hydroeval.kgenp) and the three further kgenp outputs stored as
    ``kge_np_r``, ``kge_np_alpha`` and ``kge_np_beta``.
    """
    # Pair both series on the observation dates
    paired = df_obs.discharge_vol.to_frame().join(df_sim)
    simulated = paired['sim']
    observed = paired.discharge_vol

    def score(metric):
        return hydroeval.evaluator(metric, simulated, observed, axis=1)

    nse = np.round(score(hydroeval.nse)[0], 4)
    kge_2009 = np.round(score(hydroeval.kge)[0][0], 4)
    kge_2012 = np.round(score(hydroeval.kgeprime)[0][0], 4)

    # The first row of the kgenp result carries the score followed by its components
    kge_np_row = score(hydroeval.kgenp)[0]
    kge_np_value = np.round(kge_np_row[0], 4)
    kge_np_r = np.round(kge_np_row[1], 4)
    kge_np_alpha = np.round(kge_np_row[2], 4)
    kge_np_beta = np.round(kge_np_row[3], 4)

    return pd.DataFrame({
        'basin_id': [basin_id],
        'nse': [nse],
        'kge_2009': [kge_2009],
        'kge_2012': [kge_2012],
        'kge_np': [kge_np_value],
        'kge_np_r': [kge_np_r],
        'kge_np_alpha': [kge_np_alpha],
        'kge_np_beta': [kge_np_beta],
    })

# Streamflow analyses with adjusted station location

In [156]:
# For every basin: export the simulated and observed series, score each water
# year separately and store the mean of the yearly scores.
for basin_id in basin_ids:
    print(basin_id)

    df_sim = get_adjusted_station_location_simulations(basin_id, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_sim.to_csv(f'{OUTPUT}/{basin_id}_evaluation_simulations_adjusted_location_4px.csv')
    # NOTE(review): index=False drops the date index from the exported
    # observations, unlike the simulations above -- confirm this is intended.
    df_obs.to_csv(f'{OUTPUT}/{basin_id}_evaluation_observations.csv', index=False)

    # Water years run from 1 October to 30 September; one per calendar year
    # from the start year up to (not including) the end year.
    years = list(range(int(start_date[:4]), int(end_date[:4])))

    objective_dfs = []
    for year in years:
        start_year = f'{year}-10-01'
        end_year = f'{year+1}-09-30'

        # Select water year (both bounds inclusive)
        mask_sim = (df_sim.index >= start_year) & (df_sim.index <= end_year)
        mask_obs = (df_obs.index >= start_year) & (df_obs.index <= end_year)

        df_sim_year = df_sim.loc[mask_sim]
        df_obs_year = df_obs.loc[mask_obs]

        # Calculate objective function
        df_objective = calculate_objective_functions(basin_id, df_sim_year, df_obs_year)
        objective_dfs.append(df_objective)

    # Stack the yearly one-row tables and average each metric over the years
    # (NaN scores are skipped). The string basin_id column is left out of the
    # mean and re-attached afterwards. This replaces a column-wise
    # groupby(axis=1), which is deprecated and cannot average string columns.
    df_years = pd.concat(objective_dfs, ignore_index=True)
    df = df_years.drop(columns='basin_id').mean().sort_index().to_frame().T
    df['basin_id'] = basin_id
    df.to_csv(f'{OUTPUT}/{basin_id}_evaluation_objective_functions.csv', index=False)

74006
2


  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_mean)
  simulations / (simulations.shape[0] * np.mean(simulations, axis=0,
  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_mean)
  simulations / (simulations.shape[0] * np.mean(simulations, axis=0,
  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_mean)
  simulations / (simulations.shape[0] * np.mean(simulations, axis=0,
  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_mean)
  simulations / (simulations.shape[0] * np.mean(simulations, axis=0,
  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_mean)
  simulations / (simulations.shape[0] * np.mean(simulations, axis=0,
  r = r_num / r_den
  r = r_num / r_den
  gamma = ((np.std(simulations, axis=0, dtype=np.float64) / sim_m

# Create overview dataframe

In [265]:
# Merge the per-basin objective-function tables into one overview file.
# NOTE(review): the evaluation loops in this notebook write
# '<basin_id>_evaluation_objective_functions.csv', without the
# '_adjusted_location_4px' suffix matched here -- confirm which name is intended.
# Sorted so that the row order of the overview does not depend on glob order.
files = sorted(glob(f'{OUTPUT}/*_evaluation_objective_functions_adjusted_location_4px.csv'))

if not files:
    # pd.concat would otherwise fail with an uninformative "No objects to concatenate"
    raise FileNotFoundError(
        f'No objective function files matching '
        f'*_evaluation_objective_functions_adjusted_location_4px.csv in {OUTPUT}'
    )

dataframes = [pd.read_csv(file) for file in files]

df_out = pd.concat(dataframes)
df_out.to_csv(f'{ROOT}/results/pcr-globwb/evaluation_overview_pcrglobwb_adjusted_loc.csv')

# Gumboot prep

In [266]:
# Write one combined sim/obs table per basin, indexed by 'date', under the
# '_gumboot' file suffix.
for i, basin_id in enumerate(basin_ids):
    # Progress counter, overwritten in place
    print(i, end='\r')

    # Simulations exported earlier, re-indexed by their parsed 'time' column
    df_sim = (
        pd.read_csv(f"{OUTPUT}/{basin_id}_evaluation_simulations_adjusted_location_4px.csv")
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .set_index('time')
    )

    df_obs = get_observations(basin_id, start_date, end_date)

    # Attach the observed discharge to the simulation dates and rename the columns
    df_eval = (
        df_sim.join(df_obs.discharge_vol)
        .reset_index()
        .rename(columns={'time':'date', 'discharge_vol':'obs'})
    )
    # Fallback for an unnamed index, which reset_index exposes as 'index'
    if df_eval.columns[0] == 'index':
        df_eval = df_eval.rename(columns={'index':'date'})

    df_eval = df_eval.set_index('date')
    df_eval.to_csv(f'{OUTPUT}/{basin_id}_evaluation_simulations_adjusted_location_4px_gumboot.csv')

655

# Calculate for flow categories

In [None]:
def get_flow_category_simulations(basin_id, flow_category, start_date, end_date):
    """Load the model-difference series of a basin for one flow category.

    Reads ``{ROOT}/results/categories/{flow_category}/{basin_id}_model_difference.csv``
    with its ``time`` column as the index, keeps the rows with
    ``start_date < time <= end_date`` (``start_date`` itself is excluded) and
    renames the ``Q_1`` column to ``evaluation``.

    Raises IndexError when no matching file exists.
    """
    # Set simulation file. The path uses the flow_category argument; the
    # previous code referenced an undefined name 'category' here.
    sim_file = glob(f'{ROOT}/results/categories/{flow_category}/{basin_id}_model_difference.csv')[0]

    # Load simulation dataframe
    df = pd.read_csv(sim_file, parse_dates=True, index_col='time')

    # Select calibration period (drop first year)
    mask = (df.index > start_date) & (df.index <= end_date)
    df = df.loc[mask]

    # Rename column
    df = df.rename(columns={'Q_1': 'evaluation'})

    return df


def get_flow_category_observations(basin_id, start_date, end_date):
    """Load the observation time series of a basin, limited to the study window.

    Takes the first file in OBSDIR matching ``*_{basin_id}_*.csv`` (IndexError
    when there is none), indexes it by its ``date`` column and returns the
    rows with ``start_date < date <= end_date``.
    """
    candidates = glob(f'{OBSDIR}/*_{basin_id}_*.csv')
    obs_table = pd.read_csv(candidates[0], parse_dates=True, index_col='date')

    # Lower bound exclusive, upper bound inclusive
    keep = (obs_table.index > start_date) & (obs_table.index <= end_date)
    return obs_table.loc[keep]


def get_simulations(basin_id, start_date, end_date):
    """Return simulated discharge at the grid cell nearest to the basin's gauge.

    Reads the ``discharge`` variable of the 'uk' run output, selects the cell
    closest to the gauge coordinates and keeps the rows with
    ``start_date < time <= end_date`` (``start_date`` itself is excluded).
    The result is a DataFrame with a single column named ``sim``.
    """
    dataset = xr.open_dataset(f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc")
    station_lat, station_lon = get_station_location(basin_id)

    # Nearest-neighbour lookup of the gauge position on the model grid
    cell_series = dataset.discharge.sel(lat=station_lat, lon=station_lon, method='nearest')
    frame = cell_series.to_dataframe()

    in_window = (frame.index > start_date) & (frame.index <= end_date)
    return (
        frame.loc[in_window]
        .drop(columns=['lat', 'lon'])
        .rename(columns={'discharge': 'sim'})
    )

In [None]:
# For every basin: export the simulated and observed series, score each water
# year separately and store the mean of the yearly scores.
for basin_id in basin_ids:
    print(basin_id)

    df_sim = get_adjusted_station_location_simulations(basin_id, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_sim.to_csv(f'{OUTPUT}/{basin_id}_evaluation_simulations_adjusted_location_4px.csv')
    # NOTE(review): index=False drops the date index from the exported
    # observations, unlike the simulations above -- confirm this is intended.
    df_obs.to_csv(f'{OUTPUT}/{basin_id}_evaluation_observations.csv', index=False)

    # Water years run from 1 October to 30 September; one per calendar year
    # from the start year up to (not including) the end year.
    years = list(range(int(start_date[:4]), int(end_date[:4])))

    objective_dfs = []
    for year in years:
        start_year = f'{year}-10-01'
        end_year = f'{year+1}-09-30'

        # Select water year (both bounds inclusive)
        mask_sim = (df_sim.index >= start_year) & (df_sim.index <= end_year)
        mask_obs = (df_obs.index >= start_year) & (df_obs.index <= end_year)

        df_sim_year = df_sim.loc[mask_sim]
        df_obs_year = df_obs.loc[mask_obs]

        # Calculate objective function
        df_objective = calculate_objective_functions(basin_id, df_sim_year, df_obs_year)
        objective_dfs.append(df_objective)

    # Stack the yearly one-row tables and average each metric over the years
    # (NaN scores are skipped). The string basin_id column is left out of the
    # mean and re-attached afterwards. This replaces a column-wise
    # groupby(axis=1), which is deprecated and cannot average string columns.
    df_years = pd.concat(objective_dfs, ignore_index=True)
    df = df_years.drop(columns='basin_id').mean().sort_index().to_frame().T
    df['basin_id'] = basin_id
    df.to_csv(f'{OUTPUT}/{basin_id}_evaluation_objective_functions.csv', index=False)