# Streamflow analyses PCR-GLOBWB

In [1]:
import os
import hydroeval
import xarray as xr
import pandas as pd
import numpy as np

from pathlib import Path
from glob import glob

# Set Paths

In [2]:
# Project directory layout; every location is derived from ROOT
ROOT = Path('/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/')
MODELS = ROOT / 'pcr-globwb'
AUXDATA = ROOT / 'aux_data'
OBSDIR = AUXDATA / 'CAMELS-GB' / 'data' / 'timeseries'
OUTPUT = ROOT / 'results' / 'pcr-globwb' / 'evaluation'

# UK CloneMap Run
## Config

In [3]:
# Get available basin IDs: every entry directly below MODELS is treated as a
# basin ID, except the 'uk' entry.
# - os.path.basename replaces the manual split on '/'.
# - 'uk' is filtered out instead of list.remove('uk'), which raises ValueError
#   when no 'uk' entry exists.
basin_dirs = glob(f'{MODELS}/*')
basin_ids = sorted(
    name
    for name in (os.path.basename(basin_dir) for basin_dir in basin_dirs)
    if name != 'uk'
)
# Note: the IDs are strings, so the order is lexicographic ('10003' < '1001').

# Time period (drop first year)
start_date = '2008-01-01'
end_date   = '2015-09-30'
# Temp REMOVE!!!
# TODO: temporary override of the end date set just above; delete the next
# line to evaluate up to 2015-09-30 again.
end_date   = '2013-12-31'

## Retrieve data functions

In [4]:
def get_station_location(basin_id):
    """Return the gauge coordinates of a basin as a ``(lat, lon)`` tuple.

    The coordinates are taken from the ``gauge_lat`` and ``gauge_lon`` columns
    of the CAMELS-GB topographic attributes table, which is indexed by
    ``gauge_id``. The table is read from disk on every call.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier; converted with ``int()`` before the lookup.

    Returns
    -------
    tuple
        ``(gauge_lat, gauge_lon)`` of the requested gauge.
    """
    attributes = pd.read_csv(
        f"{AUXDATA}/CAMELS-GB/data/CAMELS_GB_topographic_attributes.csv",
        index_col='gauge_id',
    )
    gauge = attributes.loc[int(basin_id)]
    return (gauge.gauge_lat, gauge.gauge_lon)

def get_observations(basin_id, start_date, end_date):
    """Load the observation time series of one basin for the requested period.

    The observation file is searched in ``OBSDIR`` with the pattern
    ``*_{basin_id}_*.csv`` and read with its ``date`` column as datetime index.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier as it appears in the observation file name.
    start_date : str
        Start of the period; rows dated exactly ``start_date`` are excluded
        (the comparison is strictly greater-than).
    end_date : str
        End of the period (inclusive).

    Returns
    -------
    pandas.DataFrame
        Observation rows with ``start_date < date <= end_date``.

    Raises
    ------
    FileNotFoundError
        If no observation file matches the basin. Previously this surfaced as
        an uninformative ``IndexError`` from ``glob(...)[0]``.
    """
    pattern = f'{OBSDIR}/*_{basin_id}_*.csv'

    # sorted() makes the choice reproducible should more than one file match
    obs_files = sorted(glob(pattern))
    if not obs_files:
        raise FileNotFoundError(
            f'No observation file found for basin {basin_id} (pattern: {pattern})'
        )

    # Load observation dataframe
    df_obs = pd.read_csv(obs_files[0], parse_dates=True, index_col='date')

    # Select the period (drop first year)
    mask = (df_obs.index > start_date) & (df_obs.index <= end_date)
    return df_obs.loc[mask]

def get_simulations(basin_id, start_date, end_date):
    """Return simulated discharge of the 'uk' run at the grid cell nearest to a gauge.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier, used to look up the gauge coordinates with
        ``get_station_location``.
    start_date : str
        Start of the period; the time step equal to ``start_date`` is excluded.
    end_date : str
        End of the period (inclusive).

    Returns
    -------
    pandas.DataFrame
        Time-indexed frame with a single column ``sim``.
    """
    sim_file = f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc"

    # Get station location
    latlon = get_station_location(basin_id)

    # The context manager closes the NetCDF file again; this function is called
    # once per basin and previously left one open handle behind per call.
    # to_dataframe() runs inside the block so the values are read before closing.
    with xr.open_dataset(sim_file) as ds_sim:
        df_sim = (
            ds_sim.discharge
            .sel(lat=latlon[0], lon=latlon[1], method='nearest')
            .to_dataframe()
        )

    # Select the period (drop first year)
    mask = (df_sim.index > start_date) & (df_sim.index <= end_date)
    df_sim = df_sim.loc[mask]

    # Keep only the discharge series, renamed to 'sim'
    df_sim = df_sim.drop(columns=['lat', 'lon'])
    df_sim = df_sim.rename(columns={'discharge': 'sim'})

    return df_sim

# Adjust station location to river network

In [5]:
def get_adjusted_station_location_simulations(basin_id, start_date, end_date, buffer=0.0083333):
    """Return simulated discharge from the cell near the gauge that best matches the observations.

    A window of ``buffer`` degrees on each side of the gauge location is cut
    out of the 'uk' simulation. Within that window, the cell whose mean
    simulated discharge over ``start_date``..``end_date`` is closest to the mean
    observed discharge (``discharge_vol``) is selected and its series returned.

    Changes with respect to the previous implementation:

    * The comparison uses the time-mean of the simulation. Before, the absolute
      difference to the observed mean was computed for every time step and the
      global minimum over all time steps and cells decided the cell, so one
      single day that happened to match the observed mean fixed the selection.
    * Exactly one cell is selected (``np.nanargmin``); ties no longer produce a
      multi-cell selection that breaks the conversion to a time-indexed frame.
    * The NetCDF file is closed after reading.
    * ``buffer`` is a parameter; its default is the previously hard-coded value.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier.
    start_date : str
        Start of the period; the returned series excludes ``start_date`` itself.
    end_date : str
        End of the period (inclusive).
    buffer : float, optional
        Half-width of the search window in degrees.

    Returns
    -------
    pandas.DataFrame
        Time-indexed frame with a single column ``sim``.

    Raises
    ------
    ValueError
        If the search window contains no cell with a valid mean discharge.
    """
    # Get station location and the search window around it
    station_lat, station_lon = get_station_location(basin_id)
    min_lat = station_lat - buffer
    max_lat = station_lat + buffer
    min_lon = station_lon - buffer
    max_lon = station_lon + buffer

    # Mean observed discharge over the evaluation period
    df_obs = get_observations(basin_id, start_date, end_date)
    obs_mean = df_obs.discharge_vol.mean()

    sim_file = f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc"
    with xr.open_dataset(sim_file) as ds:
        # The lat slice runs from max to min, as before.
        # NOTE(review): this assumes the lat coordinate is stored in descending
        # order; with ascending lat the window is empty - confirm.
        window = ds.discharge.sel(lat=slice(max_lat, min_lat), lon=slice(min_lon, max_lon))

        # Absolute difference between mean simulated and mean observed discharge
        # per cell. The slice includes start_date, one step more than the
        # observation mask; this only affects the cell choice marginally.
        sim_mean = window.sel(time=slice(start_date, end_date)).mean(dim='time')
        bias = abs(sim_mean - obs_mean).transpose('lat', 'lon').values

        if bias.size == 0 or np.all(np.isnan(bias)):
            raise ValueError(
                f'No valid simulated discharge within {buffer} degrees of the gauge of basin {basin_id}'
            )

        # Select the single cell with the smallest difference (NaN cells are ignored)
        i_lat, i_lon = np.unravel_index(np.nanargmin(bias), bias.shape)
        df_sim = window.isel(lat=int(i_lat), lon=int(i_lon)).to_dataframe()

    # Select the period (drop first year)
    mask = (df_sim.index > start_date) & (df_sim.index <= end_date)
    df_sim = df_sim.loc[mask]

    # Keep only the discharge series, renamed to 'sim'
    df_sim = df_sim.drop(columns=['lat', 'lon'])
    df_sim = df_sim.rename(columns={'discharge': 'sim'})

    return df_sim

## Calculate objective functions

In [6]:
def calculate_objective_functions(basin_id, df_sim, df_obs):
    """Score one simulated series against the observations.

    ``hydroeval.evaluator`` is called with ``hydroeval.nse``, ``hydroeval.kge``,
    ``hydroeval.kgeprime`` and ``hydroeval.kgenp`` on ``df_sim['sim']`` and
    ``df_obs.discharge_vol``. All values are rounded to 4 decimals.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier, stored in the ``basin_id`` column.
    df_sim : pandas.DataFrame
        Simulations with a ``sim`` column.
    df_obs : pandas.DataFrame
        Observations with a ``discharge_vol`` column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``basin_id``, ``nse``, ``kge_2009``,
        ``kge_2012``, ``kge_np``, ``kge_np_r``, ``kge_np_alpha`` and
        ``kge_np_beta``. The frame is also printed.
    """
    def _evaluate(metric):
        # Same call for every metric; only the objective function changes
        return hydroeval.evaluator(metric, df_sim['sim'], df_obs.discharge_vol, axis=1)

    nse_result = _evaluate(hydroeval.nse)
    kge_2009_result = _evaluate(hydroeval.kge)
    kge_2012_result = _evaluate(hydroeval.kgeprime)
    kge_np_result = _evaluate(hydroeval.kgenp)

    # For the KGE variants row 0 is stored as the score; rows 1-3 of the
    # non-parametric KGE are stored as r, alpha and beta.
    scores = {
        'basin_id': basin_id,
        'nse': np.round(nse_result[0], 4),
        'kge_2009': np.round(kge_2009_result[0][0], 4),
        'kge_2012': np.round(kge_2012_result[0][0], 4),
        'kge_np': np.round(kge_np_result[0][0], 4),
        'kge_np_r': np.round(kge_np_result[0][1], 4),
        'kge_np_alpha': np.round(kge_np_result[0][2], 4),
        'kge_np_beta': np.round(kge_np_result[0][3], 4),
    }

    # Fill a one-row frame column by column
    df = pd.DataFrame()
    for column, value in scores.items():
        df[column] = [value]

    print(df)
    return df

# Streamflow Analyses

In [None]:
# Make sure the results directory exists: DataFrame.to_csv does not create
# missing parent directories and would otherwise fail on the first basin.
OUTPUT.mkdir(parents=True, exist_ok=True)

# Evaluate every basin at the grid cell nearest to its gauge and store the
# simulated series and the objective functions as CSV files.
for i, basin_id in enumerate(basin_ids):
    print(i, end='\r')
    print(basin_id)
    df_sim = get_simulations(basin_id, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_objective = calculate_objective_functions(basin_id, df_sim, df_obs)

    df_sim.to_csv(f'{OUTPUT}/{basin_id}_calibration_simulations.csv')
    df_objective.to_csv(f'{OUTPUT}/{basin_id}_calibration_objective_functions.csv', index=False)

10003
  basin_id     nse  kge_2009  kge_2012  kge_np  kge_np_r  kge_np_alpha  \
0    10003  0.3215    0.3408    0.2974  0.6442    0.6976        0.8615   

   kge_np_beta  
0       1.1263  
1001
  basin_id     nse  kge_2009  kge_2012  kge_np  kge_np_r  kge_np_alpha  \
0     1001  0.1366    0.0951     0.074   0.453    0.5368        0.7235   

   kge_np_beta  
0       1.0906  
101002
  basin_id     nse  kge_2009  kge_2012  kge_np  kge_np_r  kge_np_alpha  \
0   101002  0.1639    0.3389    0.3294  0.6368    0.6636        0.8659   

   kge_np_beta  
0       1.0277  
101005
  basin_id     nse  kge_2009  kge_2012  kge_np  kge_np_r  kge_np_alpha  \
0   101005 -2.2368    -1.013   -1.0655 -0.9397    0.6318        0.8037   

   kge_np_beta  
0       2.8943  
102001
  basin_id     nse  kge_2009  kge_2012  kge_np  kge_np_r  kge_np_alpha  \
0   102001 -0.2201   -0.3869   -0.1699  0.0437    0.6539        0.7382   

   kge_np_beta  
0       0.1478  
106001
  basin_id     nse  kge_2009  kge_2012  kge_np

# Create overview dataframe

# Streamflow analyses with adjusted station location


In [None]:
# Make sure the results directory exists: DataFrame.to_csv does not create
# missing parent directories and would otherwise fail on the first basin.
OUTPUT.mkdir(parents=True, exist_ok=True)

# Evaluate every basin with the adjusted station location and store the
# simulated series and the objective functions as CSV files.
for i, basin_id in enumerate(basin_ids):
    print(i, end='\r')
    print(basin_id)
    df_sim = get_adjusted_station_location_simulations(basin_id, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_objective = calculate_objective_functions(basin_id, df_sim, df_obs)

    df_sim.to_csv(f'{OUTPUT}/{basin_id}_evaluation_simulations_adjusted_location_4px.csv')
    df_objective.to_csv(f'{OUTPUT}/{basin_id}_evaluation_objective_functions_adjusted_location_4px.csv', index=False)

In [None]:
# Ad-hoc inspection of a single basin: take the first basin ID of the list
basin_id = basin_ids[0]

# Simulation output of the 'uk' run
sim_file = f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc"
ds_sim = xr.open_dataset(sim_file)

# Get station location
latlon = get_station_location(basin_id)

# Extract station location timeseries (grid cell nearest to the gauge)
ds_sim2 = ds_sim.discharge.sel(lat=latlon[0], lon=latlon[1], method='nearest')

In [None]:
# Quick-look plot of the discharge series extracted at the station location
ds_sim2.plot()

In [None]:
# Observations of the selected basin for the configured period
df_obs = get_observations(basin_id, start_date, end_date)

In [None]:
# Display the observation table
df_obs

In [None]:
# Simulated series at the station location as a dataframe
df = ds_sim2.to_dataframe()

In [None]:
# Add the observation columns to the simulation frame (joined on the index).
# `df` is overwritten here, so re-run the previous cell first when re-executing.
df = df.join(df_obs)

In [None]:
# Plot simulated discharge (discharge) and observed discharge (discharge_vol)
df.discharge.plot()
df.discharge_vol.plot()

In [None]:
# Display the combined simulation/observation frame
df

In [None]:
# Bounding box of the crop. The longitude bounds were swapped before
# (min_lon = -2.25 > max_lon = -2.75), which yields an empty selection on an
# ascending lon coordinate.
min_lon = -2.75
min_lat = 57.2
max_lon = -2.25
max_lat = 57.4

# The lat slice runs from max to min, consistent with
# get_adjusted_station_location_simulations.
# NOTE(review): assumes the lat coordinate is stored in descending order - confirm.
cropped_ds = ds_sim.sel(lat=slice(max_lat, min_lat), lon=slice(min_lon, max_lon))

In [None]:
# Plot the simulated discharge field at time index 1500
ds_sim.discharge.isel(time=1500).plot()

In [None]:
# Path of the 'uk' run discharge output (same value as assigned in earlier cells)
sim_file = f"{MODELS}/uk/netcdf/discharge_dailyTot_output.nc"

In [None]:
# Create empty dataframe and lists
# Inline copy of the body of calculate_objective_functions (defined earlier in
# this notebook), evaluated on the `df_sim`, `df_obs` and `basin_id` that are
# currently in the kernel. Unlike the function it does not print the result.
df = pd.DataFrame()

# Calculate objective functions and round
nse = hydroeval.evaluator(hydroeval.nse, df_sim[f'sim'], df_obs.discharge_vol, axis=1)
nse = np.round(nse[0], 4)

kge_2009 = hydroeval.evaluator(hydroeval.kge, df_sim[f'sim'], df_obs.discharge_vol, axis=1)
kge_2009 = np.round(kge_2009[0][0], 4)

kge_2012 = hydroeval.evaluator(hydroeval.kgeprime, df_sim[f'sim'], df_obs.discharge_vol, axis=1)
kge_2012 = np.round(kge_2012[0][0], 4)

# Non-parametric KGE: row 0 is stored as the score, rows 1-3 as r, alpha, beta
kge_np = hydroeval.evaluator(hydroeval.kgenp, df_sim[f'sim'], df_obs.discharge_vol, axis=1)
kge_np_value = np.round(kge_np[0][0], 4)
kge_np_r = np.round(kge_np[0][1], 4)
kge_np_alpha = np.round(kge_np[0][2], 4)
kge_np_beta = np.round(kge_np[0][3], 4)

# Assemble the one-row result frame
df['basin_id'] = [basin_id]
df['nse']      = [nse]
df['kge_2009'] = [kge_2009]
df['kge_2012'] = [kge_2012]
df['kge_np']   = [kge_np_value]

df['kge_np_r'] = [kge_np_r]
df['kge_np_alpha'] = [kge_np_alpha]
df['kge_np_beta'] = [kge_np_beta]

In [None]:
# Evaluate only the basin currently held in `basin_id` (single-basin version of
# the analysis loop) and write its two CSV files to OUTPUT.
df_sim = get_simulations(basin_id, start_date, end_date)
df_obs = get_observations(basin_id, start_date, end_date)

df_objective = calculate_objective_functions(basin_id, df_sim, df_obs)

df_sim.to_csv(f'{OUTPUT}/{basin_id}_calibration_simulations.csv')
df_objective.to_csv(f'{OUTPUT}/{basin_id}_calibration_objective_functions.csv', index=False)

In [None]:
# Display the objective functions of the basin evaluated last
df_objective

## Calculate objective functions

In [None]:
# Same loop as in the "Streamflow Analyses" section earlier in this notebook,
# without printing the basin ID; running it writes the same CSV files again.
for i, basin_id in enumerate(basin_ids):
    print(i, end='\r')
    df_sim = get_simulations(basin_id, start_date, end_date)
    df_obs = get_observations(basin_id, start_date, end_date)

    df_objective = calculate_objective_functions(basin_id, df_sim, df_obs)
    
    df_sim.to_csv(f'{OUTPUT}/{basin_id}_calibration_simulations.csv')
    df_objective.to_csv(f'{OUTPUT}/{basin_id}_calibration_objective_functions.csv', index=False)

# Create overview dataframe

In [None]:
# Load the per-basin objective function files and combine them into one
# overview dataframe (first row of every file).
#
# Changes with respect to the previous version of this cell:
# - `basin_ids` is no longer reset to an empty list; that emptied the basin
#   list that the cells further down iterate over.
# - The overview is actually assembled (the appending code was commented out,
#   so `df_out` stayed empty) and is built with a single concat.
result_files = sorted(glob(f"{OUTPUT}/*_calibration_objective_functions.csv"))

overview_columns = [
    'basin_id',
    'kge_np',
    'kge_np_r',
    'kge_np_alpha',
    'kge_np_beta',
    'kge_2009',
    'kge_2012',
    'nse',
]

first_rows = []
for file in result_files:
    # Read results
    df = pd.read_csv(file)

    # Select first row (kept as a one-row frame so the column dtypes survive)
    first_rows.append(df.iloc[[0]])

# Create output dataframe; pd.concat raises on an empty list, hence the guard
if first_rows:
    df_out = pd.concat(first_rows, ignore_index=True)[overview_columns]
else:
    df_out = pd.DataFrame(columns=overview_columns)

# Write output
# NOTE(review): writing was disabled in the original cell and the file name
# mentions wflow although this notebook evaluates PCR-GLOBWB - confirm the
# intended name before enabling.
# df_out.to_csv(f'{ROOT}/results/pcr-globwb/evaluation_overview_wflow.csv')

In [None]:
# Display `df` as left by the overview cell above (the last result file read)
df

# Individual basin simulation file analyses

# Check if output exists

In [None]:
# Collect the basins whose own run directory holds a discharge output file
# larger than 10000 bytes.
basin_exists = []

for basin_id in basin_ids:
    sim_file = Path(f'{MODELS}/{basin_id}/netcdf/discharge_dailyTot_output.nc')
    if not sim_file.is_file():
        continue
    if os.path.getsize(sim_file) > 10000:
        basin_exists.append(basin_id)

In [None]:
# Display the basins for which an individual simulation file was found
basin_exists

# Retrieve data functions

In [None]:
def get_station_location(basin_id):
    """Look up the gauge latitude and longitude of a basin.

    This re-defines the function of the same name from the "Retrieve data
    functions" section earlier in this notebook with the same behaviour.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier; converted with ``int()`` and matched against the
        ``gauge_id`` index of the CAMELS-GB topographic attributes table.

    Returns
    -------
    tuple
        ``(gauge_lat, gauge_lon)``.
    """
    location_file = f"{AUXDATA}/CAMELS-GB/data/CAMELS_GB_topographic_attributes.csv"

    # Read the table and pick the row of the requested gauge in one go
    row = pd.read_csv(location_file, index_col='gauge_id').loc[int(basin_id)]

    latlon = (row.gauge_lat, row.gauge_lon)
    return latlon

def get_observations(basin_id, start_date, end_date):
    """Load the observation time series of one basin for the requested period.

    This re-defines the function of the same name from the "Retrieve data
    functions" section earlier in this notebook. The observation file is
    searched in ``OBSDIR`` with the pattern ``*_{basin_id}_*.csv`` and read
    with its ``date`` column as datetime index.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier as it appears in the observation file name.
    start_date : str
        Start of the period; rows dated exactly ``start_date`` are excluded.
    end_date : str
        End of the period (inclusive).

    Returns
    -------
    pandas.DataFrame
        Observation rows with ``start_date < date <= end_date``.

    Raises
    ------
    FileNotFoundError
        If no observation file matches the basin (instead of the bare
        ``IndexError`` that ``glob(...)[0]`` produced before).
    """
    pattern = f'{OBSDIR}/*_{basin_id}_*.csv'

    # sorted() makes the choice reproducible should more than one file match
    matches = sorted(glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f'No observation file found for basin {basin_id} (pattern: {pattern})'
        )
    obs_file = matches[0]

    # Load observation dataframe
    df_obs = pd.read_csv(obs_file, parse_dates=True, index_col='date')

    # Select the period (drop first year)
    mask = (df_obs.index > start_date) & (df_obs.index <= end_date)
    df_obs = df_obs.loc[mask]

    return df_obs

def get_simulations(basin_id, calibration_values, start_date, end_date):
    """Load one simulated series per calibration value and combine them column-wise.

    For every value the file ``{MODELS}/{basin_id}/ksathorfrac_{value}/output.csv``
    is read, limited to ``start_date < time <= end_date``, and its ``Q_1``
    column is renamed to ``ksathorfrac_{value}``.

    Note: this definition replaces the three-argument ``get_simulations``
    defined earlier in this notebook once its cell has been run.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier (name of the basin directory below ``MODELS``).
    calibration_values : iterable
        Values used to build the ``ksathorfrac_{value}`` directory names.
    start_date : str
        Start of the period (exclusive).
    end_date : str
        End of the period (inclusive).

    Returns
    -------
    pandas.DataFrame
        The per-value frames concatenated along the columns.
    """
    def _load(calibration_value):
        # First file matching the output path of this calibration value
        sim_file = glob(f'{MODELS}/{basin_id}/ksathorfrac_{calibration_value}/output.csv')[0]
        run = pd.read_csv(sim_file, parse_dates=True, index_col='time')

        # Select calibration period (drop first year)
        in_period = (run.index > start_date) & (run.index <= end_date)
        return run.loc[in_period].rename(columns={'Q_1': f'ksathorfrac_{calibration_value}'})

    df_sim = pd.concat(
        [_load(calibration_value) for calibration_value in calibration_values],
        axis=1,
        ignore_index=False,
    )
    return df_sim

def calculate_objective_functions(basin_id, df_sim, df_obs, calibration_values):
    """Score the simulation of every calibration value against the observations.

    For each value the column ``ksathorfrac_{value}`` of ``df_sim`` is compared
    with ``df_obs.discharge_vol`` through ``hydroeval.evaluator`` using
    ``hydroeval.nse``, ``hydroeval.kge``, ``hydroeval.kgeprime`` and
    ``hydroeval.kgenp``. All values are rounded to 4 decimals.

    Note: this definition replaces the three-argument
    ``calculate_objective_functions`` defined earlier in this notebook once its
    cell has been run.

    Parameters
    ----------
    basin_id : str or int
        Basin identifier, repeated in the ``basin_id`` column.
    df_sim : pandas.DataFrame
        Simulations with one ``ksathorfrac_{value}`` column per value.
    df_obs : pandas.DataFrame
        Observations with a ``discharge_vol`` column.
    calibration_values : iterable
        Calibration values to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per calibration value with the columns ``basin_id``,
        ``ksathorfrac``, ``nse``, ``kge_2009``, ``kge_2012``, ``kge_np``,
        ``kge_np_r``, ``kge_np_alpha`` and ``kge_np_beta``.
    """
    column_names = (
        'basin_id', 'ksathorfrac', 'nse', 'kge_2009', 'kge_2012',
        'kge_np', 'kge_np_r', 'kge_np_alpha', 'kge_np_beta',
    )
    results = {name: [] for name in column_names}

    for calibration_value in calibration_values:
        sim_column = f'ksathorfrac_{calibration_value}'

        nse = hydroeval.evaluator(hydroeval.nse, df_sim[sim_column], df_obs.discharge_vol, axis=1)
        kge_2009 = hydroeval.evaluator(hydroeval.kge, df_sim[sim_column], df_obs.discharge_vol, axis=1)
        kge_2012 = hydroeval.evaluator(hydroeval.kgeprime, df_sim[sim_column], df_obs.discharge_vol, axis=1)
        kge_np = hydroeval.evaluator(hydroeval.kgenp, df_sim[sim_column], df_obs.discharge_vol, axis=1)

        results['basin_id'].append(basin_id)
        results['ksathorfrac'].append(calibration_value)
        results['nse'].append(np.round(nse[0], 4))
        results['kge_2009'].append(np.round(kge_2009[0][0], 4))
        results['kge_2012'].append(np.round(kge_2012[0][0], 4))
        # Non-parametric KGE: row 0 is stored as the score, rows 1-3 as r, alpha, beta
        results['kge_np'].append(np.round(kge_np[0][0], 4))
        results['kge_np_r'].append(np.round(kge_np[0][1], 4))
        results['kge_np_alpha'].append(np.round(kge_np[0][2], 4))
        results['kge_np_beta'].append(np.round(kge_np[0][3], 4))

    # Fill the frame column by column, in the order listed above
    df = pd.DataFrame()
    for name, values in results.items():
        df[name] = values

    return df

In [None]:
# Manual override: restrict the following cell to basin '1001'
basin_exists = ['1001']

In [None]:
# For every basin in `basin_exists`: plot the simulated discharge of the
# basin's own run at the grid cell nearest to the gauge, and the observed
# discharge (discharge_vol) for the configured period.
for basin_id in basin_exists:
    print(basin_id)
    latlon = get_station_location(basin_id)

    sim_file = f"{MODELS}/{basin_id}/netcdf/discharge_dailyTot_output.nc"
    ds_sim = xr.open_dataset(sim_file)
    ds_sim.sel(lat=latlon[0], lon=latlon[1], method='nearest').discharge.plot()
    df_obs = get_observations(basin_id, start_date, end_date)
    df_obs.discharge_vol.plot()

In [None]:
# Inspect the discharge output of the individual run of basin 12005 (hard-coded path)
xr.open_dataset("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/pcr-globwb/12005/netcdf/discharge_dailyTot_output.nc")

In [None]:
# Inline copy of the body of the four-argument get_simulations defined in this
# notebook. The loop was indented one level below `dataframes = []`, which made
# the cell a syntax error (unexpected indent); it is dedented here.
# NOTE(review): `calibration_values` is not defined anywhere in the visible
# notebook - define it before running this cell.
dataframes = []

for calibration_value in calibration_values:
    # Set simulation file
    sim_file = glob(f'{MODELS}/{basin_id}/ksathorfrac_{calibration_value}/output.csv')[0]

    # Load simulation dataframe
    df = pd.read_csv(sim_file, parse_dates=True, index_col='time')

    # Select calibration period (drop first year)
    mask = (df.index > start_date) & (df.index <= end_date)
    df = df.loc[mask]

    # Rename column
    df = df.rename(columns={'Q_1': f'ksathorfrac_{calibration_value}'})

    # Append to list
    dataframes.append(df)

# Concat simulation dataframes
df_sim = pd.concat(dataframes,  axis=1, ignore_index=False)

In [None]:
# Display the gauge coordinates looked up last, as a (lat, lon) tuple
latlon

In [None]:
# Number of basin IDs currently held in `basin_ids`
len(basin_ids)

In [None]:
# NOTE: `import glob` below rebinds the name `glob` from the function imported
# in the first cell (`from glob import glob`) to the module itself. After this
# cell has run, earlier cells that call `glob(...)` fail until the first cell
# is run again.
import xarray as xr

import glob
from dask.diagnostics import ProgressBar

In [None]:
# Files of the 'uk' directory matching ceh-gear_chess_camels-gb_uk_pet_20*.nc.
# Uses the `glob` module, i.e. relies on the `import glob` of the previous cell.
files = glob.glob("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/pcr-globwb/uk/ceh-gear_chess_camels-gb_uk_pet_20*.nc")

In [None]:
# Open all matched files as one dataset, chunked with a chunk size of 1 along time
ds = xr.open_mfdataset(files, chunks={'time':1})


In [None]:
# Output filename
output_fname = f'{BASINDIR}/ceh-gear_chess_camels-gb_{basin_id}_{variable}_{year}.nc'

# Remove existing file
if output_fname:
    OUTPUT = Path(output_fname)
    OUTPUT.unlink(output_fname)

# Save to netcdf
write_job = da.to_netcdf(output_fname, encoding={f'{variable}': {'_FillValue': -9999, 'missing_value':-9999}}, compute=False)
with ProgressBar():
    write_job.compute()

In [None]:
# Open the discharge output of the 'uk' run (hard-coded path); overwrites `ds`
ds = xr.open_dataset("/gpfs/work1/0/wtrcycle/users/jaerts/camels_uk/pcr-globwb/uk/netcdf/discharge_dailyTot_output.nc")

In [None]:
# Plot the simulated discharge field at time index 200
ds.discharge.isel(time=200).plot()