## Bias Correction on hydrological model output

Basin: Santa Lucia \
Hydrological model: WFLOW-SBM \
Timestep: Daily \
Station: 133.0 Santa Lucia

#### Import libraries

In [1]:
import math
import requests
import geoglows
import numpy as np
import pandas as pd
import datetime as dt
import geopandas as gpd
import hydrostats as hs
import scipy.stats as sp
import plotly.express as px
import hydrostats.data as hd
import plotly.graph_objs as go

from IPython.display import display, HTML

import warnings
# NOTE(review): this hides every warning raised for the rest of the session,
# including deprecation notices from the libraries imported above — consider
# narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

#### Define file names and locations

In [2]:
# Observed discharge at the station (CSV); read below using its 'Fecha' column as a day-first date index.
obs_file = 'data/qobs.csv'
# Simulated discharge from the hydrological model (CSV); read below using its 'time' column as the date index.
sim_file = 'data/qsim.csv'

#### Import observed historical data

In [3]:
# Read the observed discharge: 'Fecha' holds day-first dates and "NA" marks missing values.
observed_historical = pd.read_csv(
    obs_file, parse_dates=['Fecha'], index_col="Fecha", dayfirst=True, na_values="NA"
)

# Reduce every timestamp to its calendar day (midnight). normalize() does in one step
# what the former datetime -> "%Y-%m-%d" string -> datetime round trip achieved.
observed_historical.index = pd.to_datetime(observed_historical.index).normalize()

# Complete daily calendar covering the whole record. min()/max() are used instead of
# the first/last row so the range is still correct if the file is not sorted by date.
observed_missing = pd.date_range(
    start=observed_historical.index.min(),
    end=observed_historical.index.max(),
    freq='D',
)

# Re-index onto the full calendar so days absent from the file become explicit NaN rows,
# then give the index and the discharge column their standard names.
observed_historical = (
    observed_historical
    .reindex(observed_missing)
    .rename_axis('time')
    .rename(columns={observed_historical.columns[0]: "qobs"})
)
display(HTML(observed_historical.tail(6).to_html()))

Unnamed: 0_level_0,qobs
time,Unnamed: 1_level_1
2025-01-26,2.178
2025-01-27,
2025-01-28,3.911
2025-01-29,3.941
2025-01-30,3.548
2025-01-31,3.78


#### Import simulated historical data

In [4]:
# Load the file and use its 'time' column as a DatetimeIndex
simulated_historical = pd.read_csv(sim_file, parse_dates=['time'], index_col='time')

# Reduce every timestamp to its calendar day (midnight); this replaces the former
# strftime("%Y-%m-%d") -> to_datetime round trip with a single call.
simulated_historical.index = simulated_historical.index.normalize()

# Standard name for the discharge column (returns a new frame instead of mutating in place).
simulated_historical = simulated_historical.rename(
    columns={simulated_historical.columns[0]: "qsim"}
)
display(HTML(simulated_historical.tail(6).to_html()))

Unnamed: 0_level_0,qsim
time,Unnamed: 1_level_1
2019-12-26,10.64
2019-12-27,10.76
2019-12-28,12.56
2019-12-29,14.22
2019-12-30,13.34
2019-12-31,15.58


#### Define period for calculating bias-correction

In [5]:
# Here we select the period of record to create the monthly Flow Duration Curve (FDC).
# Both bounds are date strings; they are used below as the start/end labels when
# slicing the observed and simulated series with .loc[tstart:tend].
tstart = "2011-01-01"
tend = "2019-12-31"

In [6]:
# Keep only the records between tstart and tend.
# .copy() makes each subset an independent frame rather than a slice of the full
# record, so later in-place changes to the subsets cannot touch (or warn about)
# the originals. Both indexes are already DatetimeIndex objects at this point
# (set when the files were loaded), so no further date conversion is needed.
filtered_observed = observed_historical.loc[tstart:tend].copy()
filtered_simulated = simulated_historical.loc[tstart:tend].copy()

#### Performing Bias-Correction

Here we apply the `correct_historical` function provided by the GeoGLOWS package.

In [7]:

# Bias-correct the simulated series against the observations with GeoGLOWS.
corrected_historical = geoglows.bias.correct_historical(filtered_simulated, filtered_observed)

# Pass the index through "%Y-%m-%d" text and back so that every entry ends up
# as a datetime holding only the calendar date.
corrected_historical.index = pd.to_datetime(
    pd.to_datetime(corrected_historical.index).strftime("%Y-%m-%d")
)

# Show the last six corrected values as an HTML table.
display(HTML(corrected_historical.tail(6).to_html()))

Unnamed: 0,Corrected Simulated Streamflow
2019-12-26,6.237379
2019-12-27,6.307725
2019-12-28,7.362921
2019-12-29,8.336046
2019-12-30,7.820173
2019-12-31,9.133305


#### Plotting the results

In [8]:
# Station label, reused for the figure titles in this and the following plots.
site = 'Santa Lucia River at Ruta 11 (station id = 133.0)'
# Title list handed to the geoglows plotting helpers. It is derived from `site`
# so the label is written only once; the leading space of the original title is kept.
plot_titles = [f' {site}']
# This is a plot of the Original Simulated, Corrected Simulated, and Observed data
hydroviewer_figure = geoglows.plots.corrected_retrospective(
    corrected_historical, filtered_simulated, filtered_observed, plot_titles=plot_titles
)
hydroviewer_figure.show()

In [9]:
# Scatter plot in normal (linear) scale: simulated vs. observed discharge,
# before and after bias correction.

# --- Merge data ---
# Pair each simulated series with the observations. Column 0 is plotted on the
# "Simulated" axis and column 1 on the "Observed" axis.
merged_df = hd.merge_data(sim_df=filtered_simulated, obs_df=filtered_observed)
merged_df2 = hd.merge_data(sim_df=corrected_historical, obs_df=filtered_observed)

# --- Plot data ---
scatter_data = go.Scatter(
    x=merged_df.iloc[:, 0].values,
    y=merged_df.iloc[:, 1].values,
    mode='markers',
    name='original',
    marker=dict(color='#ef553b'),
)

scatter_data2 = go.Scatter(
    x=merged_df2.iloc[:, 0].values,
    y=merged_df2.iloc[:, 1].values,
    mode='markers',
    name='corrected',
    marker=dict(color='#00cc96'),
)

# Overall value range (both columns) of each merged pair.
min_value = merged_df.iloc[:, :2].to_numpy().min()
max_value = merged_df.iloc[:, :2].to_numpy().max()

min_value2 = merged_df2.iloc[:, :2].to_numpy().min()
max_value2 = merged_df2.iloc[:, :2].to_numpy().max()

# The 1:1 reference line must cover both point clouds, so it spans the union of
# the two ranges (previously only the original series' range was used).
line_min = min(min_value, min_value2)
line_max = max(max_value, max_value2)

line_45 = go.Scatter(
    x=[line_min, line_max],
    y=[line_min, line_max],
    mode='lines',
    name='45deg line',
    line=dict(color='black'),
)

# Least-squares fit of observed (y) on simulated (x) for each pair.
slope, intercept, r_value, p_value, std_err = sp.linregress(
    merged_df.iloc[:, 0].values, merged_df.iloc[:, 1].values
)

slope2, intercept2, r_value2, p_value2, std_err2 = sp.linregress(
    merged_df2.iloc[:, 0].values, merged_df2.iloc[:, 1].values
)

line_adjusted = go.Scatter(
    x=[min_value, max_value],
    y=[slope * min_value + intercept, slope * max_value + intercept],
    mode='lines',
    name='{0}x + {1} (Original)'.format(str(round(slope, 2)), str(round(intercept, 2))),
    line=dict(color='red'),
)

# The corrected fit is drawn over the corrected data's own range: min_value2 and
# max_value2 were computed for this purpose but were never used before.
line_adjusted2 = go.Scatter(
    x=[min_value2, max_value2],
    y=[slope2 * min_value2 + intercept2, slope2 * max_value2 + intercept2],
    mode='lines',
    name='{0}x + {1} (Corrected)'.format(str(round(slope2, 2)), str(round(intercept2, 2))),
    line=dict(color='green'),
)

layout = go.Layout(
    title="Scatter Plot for {0} - {1}".format(site, "133.0"),
    xaxis=dict(title='Simulated', ),
    yaxis=dict(title='Observed', autorange=True),
    showlegend=True,
)

chart_obj = go.Figure(
    data=[scatter_data, scatter_data2, line_45, line_adjusted, line_adjusted2],
    layout=layout,
)

chart_obj.show()

In [10]:
# Daily averages of the corrected, original simulated and observed series
hydroviewer_figure = geoglows.plots.corrected_day_average(
    corrected_historical,
    filtered_simulated,
    filtered_observed,
    plot_titles=plot_titles,
)
hydroviewer_figure.show()

In [11]:
# Monthly averages of the corrected, original simulated and observed series
hydroviewer_figure = geoglows.plots.corrected_month_average(
    corrected_historical,
    filtered_simulated,
    filtered_observed,
    plot_titles=plot_titles,
)
hydroviewer_figure.show()

In [12]:
# Error-metric table comparing the original and the corrected series against the observations
display(
    HTML(
        geoglows.bias.statistics_tables(
            corrected_historical,
            filtered_simulated,
            filtered_observed,
        )
    )
)

Unnamed: 0,Original Full Time Series,Corrected Full Time Series
ME,21.281269,0.711295
RMSE,121.19758,84.747256
NRMSE (Mean),1.396367,0.976408
MAPE,65.151894,51.701639
NSE,0.775212,0.89009
KGE (2009),0.621586,0.895638
KGE (2012),0.747162,0.903488


In [14]:
def plot_fdc(*dataframes, labels=None):
    """
    Build a Flow Duration Curve (FDC) figure for one or more streamflow series.

    Each series is sorted from the highest to the lowest flow and plotted against
    its exceedance probability, computed as ``rank / (n + 1) * 100`` where ``rank``
    starts at 1 for the largest flow and ``n`` is the number of non-missing values.

    Parameters:
    - *dataframes: one or more DataFrames whose first column holds the streamflow.
      Missing values are dropped. The input frames are not modified.
    - labels (list, optional): one legend label per DataFrame, in the same order.
      Defaults to "Serie 1", "Serie 2", ...

    Return:
    - Plotly figure with one FDC line per DataFrame (linear X axis, log-scale Y axis)
    """
    fig = go.Figure()

    if labels is None:
        labels = [f"Serie {i+1}" for i in range(len(dataframes))]

    for i, df in enumerate(dataframes):
        # Only the flow values are needed for an FDC, so the index is left as it is.
        # (The previous version overwrote df.index, changing the caller's DataFrame.)
        q_daily = df.iloc[:, 0].dropna()

        # Sort discharge from highest to lowest
        q_sorted = q_daily.sort_values(ascending=False).reset_index(drop=True)

        # Probability of exceedance (%): the largest flow gets the smallest probability
        n = len(q_sorted)
        exceedance_prob = [rank / (n + 1) * 100 for rank in range(1, n + 1)]

        # Add the curve to the figure
        fig.add_trace(go.Scatter(
            x=exceedance_prob,
            y=q_sorted,
            mode='lines',
            name=labels[i]
        ))

    # Axis set-up. The X axis shows *exceedance* probability, matching the
    # descending sort above (it was previously mislabelled as non-exceedance).
    fig.update_layout(
        title="Flow Duration Curve",
        xaxis_title="Exceedance Probability (%)",
        yaxis_title="Streamflow (m³/s)",
        xaxis=dict(type='linear', autorange=True),  # X linear scale
        yaxis=dict(type='log', tickformat=".2f"),  # Y log scale and no scientific notation
        legend_title="FDC"
    )

    return fig


In [15]:
# Flow Duration Curves of the bias-corrected, simulated and observed series
fig = plot_fdc(
    corrected_historical,
    filtered_simulated,
    filtered_observed,
    labels=["Qbc", "Qsim", "Qobs"],
)
fig.show()
