# Persistence Forecast

#### Jose Valles (jose.valles.leon@gmail.com)

In [1]:
# Importing the libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
import numpy as np
import calendar
import datetime
import matplotlib.pyplot as plt
plt.style.use('classic')
import os
%matplotlib inline

from IPython.display import HTML

sns.set()

In [2]:
# Quartile helpers used as aggregation functions.
# The function names (q1, q2, q3) become the column labels when passed to ``agg``.
def q1(x):
    """Return the 25th percentile (first quartile) of ``x``."""
    return x.quantile(q=0.25)

def q2(x):
    """Return the median (second quartile) of ``x``."""
    middle = x.median()
    return middle

def q3(x):
    """Return the 75th percentile (third quartile) of ``x``."""
    return x.quantile(q=0.75)

Import the daily discharge from a hydrological station located in Uruguay.

In [3]:
# Station identifier; it is also the CSV file name (without extension)
station_name = '1330'
# NOTE: despite its name, input_folder holds the full path of the station CSV file
input_folder = os.path.join('../stations/data', f'{station_name}.csv')

# Read the daily record: 'Fecha' is parsed as a day-first date and used as the
# index; the literal "NA" is treated as a missing value
DISCHARGE_DAILY = pd.read_csv(
    input_folder,
    index_col="Fecha",
    parse_dates=['Fecha'],
    dayfirst=True,
    na_values="NA",
)

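Identify the missing dates and rename the dataframe column. **Important:** the date range passed to ``pd.date_range`` runs from the first to the last date found in the file; change its ``end`` argument if you need a different end date.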

In [4]:
# Build the complete daily calendar covering the whole record.
# min()/max() are used instead of index[0]/index[-1] so the range is still
# correct when the rows of the CSV are not sorted by date.
DISCHARGE_DAILY_date_missing = pd.date_range(
    start=DISCHARGE_DAILY.index.min().normalize(),
    end=DISCHARGE_DAILY.index.max().normalize(),
    freq='D',
)
# Re-index on the complete calendar: days absent from the file become NaN rows
DISCHARGE_DAILY = DISCHARGE_DAILY.reindex(DISCHARGE_DAILY_date_missing)
# Name the index
DISCHARGE_DAILY.index.name = 'date'
# Rename the single data column
DISCHARGE_DAILY.columns = ['discharge']
# Show the last 6 values
HTML(DISCHARGE_DAILY.tail(6).to_html())

Unnamed: 0_level_0,discharge
date,Unnamed: 1_level_1
2025-12-26,2.043
2025-12-27,2.005
2025-12-28,2.016
2025-12-29,1.999
2025-12-30,2.029
2025-12-31,1.951


### Calculate monthly mean from daily data

First, we define the maximum percentage of missing values allowed in a month. For this exercise, we use 50%: a month is discarded when 50% or more of its daily values are missing.

In [5]:
# Maximum percentage of missing daily values tolerated in a month.
# The cells below compare the monthly missing percentage against this value with a strict "<".
max_pct_missing = 50

In [6]:
# Group the daily dataframe by calendar month (groups are labelled by month start)
GROUPER_DISCHARGE_MONTHLY = DISCHARGE_DAILY.groupby(pd.Grouper(freq='1MS'))

# Percentage of missing daily values in each month.
# Computed once and reused for both tables below (it was previously evaluated twice).
PCT_MISSING = GROUPER_DISCHARGE_MONTHLY.apply(lambda x: pd.isnull(x).sum()*100/len(x)).unstack(1)

# Table with the percentage of missing values per month
NUMBER_MISSING = PCT_MISSING.to_frame()
NUMBER_MISSING.columns = ['number_missing']

# Boolean table: True when the month has LESS than max_pct_missing percent of
# missing values, i.e. the month passes the completeness criterion
BOOL_MISSING = (PCT_MISSING < max_pct_missing).to_frame()
BOOL_MISSING.columns = ['missing']

# BOOL_MISSING[~BOOL_MISSING['missing']] # months that do not fulfil the completeness criterion
# BOOL_MISSING.to_clipboard()

# NUMBER_MISSING.to_clipboard() # Uncomment to copy the full table to the clipboard

Next, we identify the months whose percentage of missing values is lower than `max_pct_missing`. The monthly flow is only calculated when more than 50% of the daily values in a given month were recorded.

In [7]:
# Daily -> monthly mean. A month is kept only when its share of missing days is
# below max_pct_missing; otherwise the monthly value is NaN.
DISCHARGE_MONTHLY = DISCHARGE_DAILY.resample('M').apply(lambda x: x.mean() if x.isnull().sum()*100/len(x) < max_pct_missing else np.nan)
# Monthly -> 3-month rolling mean with the same missing-data criterion.
# rolling(3) defaults to min_periods=3, which returns NaN for every window that
# contains a NaN before the function is even called, so the threshold below was
# never applied. min_periods=1 lets the function decide; len(x) == 3 keeps the
# incomplete windows at the start of the series as NaN.
DISCHARGE_THREE_MONTHS = DISCHARGE_MONTHLY.rolling(3, min_periods=1).apply(
    lambda x: x.mean() if len(x) == 3 and x.isnull().sum()*100/len(x) < max_pct_missing else np.nan
)
# Calendar columns for the monthly series
DISCHARGE_MONTHLY['year'] = DISCHARGE_MONTHLY.index.year
DISCHARGE_MONTHLY['month'] = DISCHARGE_MONTHLY.index.month
# Calendar columns for the daily series ('monthday' stores the day of the year)
DISCHARGE_DAILY['year'] = DISCHARGE_DAILY.index.year
DISCHARGE_DAILY['month'] = DISCHARGE_DAILY.index.month
DISCHARGE_DAILY['monthday'] = DISCHARGE_DAILY.index.day_of_year
# Natural logarithm of the monthly discharge.
# NOTE: np.log returns -inf for a monthly mean of exactly zero.
DISCHARGE_MONTHLY['Q_to_log'] = np.log(DISCHARGE_MONTHLY['discharge'])
# Show the last 6 months
HTML(DISCHARGE_MONTHLY.tail(6).to_html(index=False))
# DISCHARGE_MONTHLY.to_clipboard() # Uncomment to copy the full time series to the clipboard

discharge,year,month,Q_to_log
11.830516,2025,7,2.470682
30.211484,2025,8,3.408222
13.1669,2025,9,2.577706
15.270839,2025,10,2.725945
3.9391,2025,11,1.370952
1.63471,2025,12,0.491465


### Select the period of record to estimate the mean and std

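For this section, we use a fixed reference period to estimate the mean and standard deviation. The code below selects the years 1980–2010; edit the year bounds in the next cell to use a different period (for example, every complete year before the current one).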

In [8]:
# Reference period (both years inclusive) used to estimate the monthly statistics
REFERENCE_START_YEAR, REFERENCE_END_YEAR = 1980, 2010
DISCHARGE_SELECTION = DISCHARGE_MONTHLY[DISCHARGE_MONTHLY['year'].between(REFERENCE_START_YEAR, REFERENCE_END_YEAR)]

Calculate the mean, standard deviation and quartiles of the discharge for each calendar month in the period of record

In [9]:
# Per-calendar-month statistics over the reference period.
# 'mean' and 'std' are passed as strings rather than np.mean / np.std: pandas
# deprecates NumPy callables in groupby.agg, and a future version would call
# np.std directly (ddof=0) instead of the pandas sample std (ddof=1) used here.
# The resulting column labels ('mean', 'std', 'q1', 'q3') are unchanged.
# Statistics of the log-transformed discharge (used to standardise the anomalies)
DISCHARGE_MONTHLY_STATS = DISCHARGE_SELECTION.Q_to_log.groupby(DISCHARGE_SELECTION.index.month).agg(['mean', 'std', q1, q3])
# Statistics of the discharge itself
Q_MONTHLY_STATS = DISCHARGE_SELECTION.discharge.groupby(DISCHARGE_SELECTION.index.month).agg(['mean', 'std', q1, q3])
# Display results
HTML(DISCHARGE_MONTHLY_STATS.to_html())

Unnamed: 0_level_0,mean,std,q1,q3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.551158,1.203665,1.886299,3.329626
2,3.050261,1.33962,2.086789,3.456424
3,3.289186,1.216251,2.308507,4.060106
4,3.607872,1.313309,2.514972,4.405753
5,4.04997,1.301068,2.860236,5.029519
6,4.565733,1.122627,4.150366,5.294164
7,4.663712,0.93192,4.243954,5.214687
8,4.470057,0.954427,3.913864,5.172735
9,4.553169,0.896194,3.925445,5.120129
10,4.187485,1.167207,3.169116,5.127345


In [10]:
# Standardised anomaly of the log-discharge:
#   anomaly = (Q_to_log - monthly mean) / monthly std
# The statistics of each row's calendar month are looked up with ``map`` in one
# vectorised step. This replaces a row-by-row loop that relied on deprecated
# integer-position access (``DISCHARGE_MONTHLY.month[i]``) and re-scanned the
# whole dataframe on every iteration.
clim_mean = DISCHARGE_MONTHLY['month'].map(DISCHARGE_MONTHLY_STATS['mean'])
clim_std = DISCHARGE_MONTHLY['month'].map(DISCHARGE_MONTHLY_STATS['std'])
DISCHARGE_MONTHLY['anomaly'] = (DISCHARGE_MONTHLY['Q_to_log'] - clim_mean) / clim_std

# Keep only the columns used downstream, in a fixed order
DISCHARGE_MONTHLY = DISCHARGE_MONTHLY[['year','month','discharge','Q_to_log','anomaly']]

In [11]:
# Show the eight most recent months, without the date index
HTML(DISCHARGE_MONTHLY.iloc[-8:].to_html(index=False))

year,month,discharge,Q_to_log,anomaly
2025,5,21.97271,3.089801,-0.737985
2025,6,54.914133,4.005771,-0.498797
2025,7,11.830516,2.470682,-2.353237
2025,8,30.211484,3.408222,-1.112537
2025,9,13.1669,2.577706,-2.20428
2025,10,15.270839,2.725945,-1.252169
2025,11,3.9391,1.370952,-2.172191
2025,12,1.63471,0.491465,-2.408095


### Make a Persistence Forecast

We create a function called `add_months` that adds a number of months to an input date.

In [12]:
def add_months(sourcedate, months):
    """Shift a date by a number of calendar months.

    Parameters
    ----------
    sourcedate : object with ``year``, ``month`` and ``day`` attributes
        The starting date. Any time-of-day it carries is ignored.
    months : int
        Number of months to add; may be negative or zero.

    Returns
    -------
    datetime.datetime
        The shifted date at midnight. When the target month is shorter than
        ``sourcedate.day``, the day is clipped to the last day of that month
        (e.g. 31 January + 1 month -> 28/29 February).
    """
    # Work with a zero-based month so that divmod yields the year carry
    # (also correct for negative shifts) and the month index in 0..11
    year_shift, month_index = divmod(sourcedate.month - 1 + months, 12)
    year = sourcedate.year + year_shift
    month = month_index + 1
    # Clip the day to the length of the target month
    day = min(sourcedate.day, calendar.monthrange(year, month)[1])
    # Build the datetime directly instead of formatting a date to a string and parsing it back
    return datetime.datetime(year, month, day)

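We take the last month available in the monthly dataframe as the forecast origin (the commented-out alternative derives the previous month from the current date instead)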

In [13]:
# today = datetime.datetime(2022, 1, 1) # Alternative: pin a specific reference date
today = datetime.date.today() # Current date
# First day of the current month
first = datetime.date(today.year, today.month, 1)
# last_month = first - datetime.timedelta(days=1) # Alternative: last day of the previous calendar month
last_month = DISCHARGE_MONTHLY.index[-1] # Forecast origin: last date in the monthly dataframe
print(f"{last_month:%Y-%m-%d}")

2025-12-31


Make a Forecast and create a new dataframe for forecasting purposes 

In [14]:
# Persistence forecast: the anomaly of the last observed month is assumed to
# hold for every forecast lead time
PERSISTANCE_FCST = DISCHARGE_MONTHLY.query("month == @last_month.month & year == @last_month.year")['anomaly'].item()
# Work on an independent copy. A plain assignment would only create a second
# name for DISCHARGE_MONTHLY, so the forecast rows added later would also be
# written into the observed series (and re-running the cell that reads
# DISCHARGE_MONTHLY.index[-1] would then pick up a forecast date).
DISCHARGE_MONTHLY_FCST = DISCHARGE_MONTHLY.copy()

In [15]:
# Forecast lead times, in months after the last observed month
arr = np.array([1,2,3,4,5,6])
for x in arr:
    # Date of the forecast month for this lead time
    forecast_month = add_months(last_month,x)
    # Statistics (log space) of the target calendar month, looked up once
    month_stats = DISCHARGE_MONTHLY_STATS.loc[forecast_month.month]
    # Revert the anomaly standardisation: log-discharge = anomaly * std + mean
    q2log = (PERSISTANCE_FCST * month_stats["std"]) + month_stats["mean"]
    # Revert the log transform to obtain flow units
    exp_log = np.exp(q2log)
    # Append (or overwrite) the forecast row
    DISCHARGE_MONTHLY_FCST.loc[forecast_month] = [forecast_month.year,forecast_month.month, exp_log, q2log, PERSISTANCE_FCST]

# Adding rows from a mixed list upcasts 'year' and 'month' to float
# (2026.0, 1.0); restore them to integers
DISCHARGE_MONTHLY_FCST = DISCHARGE_MONTHLY_FCST.astype({'year': int, 'month': int})

# DISCHARGE_MONTHLY_FCST.index = DISCHARGE_MONTHLY_FCST.index.map(lambda t: t.replace(day=1))

In [16]:
# Show the last observed months followed by the forecast rows (date index included)
HTML(DISCHARGE_MONTHLY_FCST.iloc[-8:].to_html(index=True))

Unnamed: 0_level_0,year,month,discharge,Q_to_log,anomaly
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-11-30,2025.0,11.0,3.9391,1.370952,-2.172191
2025-12-31,2025.0,12.0,1.63471,0.491465,-2.408095
2026-01-31,2026.0,1.0,0.706536,-0.347381,-2.408095
2026-02-28,2026.0,2.0,0.838893,-0.175672,-2.408095
2026-03-31,2026.0,3.0,1.433815,0.360339,-2.408095
2026-04-30,2026.0,4.0,1.560959,0.4453,-2.408095
2026-05-31,2026.0,5.0,2.501461,0.916875,-2.408095
2026-06-30,2026.0,6.0,6.43879,1.862341,-2.408095


In [17]:
# Plot window: from five months before the last observed month up to the
# six-month forecast horizon (both ends inclusive)
start = add_months(last_month, -5)
end = add_months(last_month, 6)
in_window = (DISCHARGE_MONTHLY_FCST.index >= start) & (DISCHARGE_MONTHLY_FCST.index <= end)
# Copy so that later edits to ndata do not touch the forecast dataframe
ndata = DISCHARGE_MONTHLY_FCST.loc[in_window].copy()

In [18]:
# Move every timestamp to the first day of its month
ndata.index = pd.DatetimeIndex(
    [stamp.replace(day=1) for stamp in ndata.index],
    name=ndata.index.name,
)

In [19]:
# Attach the historical 25th / 75th percentiles of the discharge for each row's
# calendar month. A single vectorised lookup replaces the row-by-row loop,
# which relied on deprecated integer-position access (``ndata.month[i]``).
# The month is cast to int because it may be stored as float after the
# forecast rows were appended.
month_key = ndata['month'].astype(int)
ndata['25th_percentile'] = month_key.map(Q_MONTHLY_STATS['q1'])
ndata['75th_percentile'] = month_key.map(Q_MONTHLY_STATS['q3'])

In [20]:
import plotly.graph_objects as go

# Split the window at the first day of the last observed month. That month is
# included in both pieces so the observed and forecast lines share a point.
cut = last_month.replace(day=1)
y1 = ndata.loc[ndata.index <= cut]
y2 = ndata.loc[ndata.index >= cut]

x = ndata.index

# Traces are passed in drawing order. The percentile band needs the lower edge
# first, because the upper edge fills down to the previous trace ('tonexty').
fig = go.Figure(data=[
    # lower edge of the percentile band (invisible line)
    go.Scatter(
        x=x, y=ndata['25th_percentile'],
        mode='lines', line={'color': 'rgba(0,0,0,0)'}, showlegend=False,
    ),
    # upper edge of the percentile band, shaded down to the lower edge
    go.Scatter(
        x=x, y=ndata['75th_percentile'],
        mode='lines', line={'color': 'rgba(0,0,0,0)'},
        fill='tonexty', fillcolor='rgba(128,128,128,0.3)', name='25-75 percentile',
    ),
    # observed (historical) series
    go.Scatter(
        x=y1.index, y=y1['discharge'],
        mode='lines+markers',
        line={'color': 'black', 'width': 3},
        marker={'symbol': 'circle', 'color': 'black', 'line': {'color': 'black'}, 'size': 8},
        name='Observado',
    ),
    # forecast series
    go.Scatter(
        x=y2.index, y=y2['discharge'],
        mode='lines+markers',
        line={'color': 'blue', 'width': 3, 'dash': 'dash'},
        marker={'symbol': 'circle', 'color': 'blue', 'line': {'color': 'blue'}, 'size': 8},
        name='Perspectiva Hidrológica (Persistencia)',
    ),
])

# vertical "now" line spanning the full plot height, with its label above the plot
fig.add_shape(
    type='line', xref='x', yref='paper',
    x0=cut, x1=cut, y0=0, y1=1,
    line={'color': 'red', 'dash': 'dot', 'width': 2},
)
fig.add_annotation(
    xref='x', yref='paper', x=cut, y=1.02,
    text='Ahora', showarrow=False, font={'color': 'red'},
)

fig.update_layout(
    xaxis={'tickformat': '%m-%Y', 'dtick': 'M1'},
    xaxis_title='Fecha',
    yaxis_title='Caudal (m³/s)',
    width=1200, height=600,
    margin={'l': 60, 'r': 20, 't': 20, 'b': 60},
    # legend below the plot, centred horizontally
    legend={'orientation': 'h', 'yanchor': 'top', 'y': -0.1, 'xanchor': 'center', 'x': 0.5},
)

fig.show()

In [21]:
# First day of the last observed month: position of the "now" marker
cut = last_month.replace(day=1)

# Anomaly series over the plot window (observed months followed by the forecast)
fig3 = go.Figure(data=[
    go.Scatter(
        x=ndata.index,
        y=ndata['anomaly'],
        mode='lines+markers',
        line={'color': 'black', 'width': 4},
        marker={'symbol': 'circle', 'color': 'black', 'size': 10, 'line': {'color': 'black'}},
        name='Anomaly',
        hovertemplate='%{x|%b-%Y}<br>Anomaly: %{y:.3f}<extra></extra>',
    ),
])

# vertical "now" line spanning the full plot height, with its label above the plot
fig3.add_shape(
    type='line', xref='x', yref='paper',
    x0=cut, x1=cut, y0=0, y1=1,
    line={'color': 'red', 'dash': 'dash', 'width': 2},
)
fig3.add_annotation(
    xref='x', yref='paper', x=cut, y=1.02,
    text='Ahora', showarrow=False, font={'color': 'red'},
)

# Fixed vertical range for the anomaly axis
fig3.update_yaxes(range=[-3, 3])
fig3.update_layout(
    xaxis={'tickformat': '%m-%Y', 'dtick': 'M1', 'tickangle': 0},
    xaxis_title='Fecha',
    yaxis_title='Anomalía de Caudales (--)',
    width=1200, height=600,
    margin={'l': 60, 'r': 20, 't': 20, 'b': 60},
    legend={'orientation': 'h', 'yanchor': 'top', 'y': -0.1, 'xanchor': 'center', 'x': 0.5},
)

fig3.show()