# Analogues Forecast

#### Jose Valles (jose.valles.leon@gmail.com)

In [1]:
# Importing the libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
import numpy as np
import calendar
import datetime
import matplotlib.pyplot as plt
plt.style.use('classic')
import os
%matplotlib inline

from IPython.display import HTML

sns.set()

In [2]:
# Quantile Function Definitions
def q1(x):
    return x.quantile(0.25)

def q2(x):
    return x.median()

def q3(x):
    return x.quantile(0.75)

Import de daily discharge from a hydrological station located in Uruguay 

In [4]:
station_name = '1330'
input_folder = os.path.join('../stations/data', f'{station_name}.csv')
DISCHARGE_DAILY = pd.read_csv(input_folder,parse_dates=['Fecha'],index_col="Fecha",dayfirst=True,na_values="NA")

Identify the missing dates and change dataframe columns name. **Important** Change the end date parameter in the ``pd.date_range``

In [5]:
# Identify the missing data from a date range (1980 to 2023)
DISCHARGE_DAILY_date_missing = pd.date_range(start = DISCHARGE_DAILY.index[0].strftime('%Y-%m-%d'), end = DISCHARGE_DAILY.index[-1].strftime('%Y-%m-%d'),freq='D')
# Re-index the dataframe based on the missind date variable
DISCHARGE_DAILY = DISCHARGE_DAILY.reindex(DISCHARGE_DAILY_date_missing,fill_value=None)
# Set index Fecha
DISCHARGE_DAILY.index.name = 'date' 
# Change columns names
DISCHARGE_DAILY.columns = ['discharge']
# Print the last 6 values
HTML(DISCHARGE_DAILY.tail(6).to_html())

Unnamed: 0_level_0,discharge
date,Unnamed: 1_level_1
2025-12-26,2.043
2025-12-27,2.005
2025-12-28,2.016
2025-12-29,1.999
2025-12-30,2.029
2025-12-31,1.951


### Calculate monthly mean from daily data

First, we define a percentage of missing value. For this exercise, we use a 50% of missing data in the given month

In [5]:
# Percentage of missing data
max_pct_missing = 50

In [6]:
# group the Dataframe in a monthly time scale
GROUPER_DISCHARGE_MONTHLY = DISCHARGE_DAILY.groupby(pd.Grouper(freq='1MS'))

# this function allows to calculate the percentange of missing values and label the dataframe to "missing"
NUMBER_MISSING = GROUPER_DISCHARGE_MONTHLY.apply(lambda x: pd.isnull(x).sum()*100/len(x)).unstack(1)
NUMBER_MISSING = NUMBER_MISSING.to_frame()

# change the column name to "missing"
NUMBER_MISSING.columns = ['number_missing'] 

# this function allows to calculate the percentange of missing values and label the dataframe to "missing"
BOOL_MISSING = GROUPER_DISCHARGE_MONTHLY.apply(lambda x: pd.isnull(x).sum()*100/len(x)).unstack(1) < max_pct_missing
BOOL_MISSING = BOOL_MISSING.to_frame() # Convert to DataFrame
BOOL_MISSING.columns = ['missing'] # change the column name to "missing"

# BOOL_MISSING[~BOOL_MISSING['missing']] # print the dates that does not fulfill the criterion of null data in for each month
# BOOL_MISSING.to_clipboard()

# NUMBER_MISSING.to_clipboard() # Uncomment if you want to visualize all the result in a CSV

Next, we identify the months which contains lower that the max_pct_missing value and the monthly flow is only calculated if 50% o more of recorded value in a given month

In [7]:
# from daily to monthly
DISCHARGE_MONTHLY = DISCHARGE_DAILY.resample('M',closed="right").apply(lambda x: x.mean() if x.isnull().sum()*100/len(x) < max_pct_missing else np.nan)
# from monthly to 3 months
DISCHARGE_THREE_MONTHS = DISCHARGE_MONTHLY.rolling(3).apply(lambda x: x.mean() if x.isnull().sum()*100/len(x) < max_pct_missing else np.nan)
# Create columns
DISCHARGE_MONTHLY['year'] = DISCHARGE_MONTHLY.index.year
DISCHARGE_MONTHLY['month'] = DISCHARGE_MONTHLY.index.month
## create column for day, month, year in the daily discharge ()
DISCHARGE_DAILY['year'] = DISCHARGE_DAILY.index.year
DISCHARGE_DAILY['month'] = DISCHARGE_DAILY.index.month
DISCHARGE_DAILY['monthday'] = DISCHARGE_DAILY.index.day_of_year
# Transform the date to Qlog
DISCHARGE_MONTHLY['Q_to_log'] = np.log(DISCHARGE_MONTHLY['discharge'])
## print the first results
HTML(DISCHARGE_MONTHLY.tail(6).to_html(index=False))
# DISCHARGE_MONTHLY.to_clipboard() # Uncomment if you want to visualize all the time serie

discharge,year,month,Q_to_log
11.830516,2025,7,2.470682
30.211484,2025,8,3.408222
13.1669,2025,9,2.577706
15.270839,2025,10,2.725945
3.9391,2025,11,1.370952
1.63471,2025,12,0.491465


### Select the period of record to estimate the mean and std

For this section, we will use a period of reference or the dataset of year overdue. We are in 2023 so the period of record is 1980-2022

In [8]:
DISCHARGE_SELECTION = DISCHARGE_MONTHLY[(DISCHARGE_MONTHLY['year'] >= 1980) & (DISCHARGE_MONTHLY['year'] <= 2022)]

Calculate the average dischare for each month in the period of record

In [9]:
DISCHARGE_MONTHLY_STATS = DISCHARGE_SELECTION.Q_to_log.groupby(DISCHARGE_SELECTION.index.month).agg([np.mean, np.std])
Q_MONTHLY_STATS = DISCHARGE_SELECTION.discharge.groupby(DISCHARGE_SELECTION.index.month).agg([np.mean, np.std, q1, q3])
# Display results
HTML(DISCHARGE_MONTHLY_STATS.to_html())

Unnamed: 0_level_0,mean,std
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.631497,1.108565
2,2.920379,1.249889
3,3.09812,1.176433
4,3.397223,1.410155
5,3.799416,1.28975
6,4.388054,1.219185
7,4.622332,0.965535
8,4.506851,0.943445
9,4.585282,0.933443
10,4.086109,1.222229


Calculate the anomalies in the monthly discharge values

In [10]:
DISCHARGE_MONTHLY['anomaly'] = np.nan
for i in range(len(DISCHARGE_MONTHLY)):
    # Extract the current month 
    m = DISCHARGE_MONTHLY.month[i]
    # Extract the current year
    y = DISCHARGE_MONTHLY.year[i]
    DISCHARGE_MONTHLY.loc[DISCHARGE_MONTHLY.eval('month==@m & year==@y'),'anomaly'] = (DISCHARGE_MONTHLY['Q_to_log'][i] - DISCHARGE_MONTHLY_STATS.query('index == @m')["mean"].item()) / DISCHARGE_MONTHLY_STATS.query('index == @m')["std"].item()

DISCHARGE_MONTHLY = DISCHARGE_MONTHLY[['year','month','discharge','Q_to_log','anomaly']]

In [11]:
HTML(DISCHARGE_MONTHLY.tail(6).to_html(index=True))

Unnamed: 0_level_0,year,month,discharge,Q_to_log,anomaly
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-07-31,2025,7,11.830516,2.470682,-2.228453
2025-08-31,2025,8,30.211484,3.408222,-1.164486
2025-09-30,2025,9,13.1669,2.577706,-2.150723
2025-10-31,2025,10,15.270839,2.725945,-1.112856
2025-11-30,2025,11,3.9391,1.370952,-1.929182
2025-12-31,2025,12,1.63471,0.491465,-2.172911


### Make a Analogue Forecast

We create a function called add_month that add the number of months based on an input date.

In [12]:
def add_months(sourcedate, months):
    month = sourcedate.month - 1 + months
    year = sourcedate.year + month // 12
    month = month % 12 + 1
    day = min(sourcedate.day, calendar.monthrange(year,month)[1])
    date_result = datetime.date(year, month, day)
    date_result = date_result.strftime('%Y-%m-%d %H:%M:%S')
    date_result = datetime.datetime.strptime(date_result,'%Y-%m-%d %H:%M:%S')
    return date_result

We extract the current month overdue based on the current date

In [13]:
# today = datetime.datetime(2026,1,1)
today = datetime.date.today()
first = today.replace(day=1)
last_month = first - datetime.timedelta(days=1)
last_month = pd.Timestamp(DISCHARGE_MONTHLY.index[-1]).to_pydatetime().date()
print(last_month.strftime('%Y-%m-%d'))
first_month = add_months(last_month,-9)
print(first_month.strftime('%Y-%m-%d'))

2025-12-31
2025-03-31


Query in the anomaly_analogue_ts the first date and end date

In [14]:
ANOMALY_NOW = DISCHARGE_MONTHLY.query('index > @first_month & index <= @last_month')
HTML(ANOMALY_NOW.to_html(index=False))

year,month,discharge,Q_to_log,anomaly
2025,4,5.294,1.666574,-1.227275
2025,5,21.97271,3.089801,-0.550195
2025,6,54.914133,4.005771,-0.313557
2025,7,11.830516,2.470682,-2.228453
2025,8,30.211484,3.408222,-1.164486
2025,9,13.1669,2.577706,-2.150723
2025,10,15.270839,2.725945,-1.112856
2025,11,3.9391,1.370952,-1.929182
2025,12,1.63471,0.491465,-2.172911


Define a Root Mean Square Error Function. This function will be used to select the analogues years

In [15]:
def rmse(predictions,targets):
    # Normalize the targets from 0 to 1
    return ((predictions - targets) ** 2).mean() ** 0.5

In [16]:
YEAR_ANALYSIS = range(1981,2024,1)

# create empty database and lists
df = pd.DataFrame()
ANOMALY_HISTORICAL = pd.DataFrame()
RMSE = []
ANO = []

for y in YEAR_ANALYSIS:
    # define end month and year based on the for-loop
    END_DATE_HIST = last_month.replace(year=y)
    next_month = last_month.replace(year=y,day=28) + datetime.timedelta(days=4)
    END_DATE_HIST = next_month - datetime.timedelta(days=next_month.day)
    # 9 months before
    FIRST_DATE_HIST = add_months(END_DATE_HIST,-9)
    # query in the anomaly time serie the first date and end_date
    ANOMALY_HISTORIC = DISCHARGE_MONTHLY.query('index > @FIRST_DATE_HIST & index <= @END_DATE_HIST')
    # create a dataframe (df) for the "preditec" (analogue) and "target" (current year)
    df['predicted'] = ANOMALY_HISTORIC["anomaly"].values
    df['target'] = ANOMALY_NOW["anomaly"].values
    # if there is missing data then escape the for-loop
    if df['predicted'].isnull().values.any() == False:
        # calculate the RMSE
        RMSE_val = rmse(df['predicted'],df['target'])
        # append in the empty lists
        RMSE.append(RMSE_val)
        ANO.append(y)
        ANOMALY_HISTORICAL[y] = df['predicted']
        print('the year ',y, 'has a RMSE of:', round(RMSE_val,2))
    else:
        print('the year ',y, 'has missing data')
        continue

RESULTS = pd.DataFrame()
RESULTS['YEAR'] = ANO
RESULTS['RMSE'] = RMSE

the year  1981 has a RMSE of: 1.88
the year  1982 has missing data
the year  1983 has a RMSE of: 2.16
the year  1984 has a RMSE of: 2.0
the year  1985 has a RMSE of: 2.02
the year  1986 has a RMSE of: 2.37
the year  1987 has a RMSE of: 1.48
the year  1988 has a RMSE of: 1.18
the year  1989 has a RMSE of: 1.12
the year  1990 has a RMSE of: 1.86
the year  1991 has a RMSE of: 2.09
the year  1992 has a RMSE of: 1.72
the year  1993 has a RMSE of: 2.53
the year  1994 has a RMSE of: 2.04
the year  1995 has a RMSE of: 1.64
the year  1996 has a RMSE of: 1.56
the year  1997 has a RMSE of: 1.82
the year  1998 has a RMSE of: 1.62
the year  1999 has a RMSE of: 1.82
the year  2000 has missing data
the year  2001 has a RMSE of: 2.08
the year  2002 has a RMSE of: 2.42
the year  2003 has a RMSE of: 2.23
the year  2004 has a RMSE of: 1.6
the year  2005 has a RMSE of: 1.77
the year  2006 has a RMSE of: 1.61
the year  2007 has a RMSE of: 1.95
the year  2008 has a RMSE of: 0.9
the year  2009 has a RMSE of:

In [17]:
ANOMALY_HISTORICAL = ANOMALY_HISTORICAL.reset_index()
ANOMALY_HISTORICAL = ANOMALY_HISTORICAL.rename(columns={"index": "MONTH"})

In [18]:
ANOMALY_HISTORICAL['MONTH'] = pd.date_range(FIRST_DATE_HIST,END_DATE_HIST,freq="M", inclusive="right").strftime("%b")
HTML(ANOMALY_HISTORICAL.to_html(index=False))

MONTH,1981,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2016,2017,2019,2020,2021,2022,2023
Apr,-0.217127,-0.355615,-0.241374,0.610233,-0.623168,-0.626465,0.717627,-0.995157,1.727497,-0.933395,-0.107533,0.96253,-0.089941,1.155848,0.645941,-0.947107,-0.105156,0.707885,-0.174479,2.026585,-0.959566,-0.184098,1.660606,-0.672437,1.873155,-0.838034,-0.792923,0.47445,1.958978,-1.116519,-1.425319,-1.31296,0.710371,-0.682033,-1.707484
May,1.707261,-0.735843,0.73982,0.801778,0.374191,-1.01135,-0.845325,-1.739965,0.030599,0.013598,0.953081,0.954425,1.122809,0.292403,-0.720531,-1.01151,-0.301231,-0.428673,-0.038696,1.455925,0.66861,0.362411,1.251189,-1.167472,1.554948,-1.037927,-1.145982,-0.310643,0.079965,0.147772,0.10314,-1.717113,-0.068713,-0.170533,-2.59755
Jun,-0.133024,0.32553,1.190399,0.726823,0.544603,-1.744517,-1.316293,-2.05692,0.198856,0.390482,0.586509,0.35552,-0.591985,0.371459,-1.527434,-0.126635,-0.187558,0.338235,1.239561,0.759596,1.031756,-0.069842,1.455351,1.207978,-0.202355,-0.994097,-0.578406,0.665624,-0.261694,-0.870978,1.506537,0.530105,1.03421,-1.426605,-3.212811
Jul,0.353903,-0.298303,1.496154,0.518201,0.119276,0.026462,-0.731281,-1.644798,-1.763364,0.647313,0.348136,-0.403391,-0.453135,0.579684,-2.01137,-1.577191,0.878613,1.570003,-0.197071,0.416333,0.52202,-0.200469,0.359069,0.231629,-1.624967,-0.380378,1.050493,1.085822,1.361521,0.202824,0.065935,-0.502122,-0.067605,0.211426,-4.046311
Aug,0.947084,1.08524,-1.065916,0.732703,1.979453,0.031578,-0.067059,0.020019,-2.167918,0.968583,-0.686086,0.075043,0.589309,-1.246202,-2.587995,-0.054673,-0.455874,0.084064,0.753817,0.62509,0.501469,-0.885222,-1.3293,0.301054,-0.281894,-1.057596,-0.342946,1.011398,0.603291,1.544311,0.133342,-0.747008,0.025114,-1.393357,-1.057067
Sep,0.471845,1.228256,-0.111097,0.388442,0.914139,-0.631591,-1.324348,-1.668867,-1.672177,0.715289,0.546023,0.58197,0.76458,-1.594812,0.42439,-1.144435,-0.731983,0.488955,0.129482,0.412721,1.479993,0.198091,-0.454557,-1.430568,0.981742,-1.120002,-0.323794,1.544749,0.738968,1.51706,0.420582,0.081887,0.110662,-2.194138,-0.980777
Oct,-0.79241,1.575719,0.294332,0.831719,0.895503,0.639688,-1.343321,-1.911254,0.086084,1.136318,0.325131,1.277731,0.264326,-0.354116,-0.088873,-0.045205,-1.267261,-1.084482,1.269477,-0.361447,-0.073774,0.851916,-0.811611,-0.110208,1.280056,-1.287705,1.318786,-0.750263,-0.617977,-0.075171,1.634838,-1.041966,-1.105925,-1.939143,-2.014144
Nov,-1.164422,1.171698,1.12026,0.678932,1.613434,-0.46166,-0.571787,-1.342366,1.025792,0.207405,0.08314,1.535452,0.60592,0.627659,0.1386,0.632491,-0.611431,-1.299847,1.241965,-0.202217,1.357965,0.168358,-1.016039,0.723326,-0.834403,-1.481921,1.282078,-0.806993,-0.535591,-0.319698,-0.130349,-1.835686,-1.211086,-1.18749,-0.983696
Dec,0.468841,-0.261713,-0.861407,-0.701966,0.519981,-0.414759,-0.910289,-0.197747,1.036881,0.633217,-0.742048,2.32559,1.029605,-0.707846,-0.472499,2.192217,0.705116,-0.682488,0.394142,1.9014,0.236884,-0.419679,-0.436017,-0.270586,-0.650314,-0.832496,0.810519,-0.724762,-0.339071,-0.921445,-0.902076,-1.436225,-1.238243,-1.169179,1.336839


In [19]:
BEST_RMSE = RESULTS.sort_values(by=['YEAR']).tail(5)
BEST_RMSE 

Unnamed: 0,YEAR,RMSE
30,2019,1.817802
31,2020,1.094244
32,2021,1.429466
33,2022,1.050287
34,2023,1.871941


Select the 5 minimum RMSE value from the dataframe

In [20]:
# Extract by minimum RMSE
# BEST_RMSE = RESULTS.sort_values(by=['RMSE']).head(5)
# Extract only five years 
BEST_RMSE = RESULTS.sort_values(by=['YEAR']).tail(5)
BEST_RMSE['RMSE_inv'] = 1 / BEST_RMSE['RMSE'] 
BEST_RMSE['weight'] = (1/BEST_RMSE['RMSE'])/sum(BEST_RMSE['RMSE_inv'])
HTML(BEST_RMSE.to_html(index=False))

YEAR,RMSE,RMSE_inv,weight
2019,1.817802,0.550115,0.150722
2020,1.094244,0.913873,0.250385
2021,1.429466,0.699562,0.191667
2022,1.050287,0.952121,0.260864
2023,1.871941,0.534205,0.146362


Extract the years from the ANOMALY Historical Variable

In [21]:
ANOMALY_PLOT = ANOMALY_HISTORICAL.loc[:, BEST_RMSE['YEAR']]
ANOMALY_PLOT = ANOMALY_PLOT.reset_index()
ANOMALY_PLOT = ANOMALY_PLOT.rename(columns={"index": "MONTH"})
ANOMALY_PLOT['MONTH'] = pd.date_range(FIRST_DATE_HIST,END_DATE_HIST,freq="M", inclusive="right").strftime("%b")
ANOMALY_PLOT["REF"] = ANOMALY_NOW["anomaly"].values

Append the "REF" or "ACTUAL" date and Extract the years from the ANOMALY Historical Variable

In [22]:
HTML(ANOMALY_PLOT.to_html(index=False))

MONTH,2019,2020,2021,2022,2023,REF
Apr,-1.425319,-1.31296,0.710371,-0.682033,-1.707484,-1.227275
May,0.10314,-1.717113,-0.068713,-0.170533,-2.59755,-0.550195
Jun,1.506537,0.530105,1.03421,-1.426605,-3.212811,-0.313557
Jul,0.065935,-0.502122,-0.067605,0.211426,-4.046311,-2.228453
Aug,0.133342,-0.747008,0.025114,-1.393357,-1.057067,-1.164486
Sep,0.420582,0.081887,0.110662,-2.194138,-0.980777,-2.150723
Oct,1.634838,-1.041966,-1.105925,-1.939143,-2.014144,-1.112856
Nov,-0.130349,-1.835686,-1.211086,-1.18749,-0.983696,-1.929182
Dec,-0.902076,-1.436225,-1.238243,-1.169179,1.336839,-2.172911


Generate plot

In [23]:
import plotly.graph_objects as go

fig = go.Figure()

# thin grey lines for each analogue column (except MONTH and REF)
for col in ANOMALY_PLOT.drop('MONTH', axis=1).columns:
    if col == 'REF':
        continue
    fig.add_trace(go.Scatter(
        x=ANOMALY_PLOT['MONTH'],
        y=ANOMALY_PLOT[col],
        mode='lines+markers',
        line=dict(color='rgba(128,128,128,0.4)', width=1),
        marker=dict(size=6),
        name=str(col),
        hovertemplate='%{x}<br>Year: ' + str(col) + '<br>Anomaly: %{y:.2f}<extra></extra>',
        showlegend=False
    ))

# highlighted REF curve
fig.add_trace(go.Scatter(
    x=ANOMALY_PLOT['MONTH'],
    y=ANOMALY_PLOT['REF'],
    mode='lines+markers',
    line=dict(color='blue', width=4),
    marker=dict(size=8, color='white', line=dict(color='black', width=1)),
    name='REF',
    hovertemplate='%{x}<br>REF Anomaly: %{y:.2f}<extra></extra>',
    showlegend=True
))

# add text labels for each analogue at the 9th month position (index 8)
label_x = ANOMALY_PLOT['MONTH'].iloc[8]
annotations = []

# collect analogue columns (exclude MONTH and REF) and their y values at the 9th month
cols = [c for c in ANOMALY_PLOT.drop('MONTH', axis=1).columns if c != 'REF']
y_vals = ANOMALY_PLOT.loc[8, cols].astype(float).values

# collision resolution: sort and enforce a minimum vertical gap
min_gap = 0.12  # tweak this to increase/decrease separation
adjusted = y_vals.copy()
order = np.argsort(y_vals)  # indices into adjusted sorted by y

for i in range(1, len(order)):
    prev_idx = order[i - 1]
    cur_idx = order[i]
    if adjusted[cur_idx] - adjusted[prev_idx] < min_gap:
        adjusted[cur_idx] = adjusted[prev_idx] + min_gap

# build annotations with small horizontal shifts to further reduce overlap
for j, col_name in enumerate(cols):
    y_val = float(adjusted[j])
    # horizontal shift alternates left/right to spread labels (pixels)
    xshift = ((j % 2) * 2 - 1) * (6 + (j // 2) * 3)
    annotations.append(dict(
        x=label_x,
        y=y_val,
        xref='x',
        yref='y',
        text=str(col_name),
        showarrow=False,
        xanchor='left',
        xshift=xshift,
        font=dict(color='black', size=11),
        bgcolor='rgba(255,255,255,0.75)',
        bordercolor='rgba(0,0,0,0.15)',
        borderwidth=0.5
    ))

fig.update_layout(
    annotations=annotations,
    title='Discharge Anomaly - Analogues',
    xaxis_title='Months',
    yaxis_title='Discharge Anomaly',
    yaxis=dict(range=[-3, 3]),
    width=1200,
    height=500
)

fig.show()

In [24]:
# Define the forecast lead-times
arr = np.array([1,2,3,4,5,6])

month_fcst = [last_month.month]
for x in arr:
    # convert to datetime the forecast lead-time
    forecast_month = add_months(last_month,x)
    # extract the month from the forecast month variable
    m = forecast_month.month
    month_fcst.append(m)
    a = [] # empty the variable a
    # For each analogue year (and its respective RMSE), we multiply the analogue year for that month and the RMSE
    for i in range(len(BEST_RMSE.RMSE)):
        y = BEST_RMSE['YEAR'].iat[i]
        a_val = DISCHARGE_MONTHLY.query('month == @m & year == @y')['anomaly'].item() * BEST_RMSE['weight'].iat[i]
        a.append(a_val)
    ANALOGUE_FCST = np.nansum(a)
    # ANALOGUE_FCST = np.sum(a)
    q2log = (ANALOGUE_FCST * DISCHARGE_MONTHLY_STATS.query('index == @m')["std"].item()) + DISCHARGE_MONTHLY_STATS.query('index == @m')["mean"].item()
    exp_log = np.exp(q2log)
    DISCHARGE_MONTHLY.loc[forecast_month] = [forecast_month.year,forecast_month.month, exp_log, q2log, ANALOGUE_FCST]

In [25]:
HTML(DISCHARGE_MONTHLY.tail(8).to_html(index=True))

Unnamed: 0_level_0,year,month,discharge,Q_to_log,anomaly
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-11-30,2025.0,11.0,3.9391,1.370952,-1.929182
2025-12-31,2025.0,12.0,1.63471,0.491465,-2.172911
2026-01-31,2026.0,1.0,13.458513,2.599612,-0.028763
2026-02-28,2026.0,2.0,12.83902,2.552489,-0.294338
2026-03-31,2026.0,3.0,10.655755,2.3661,-0.622237
2026-04-30,2026.0,4.0,9.201776,2.219397,-0.835246
2026-05-31,2026.0,5.0,14.883416,2.700248,-0.852233
2026-06-30,2026.0,6.0,56.903886,4.041364,-0.284363


In [26]:
start = add_months(last_month, -5)
end = add_months(last_month, 7)
ndata = DISCHARGE_MONTHLY.loc[(DISCHARGE_MONTHLY.index >= start) & (DISCHARGE_MONTHLY.index <= end)].copy()

In [27]:
ndata.index = ndata.index.map(lambda t: t.replace(day=1))

In [28]:
# create empty columns in the dataframe
ndata['25th_percentile'] = np.nan
ndata['75th_percentile'] = np.nan

for i in range(len(ndata)):
    # Extract the current month 
    m = ndata.month[i]
    y = ndata.year[i]
    ndata.loc[ndata.eval('month==@m & year==@y'),'25th_percentile']  = Q_MONTHLY_STATS.query('index==@m')['q1'].item()
    ndata.loc[ndata.eval('month==@m & year==@y'),'75th_percentile']  = Q_MONTHLY_STATS.query('index==@m')['q3'].item()

In [29]:
ndata

Unnamed: 0_level_0,year,month,discharge,Q_to_log,anomaly,25th_percentile,75th_percentile
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-07-01,2025.0,7.0,11.830516,2.470682,-2.228453,68.105032,181.048742
2025-08-01,2025.0,8.0,30.211484,3.408222,-1.164486,47.444774,180.931097
2025-09-01,2025.0,9.0,13.1669,2.577706,-2.150723,51.933833,179.949233
2025-10-01,2025.0,10.0,15.270839,2.725945,-1.112856,22.890734,167.541378
2025-11-01,2025.0,11.0,3.9391,1.370952,-1.929182,15.493933,134.037058
2025-12-01,2025.0,12.0,1.63471,0.491465,-2.172911,8.287645,36.378435
2026-01-01,2026.0,1.0,13.458513,2.599612,-0.028763,7.636032,31.973742
2026-02-01,2026.0,2.0,12.83902,2.552489,-0.294338,7.505875,31.132536
2026-03-01,2026.0,3.0,10.655755,2.3661,-0.622237,9.282452,46.075339
2026-04-01,2026.0,4.0,9.201776,2.219397,-0.835246,9.5531,81.223517


In [30]:
import matplotlib.dates as mdates

y1 = ndata.iloc[ndata.index <= last_month.replace(day=1).strftime('%Y-%m-%d')]
y2 = ndata.iloc[ndata.index >= last_month.replace(day=1).strftime('%Y-%m-%d')]
# y3 = ndata.iloc[ndata.index >= last_month.replace(day=1).strftime('%Y-%m-%d')]

import plotly.graph_objects as go

fig = go.Figure()

# shaded normal band (25th-75th)
fig.add_trace(go.Scatter(
    x=list(ndata.index) + list(ndata.index[::-1]),
    y=list(ndata['25th_percentile']) + list(ndata['75th_percentile'][::-1]),
    fill='toself',
    fillcolor='rgba(128,128,128,0.3)',
    line=dict(color='rgba(0,0,0,0)'),
    hoverinfo='skip',
    name='Normal'
))

# analogues (grey, faint)
for y in BEST_RMSE['YEAR'].tolist():
    query_result_indices = DISCHARGE_MONTHLY.query("month == @m & year == @y").index
    start_index = query_result_indices[0] - pd.DateOffset(months=6)
    end_index = query_result_indices[0] + pd.DateOffset(months=6)
    extracted_values = DISCHARGE_MONTHLY.loc[start_index:end_index].copy()
    extracted_values.index = extracted_values.index.map(lambda t: t.replace(day=1))
    extracted_values.index = ndata.index  # align to forecast/index window
    fig.add_trace(go.Scatter(
        x=extracted_values.index,
        y=extracted_values['discharge'],
        mode='lines+markers',
        line=dict(color='rgba(128,128,128,0.4)', width=1),
        marker=dict(size=6),
        name=str(y),
        hovertemplate='%{x|%b-%Y}<br>Year: ' + str(y) + '<br>Discharge: %{y:.2f}<extra></extra>',
        visible='legendonly'
    ))

# historical part (y1) and forecast part (y2)
fig.add_trace(go.Scatter(
    x=y1.index,
    y=y1['discharge'],
    mode='lines+markers',
    line=dict(color='black', width=3),
    marker=dict(size=8),
    name='Actual'
))
fig.add_trace(go.Scatter(
    x=y2.index,
    y=y2['discharge'],
    mode='lines+markers',
    line=dict(color='blue', width=3, dash='dash'),
    marker=dict(size=8),
    name='Pron√≥stico'
))

# vertical line for "now"
fig.add_vline(x=last_month.replace(day=1), line=dict(color='red', dash='dot', width=2))
fig.add_annotation(x=last_month.replace(day=1), y=max(ndata['discharge'].max(), extracted_values['discharge'].max()),
                   text='Ahora', showarrow=False, yshift=10)

fig.update_xaxes(
    tickformat='%m-%Y',
    tickmode='array',
    tickvals=ndata.index,
)

fig.update_yaxes(title_text='Caudal (m¬≥/s)')
fig.update_layout(
    title='Analog√≠a de caudales hist√≥ricos y pron√≥stico',
    legend=dict(orientation='h', yanchor='top', y=-0.1, xanchor='center', x=0.5),
    margin=dict(l=50, r=20, t=60, b=50),
    hovermode='x unified',
    width=1200, height=600,
)

fig.show()

In [31]:
BEST_RMSE

Unnamed: 0,YEAR,RMSE,RMSE_inv,weight
30,2019,1.817802,0.550115,0.150722
31,2020,1.094244,0.913873,0.250385
32,2021,1.429466,0.699562,0.191667
33,2022,1.050287,0.952121,0.260864
34,2023,1.871941,0.534205,0.146362
