In [19]:
# Importing the libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
import numpy as np
import calendar
import datetime
import matplotlib.pyplot as plt
import plotly.express as px
plt.style.use('classic')
import os
%matplotlib inline

from IPython.display import HTML


In [20]:
# Station identifier; it is also the stem of the CSV file name.
station_name = '1170'
# True division, so the code is a float (117.0 for '1170').
# NOTE(review): this float is what appears in the plot title — confirm that is intended.
station_code = int(station_name) / 10
# Path of the station's CSV (despite the variable name, it points to a file).
input_folder = os.path.join('../stations/data', f'{station_name}.csv')

# Load the series: 'Fecha' is parsed day-first and used as the index,
# and the literal "NA" is read as a missing value.
DISCHARGE_DAILY = pd.read_csv(
    input_folder,
    index_col="Fecha",
    parse_dates=['Fecha'],
    dayfirst=True,
    na_values="NA",
)

In [21]:
# Show the derived station code; it is a float because it comes from a true division.
station_code

117.0

In [22]:
# Build the full daily calendar between the first and the last record of the series
DISCHARGE_DAILY_date_missing = pd.date_range(
    start=DISCHARGE_DAILY.index[0].strftime('%Y-%m-%d'),
    end=DISCHARGE_DAILY.index[-1].strftime('%Y-%m-%d'),
    freq='D',
)
# Re-index the dataframe on that calendar so dates absent from the file get their own rows
DISCHARGE_DAILY = DISCHARGE_DAILY.reindex(DISCHARGE_DAILY_date_missing, fill_value=None)
# Rename the single data column and the index
DISCHARGE_DAILY.columns = ['discharge']
DISCHARGE_DAILY.index.name = 'date'
# Show the last 6 rows
HTML(DISCHARGE_DAILY.tail(6).to_html())

Unnamed: 0_level_0,discharge
date,Unnamed: 1_level_1
2025-12-26,0.732
2025-12-27,0.688
2025-12-28,0.577
2025-12-29,0.515
2025-12-30,0.464
2025-12-31,0.426


In [23]:
# Threshold, in percent, for the share of missing daily values in a month.
# The cells that use it compare a month's missing percentage with `< max_pct_missing`.
max_pct_missing = 50

In [24]:
# Group the daily series by calendar month (month-start frequency)
GROUPER_DISCHARGE_MONTHLY = DISCHARGE_DAILY.groupby(pd.Grouper(freq='1MS'))

# Percentage of missing daily values in each month.
# Computed once and reused for both tables below instead of repeating the same groupby/apply.
PCT_MISSING_MONTHLY = GROUPER_DISCHARGE_MONTHLY.apply(lambda x: pd.isnull(x).sum()*100/len(x)).unstack(1)

# Table of percentages; note that 'number_missing' holds a percentage, not a count
NUMBER_MISSING = PCT_MISSING_MONTHLY.to_frame()
NUMBER_MISSING.columns = ['number_missing']

# Boolean table; note that 'missing' is True when the month's missing percentage
# is BELOW max_pct_missing (i.e. the month passes the criterion)
BOOL_MISSING = (PCT_MISSING_MONTHLY < max_pct_missing).to_frame()
BOOL_MISSING.columns = ['missing']

# Optional inspection helpers:
# BOOL_MISSING[~BOOL_MISSING['missing']]  # months that do not meet the missing-data criterion
# BOOL_MISSING.to_clipboard()

# NUMBER_MISSING.to_clipboard()  # uncomment to copy the full table and inspect it elsewhere

In [25]:
# From daily to monthly: a month gets the mean of its daily values only when its
# percentage of missing days is below max_pct_missing; otherwise it is set to NaN.
# Only the 'discharge' column is resampled, so re-running this cell after the helper
# columns below have been added to DISCHARGE_DAILY still yields the same monthly table
# (previously a re-run also aggregated those helper columns).
# NOTE: recent pandas releases deprecate the 'M' alias in favour of 'ME'; 'M' is kept
# here so the notebook still runs on older pandas versions.
DISCHARGE_MONTHLY = DISCHARGE_DAILY[['discharge']].resample('M').apply(
    lambda x: x.mean() if x.isnull().sum()*100/len(x) < max_pct_missing else np.nan
)
# Calendar columns for the monthly table
DISCHARGE_MONTHLY['year'] = DISCHARGE_MONTHLY.index.year
DISCHARGE_MONTHLY['month'] = DISCHARGE_MONTHLY.index.month
# Calendar columns for the daily table; note that 'monthday' stores the day of the year
DISCHARGE_DAILY['year'] = DISCHARGE_DAILY.index.year
DISCHARGE_DAILY['month'] = DISCHARGE_DAILY.index.month
DISCHARGE_DAILY['monthday'] = DISCHARGE_DAILY.index.day_of_year
# Show the last 6 months
HTML(DISCHARGE_MONTHLY.tail(6).to_html(index=True))
# DISCHARGE_MONTHLY.to_clipboard()  # uncomment to copy the whole time series

Unnamed: 0_level_0,discharge,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-31,,2025,7
2025-08-31,12.034323,2025,8
2025-09-30,,2025,9
2025-10-31,,2025,10
2025-11-30,1.160526,2025,11
2025-12-31,0.697452,2025,12


In [26]:
# Month and year under analysis
current_month, current_year = 12, 2025

# Keep only the rows of the selected calendar month, retain just the discharge
# column, and drop the years whose monthly value is NaN
DISCHARGE_SELECTED = (
    DISCHARGE_MONTHLY
    .loc[DISCHARGE_MONTHLY['month'] == current_month, ['discharge']]
    .dropna()
)

# Rank the remaining years from the lowest to the highest discharge
DISCHARGE_SELECTED_SORTED = DISCHARGE_SELECTED.sort_values('discharge')

# Show the 6 lowest values
DISCHARGE_SELECTED_SORTED.head(6)


Unnamed: 0_level_0,discharge
date,Unnamed: 1_level_1
2020-12-31,0.428548
2008-12-31,0.58271
2005-12-31,0.586548
1995-12-31,0.694968
2025-12-31,0.697452
2007-12-31,0.699387


In [27]:
# Years of the selected months, in the same (ascending discharge) order as the rows
sorted_years = DISCHARGE_SELECTED_SORTED.index.year

# One colour per bar: red for current_year, '#636efa' for every other year
colors = [
    '#636efa' if year != current_year else 'red'
    for year in sorted_years
]

# Bar plot with the year on x and the monthly discharge on y;
# category_orders keeps the bars in the sorted order instead of chronological order
fig = px.bar(
    DISCHARGE_SELECTED_SORTED,
    x=sorted_years,
    y='discharge',
    title=f'Caudales mensuales para el mes de {calendar.month_name[current_month]} - Estación {station_code}',
    labels={'x':'Año', 'discharge':'Caudal (m3/s)'},
    template='plotly',
    category_orders={'x': sorted_years.tolist()},
)
# Treat the years as categories and rotate the tick labels
fig.update_layout(xaxis={'type': 'category', 'tickangle': -90})
# Apply the per-bar colours
fig.update_traces(marker={'color': colors})

fig.show()
