In [106]:
import pandas as pd
import geopandas as gpd
import json
import numpy as np
from datetime import datetime
import os

# Load dam polygon GeoJSON
gdf = gpd.read_file("data/2025-05-01/Bulk_Water_Dams.geojson")

# Load dam levels CSV, set date format
df = pd.read_csv("data/2025-05-19/Dam_Levels_from_2012.csv", encoding="ISO-8859-1")
df['DATE'] = df['DATE'].str.replace('Sept', 'Sep', regex=False)
df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%b-%y')

# Clean column names
df.columns = df.columns.str.strip().str.replace(r"\s+", "", regex=True).str.lower()

# Mapping from NAME in GeoJSON to CSV prefix (lowercase, no spaces)
dam_name_mapping = {
    "Woodhead": "woodhead",
    "Hely-Hutchinson": "hely-hutchinson",
    "Lewis Gay": "lewisgay",
    "Kleinplaats": "kleinplaats",
    "Victoria": "victoria",
    "Alexandra": "alexandra",
    "De Villiers": "devilliers",
    "Steenbras Lower": "steenbraslower",
    "Steenbras Upper": "steenbrasupper",
    "Voëlvlei": "voëlvlei",
    "Wemmershoek": "wemmershoek",
    "Theewaterskloof": "theewaterskloof",
    "Berg River": "bergriver",
    "Land-en-Zeezicht Dam": "land-enzeezicht",
    "Big 5 Total": "totalstored-big5",
    "Big 6 Total": "totalstored-big6"
}

def build_timeseries(prefix):
    # Find all columns related to this dam (that start with the prefix)
    prefix_cols = [col for col in df.columns if col.startswith(prefix)]

    def find_col(keyword):
        # Look for a column that contains the keyword (case-insensitive)
        matches = [col for col in prefix_cols if keyword in col]
        return matches[0] if matches else None

    # Find matching columns
    height_col = find_col("height")
    storage_col = find_col("storage")
    current_col = find_col("current")
    last_year_col = find_col("lastyear")

    # If we find no relevant columns, return empty
    if not any([height_col, storage_col, current_col, last_year_col]):
        return [], None

    # Build DataFrame
    cols = {'date': 'date'}
    if height_col: cols[height_col] = 'height_m'
    if storage_col: cols[storage_col] = 'storage_ml'
    if current_col: cols[current_col] = 'percent_full'
    if last_year_col: cols[last_year_col] = 'last_year_percent_full'

    col_keys = list(cols.keys())
    ts = df[col_keys].copy()
    # ts['date'] = pd.to_datetime(ts['date']).dt.strftime('%Y-%m-%d')  # ensure datetime
    ts.rename(columns=cols, inplace=True)

    # Ensure numeric columns are truly numeric
    for col in ['height_m', 'storage_ml', 'percent_full', 'last_year_percent_full']:
        if col in ts.columns:
            ts[col] = pd.to_numeric(ts[col], errors='coerce')

    # format nulls
    ts = ts.where(pd.notnull(ts), None)

    return ts


# Create output containers
dam_ts_daily = {}
dam_ts_monthly = {}
dam_ts_yearly = {}

for dam_name, prefix in dam_name_mapping.items():
    if dam_name == 'Woodhead':
        df_ts = build_timeseries(prefix)

    # DAILY
    df_ts_sorted = df_ts.sort_values('date')
    df_ts_sorted['date'] = df_ts_sorted['date'].dt.strftime('%Y-%m-%d')
    dam_ts_daily[prefix] = df_ts_sorted.where(pd.notnull(df_ts_sorted), None).to_dict(orient='records')
    
    # MONTHLY
    monthly = df_ts.resample('ME', on='date').mean(numeric_only=True).reset_index()
    monthly['date'] = monthly['date'].dt.strftime('%Y-%m')
    dam_ts_monthly[prefix] = monthly.where(pd.notnull(monthly), None).to_dict(orient='records')
    
    # YEARLY
    yearly = df_ts.resample('YE', on='date').mean(numeric_only=True).reset_index()
    yearly['date'] = yearly['date'].dt.strftime('%Y')
    dam_ts_yearly[prefix] = yearly.where(pd.notnull(yearly), None).to_dict(orient='records')

In [108]:
dam_ts_monthly['woodhead'][0]

{'date': '2000-01',
 'height_m': 31.8271875,
 'storage_ml': 492.60625,
 'percent_full': 51.587208085,
 'last_year_percent_full': 36.051371393636366}

In [109]:
dam_ts_yearly['woodhead'][0]

{'date': '2000',
 'height_m': 30.411062670299728,
 'storage_ml': 447.5024523160763,
 'percent_full': 46.86380273495913,
 'last_year_percent_full': 36.051371393636366}

In [107]:
dam_ts_daily['woodhead'][0]

{'date': '2000-01-01',
 'height_m': 30.58,
 'storage_ml': 418.2,
 'percent_full': 43.7951618,
 'last_year_percent_full': nan}

Unnamed: 0,date
0,2000-01-31
1,2000-02-29
2,2000-03-31
3,2000-04-30
4,2000-05-31
...,...
300,2025-01-31
301,2025-02-28
302,2025-03-31
303,2025-04-30
