# In [57]:
import pandas as pd
import geopandas as gpd
import json
from datetime import datetime
from pandas._libs.tslibs.nattype import NaTType

# --- Load data ---
# gdf holds the dam features (geometry + attributes); df holds the dam level
# readings read from the CSV.
gdf = gpd.read_file("data/Bulk_Water_Dams.geojson")
df = pd.read_csv("data/Dam_Levels_from_2012.csv", encoding="ISO-8859-1")

# --- Normalise column names ---
# Trim both ends and collapse every run of whitespace to a single space.
# This runs before any column is accessed by name, so a padded header such as
# " DATE" or "X   HEIGHT (m)" still resolves.
df.columns = df.columns.str.strip().str.replace(r"\s+", " ", regex=True)

# --- Clean date column ---
df['DATE'] = pd.to_datetime(df['DATE'])

# --- Prepare mapping between GeoJSON NAME and CSV column prefix ---
# Each pair reads (GeoJSON "NAME" value, prefix used by that dam's CSV columns).
# Dams whose NAME is not listed here are skipped when features are built.
dam_name_mapping = dict(
    (
        ("Woodhead", "WOODHEAD"),
        ("Hely-Hutchinson", "HELY-HUTCHINSON"),
        ("Lewis Gay", "LEWIS GAY"),
        ("Kleinplaats", "KLEINPLAATS"),
        ("Victoria", "VICTORIA"),
        ("Alexandra", "ALEXANDRA"),
        ("De Villiers", "DE VILLIERS"),
        ("Steenbras Lower", "STEENBRAS LOWER"),
        ("Steenbras Upper", "STEENBRAS UPPER"),
        ("Voëlvlei", "VOËLVLEI"),
        ("Wemmershoek", "WEMMERSHOEK"),
        ("Theewaterskloof", "THEEWATERSKLOOF"),
        ("Berg River", "BERG RIVER"),
        ("Land-en-Zeezicht Dam", "LAND-en ZEEZICHT"),
    )
)

# --- Build output features ---
def _pick_column(candidates, available, fallback):
    """Return the first name in *candidates* found in *available*, else *fallback*."""
    return next((name for name in candidates if name in available), fallback)


def _json_safe(value):
    """Convert one attribute value into something that serialises to valid JSON.

    Missing scalars (None, NaN, NaT) become None, timestamps become
    ``YYYY-MM-DDTHH:MM:SSZ`` strings, and everything else is returned as is.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if isinstance(value, pd.Timestamp):
        return value.strftime('%Y-%m-%dT%H:%M:%SZ')
    return value


# Build the lookup set once instead of probing df.columns for every candidate.
csv_columns = set(df.columns)
features = []

for _, row in gdf.iterrows():
    dam_name = row["NAME"]
    csv_key = dam_name_mapping.get(dam_name)

    if not csv_key:
        continue  # Skip dams not found in CSV mapping

    # Resolve the column names for this dam. The header spellings vary (with or
    # without a space before the unit / percent sign), so each lookup tries the
    # known variants in order and ends on a final spelling that is used even if
    # it is absent (the selection below then raises KeyError).
    height_col = _pick_column(
        (f"{csv_key} HEIGHT(m)", f"{csv_key} HEIGHT (m)"),
        csv_columns,
        f"{csv_key} HEIGHT",
    )
    # NOTE(review): any dam without its own storage column falls back to the
    # shared "STEENBRAS STORAGE (Ml)" column. Presumably this is intended only
    # for the two Steenbras dams; confirm against the CSV.
    storage_col = _pick_column(
        (f"{csv_key} STORAGE(Ml)", f"{csv_key} STORAGE (Ml)", f"{csv_key} STORAGE"),
        csv_columns,
        "STEENBRAS STORAGE (Ml)",
    )
    current_col = _pick_column(
        (f"{csv_key} Current%", f"{csv_key} Current %"),
        csv_columns,
        f"{csv_key} Current",
    )
    last_year_col = _pick_column(
        (
            f"{csv_key} Last Year%",
            f"{csv_key} Last Year %",
            f"{csv_key} Last Year",
            f"{csv_key}Last Year %",
        ),
        csv_columns,
        f"{csv_key}Last Year%",
    )

    # Extract timeseries (rows without a percent-full reading are dropped).
    ts = df[['DATE', height_col, storage_col, current_col, last_year_col]].copy()
    ts.columns = ['date', 'height_m', 'storage_ml', 'percent_full', 'last_year_percent_full']
    # Resetting the index gives unique positional labels for the lookup below;
    # it does not affect the records output.
    ts = ts.dropna(subset=['percent_full']).reset_index(drop=True)

    # Locate the most recent reading while 'date' is still a datetime column.
    # Rows with a missing date are ignored here, otherwise they would sort last
    # and be mistaken for the latest reading. A stable sort keeps the last CSV
    # row among equal dates.
    known_dates = ts['date'].dropna()
    latest_label = (
        known_dates.sort_values(kind='stable').index[-1]
        if not known_dates.empty
        else None
    )

    # Format timeseries for output
    ts['date'] = ts['date'].dt.strftime('%Y-%m-%d')  # Convert Timestamps to ISO strings
    # Cast to object first: float columns cannot hold None, so without the cast
    # NaN survives and json.dump would write the non-standard token NaN.
    ts = ts.astype(object).where(ts.notna(), None)
    timeseries = ts.to_dict(orient='records')

    # Get most recent percent full (None when the dam has no dated readings).
    current_percentage_full = (
        None if latest_label is None else ts.at[latest_label, 'percent_full']
    )

    # Build feature
    geometry = row["geometry"]
    feature = {
        "type": "Feature",
        # A missing geometry is emitted as null instead of raising AttributeError.
        "geometry": geometry.__geo_interface__ if geometry is not None else None,
        "properties": {
            key: _json_safe(value)
            for key, value in row.drop("geometry").items()
        },
    }
    feature["properties"]["current_percentage_full"] = current_percentage_full
    feature["properties"]["timeseries"] = timeseries

    features.append(feature)





# --- Build final GeoJSON ---
# Wrap the collected features in a FeatureCollection that carries a collection
# name and a named CRS member (the CRS84 URN).
output_geojson = dict(
    type="FeatureCollection",
    name="SL_WTNK_BULK_DAMS_SYNC",
    crs=dict(
        type="name",
        properties=dict(name="urn:ogc:def:crs:OGC:1.3:CRS84"),
    ),
    features=features,
)

# --- Save to file ---
# The encoding is given explicitly so the written file does not depend on the
# platform's default text encoding.
with open("output/Bulk_Water_Dams_Enriched.geojson", "w", encoding="utf-8") as f:
    json.dump(output_geojson, f)

# NOTE(review): this statement was indented with no enclosing block, which is an
# IndentationError in a plain script, so it is dedented here. It re-applies
# pd.to_datetime to the DATE column after the file has been written; it looks
# like residue from another notebook cell. Confirm whether it is still needed.
df['DATE'] = pd.to_datetime(df['DATE'])
