In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import glob
import re
from shapely.geometry import Point
import os
from tqdm import tqdm
import shutil
import importlib
import extended_hydrofabric_functions as hf
importlib.reload(hf)
from extended_hydrofabric_functions import *

In [None]:
# Configs
# Parameters a reader may tune before running the rest of the notebook.

# HUC code to process: compared against wbd_layer[huc_feature] and embedded in the output file names.
huc_code = '14'
# Column of the WBD layer that holds the HUC code.
huc_feature = 'huc2'
# Flowline attribute copied into SOURCE_COMID when points are linked to the reference fabric.
feature_name = 'comid'


In [None]:
# Functions
# (moved to external script: extended_hydrofabric_functions.py)


In [None]:
# Input data
# All paths are relative to the notebook's working directory.

# WaDE tables. vcat is provided by extended_hydrofabric_functions (star import);
# presumably it concatenates every CSV matching the pattern and de-duplicates on
# subset_dedup - TODO confirm against the helper module.
sites_df  = vcat("data/wade data/*/sites.csv", subset_dedup=["SiteUUID"])
alloc_df  = vcat("data/wade data/*/waterallocations.csv", subset_dedup= None)
source_df = vcat("data/wade data/*/watersources.csv", subset_dedup=["WaterSourceUUID"])


# Diversion points; later cells read its X, Y and WadeID columns.
diversions_df = pd.read_csv('data/Diversions/diversion_points_lopez.csv')

# NOTE(review): this value is a wildcard pattern, not a concrete file path. It has to be
# expanded (e.g. with glob) before use; os.path.exists() on the literal string is only
# true when a file is actually named '*'.
agg_csv= 'data/*' # Optional: aggregated diversions in the expected format (see example), placed in the data folder


# Boundary layer used to select the HUC given by huc_feature / huc_code.
wbd_layer = gpd.read_file('data/wbd/WBDHU2.shp')

# Load gage locations
gage_shp = gpd.read_file('data/GageLoc/GageLoc.shp') # Optional: gages are not added to the extended hydrofabric
# Reference fabric GeoPackage; it is also the file copied to build the enhanced fabric.
reference_fabric_path = 'data/reference fabric gpkg/reference_14.gpkg'
flow_layer = gpd.read_file(reference_fabric_path, layer='reference_flowline')
# 'event' layer of the reference fabric; only referenced by commented-out code later in the notebook.
gage_data = gpd.read_file(reference_fabric_path, layer='event')
# Load reservoir datasets
resops_df = pd.read_csv('data/reservoirs/updated_reservoir_attributes.csv')
grand_df  = pd.read_csv('data/reservoirs/GRAND.csv')

In [None]:
# WaDE data harmonizing
#
# Flattens the WaDE allocation table to one row per (allocation, site, water source)
# and normalizes the id columns that later cells join on.

# An allocation can list several comma-separated sites: turn the field into lists
# so it can be exploded to one SiteUUID per row.
alloc_norm = alloc_df.copy()
alloc_norm['SiteUUID'] = alloc_norm['SiteUUID'].astype(str).str.split(',')

# Explode, trim whitespace, and discard rows left without a site id.
alloc_exploded = (
    alloc_norm
    .explode('SiteUUID')
    .assign(SiteUUID=lambda d: d['SiteUUID'].str.strip())
    .loc[lambda d: d['SiteUUID'].notna() & d['SiteUUID'].ne('')]
)

# Inner join: allocations without a matching site record are dropped.
wade_1 = alloc_exploded.merge(sites_df, on='SiteUUID', how='inner')

# Water sources arrive either as a comma-separated plural column or as a single id.
wade_columns = wade_1.columns
if 'WaterSourceUUIDs' in wade_columns:
    wade_1 = (
        wade_1
        .assign(WaterSourceUUIDs=lambda d: d['WaterSourceUUIDs'].astype(str).str.split(','))
        .explode('WaterSourceUUIDs')
        .assign(WaterSourceUUID=lambda d: d['WaterSourceUUIDs'].str.strip())
        .drop(columns=['WaterSourceUUIDs'])
    )
elif 'WaterSourceUUID' in wade_columns:
    # Already singular: only make sure the ids are clean strings.
    wade_1['WaterSourceUUID'] = wade_1['WaterSourceUUID'].astype(str).str.strip()

# The native site id is the key called WDID downstream; rename() leaves the
# frame's columns untouched when 'SiteNativeID' is absent.
wade_1 = wade_1.rename(columns={'SiteNativeID': 'WDID'})

# Left join keeps every allocation/site row even when its water source is unknown.
wade_df = wade_1.merge(source_df, on='WaterSourceUUID', how='left')

# Ids are compared as trimmed strings.
wade_df['WDID'] = wade_df['WDID'].astype(str).str.strip()
diversions_df['WadeID'] = diversions_df['WadeID'].astype(str).str.strip()
# Load the optional aggregated-diversions table.
#
# `agg_csv` may be a concrete CSV path or a wildcard pattern such as 'data/*'.
# os.path.exists() does not expand wildcards, so testing the raw pattern always fell
# through to the "not found" branch; the pattern is expanded explicitly here.
# glob is imported under an alias because the star import at the top of the notebook
# may rebind the name `glob`.
from glob import glob as expand_glob

agg_required_cols = ['Aggregation ID', 'Aggregation Name', 'Water Source']

if os.path.isfile(agg_csv):
    # Explicit path: read it as-is (same as the previous behaviour for an existing file).
    agg_frames = [pd.read_csv(agg_csv)]
else:
    # Pattern: keep only CSV files whose header carries the aggregated-diversion columns,
    # so unrelated files sitting in the same folder are ignored.
    agg_frames = []
    for agg_path in sorted(expand_glob(agg_csv)):
        if not (os.path.isfile(agg_path) and agg_path.lower().endswith('.csv')):
            continue
        try:
            agg_header = pd.read_csv(agg_path, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
            # Not a readable table: cannot be the aggregated-diversions file.
            continue
        if set(agg_required_cols).issubset(agg_header.columns):
            agg_frames.append(pd.read_csv(agg_path))

if agg_frames:
    aggregated_diversions_df = pd.concat(agg_frames, ignore_index=True)
    aggregated_diversions_df['Aggregation ID'] = aggregated_diversions_df['Aggregation ID'].astype(str).str.strip()
else:
    print(" aggregated_table.csv not found. Continuing without aggregated diversion data.")

    # Empty frame with the expected columns so the downstream merge still works.
    aggregated_diversions_df = pd.DataFrame(columns=agg_required_cols)



In [None]:
# Diversions Layer
#
# Builds the point-of-diversion (POD) table for the selected HUC: clips the diversion
# points to the HUC boundary, attaches WaDE site attributes or aggregated-diversion
# attributes, keeps only the rows that matched one of the two, and writes the result
# to data/output/PODtable_<huc_code>.csv.

# Point geometries from the X/Y columns.
diversions_df['geometry'] = [Point(xy) for xy in zip(diversions_df['X'], diversions_df['Y'])]
diversions_gdf = gpd.GeoDataFrame(diversions_df, geometry='geometry', crs='EPSG:4326')  # Assuming input is in lat/lon

# Clip in the CRS of the boundary layer.
diversions_gdf = diversions_gdf.to_crs(wbd_layer.crs)

wbd_selected = wbd_layer[wbd_layer[huc_feature] == huc_code]

diversions_selected = gpd.clip(diversions_gdf, wbd_selected)

# Back to a plain attribute table.
diversions_selected = diversions_selected.drop(columns=['geometry', 'index_right'], errors='ignore').copy()
POD_df = diversions_selected.copy().drop(columns=['ID', 'State'], errors='ignore')
# NOTE(review): the rename below is positional. It assumes exactly four columns remain
# and that they appear in the order id, latitude, longitude, native id - confirm
# against the column order of the diversions CSV.
POD_df.columns = ['WDID', 'LATITUDE', 'LONGITUDE', 'POI_NATIVE_ID']

# Ensure string keys (important for merges)
POD_df['WDID'] = POD_df['WDID'].astype(str).str.strip()
wade_df['WDID'] = wade_df['WDID'].astype(str).str.strip()
aggregated_diversions_df['Aggregation ID'] = aggregated_diversions_df['Aggregation ID'].astype(str).str.strip()

# ---- Join 1: WaDE (Physical) ----
# One WaDE row per WDID (the first occurrence wins) so the merge cannot multiply PODs.
wade_cols = ['WDID', 'SiteName', 'WaterSourceName', 'BeneficialUseCategory']
wade_keyed = wade_df[wade_cols].drop_duplicates('WDID')

pod_wade = POD_df.merge(
    wade_keyed,
    on='WDID',
    how='left'
)

# ---- Join 2: Aggregated diversions ----
agg_cols = ['Aggregation ID', 'Aggregation Name', 'Water Source']
agg_keyed = aggregated_diversions_df[agg_cols].drop_duplicates('Aggregation ID')

pod_all = pod_wade.merge(
    agg_keyed,
    left_on='WDID',
    right_on='Aggregation ID',
    how='left'
)

# A POD is "Physical" when the WaDE join supplied a site name; otherwise it is an
# "Aggregated Diversion" when the aggregated table supplied a name. The WaDE match
# takes precedence when both joins hit.
is_physical = pod_all['SiteName'].notna()
is_agg = (~is_physical) & pod_all['Aggregation Name'].notna()

pod_all['TYPE'] = np.select(
    [is_physical, is_agg],
    ['Physical', 'Aggregated Diversion'],
    default=None
)

# Fill outputs based on which tier matched
pod_all['SITE_NAME'] = np.where(is_physical, pod_all['SiteName'], pod_all['Aggregation Name'])
pod_all['WATER_SOURCE'] = np.where(is_physical, pod_all['WaterSourceName'], pod_all['Water Source'])
pod_all['BENEFICIAL_CATEGORY_USE'] = np.where(is_physical, pod_all['BeneficialUseCategory'], 'TBD')


# Placeholder value; the linking cell treats 'TBD' as "no GNIS id available".
pod_all['SOURCE_GNIS_ID'] = 'TBD'

# Keep only classified
POD_df = pod_all.dropna(subset=['TYPE']).copy()

# Drop helper/join columns
POD_df = POD_df.drop(
    columns=[
        'SiteName','WaterSourceName','BeneficialUseCategory',
        'Aggregation ID','Aggregation Name','Water Source'
    ],
    errors='ignore'
)

# Create the output folder on a first run; to_csv does not create missing directories.
os.makedirs("data/output", exist_ok=True)
POD_df.to_csv(f"data/output/PODtable_{huc_code}.csv", index=False)

In [None]:
# --- POI Extraction Block ---
#
# Selects the diversion, reservoir and gage points that fall inside the chosen HUC
# and writes them as three layers of data/output/POI_<huc_code>.gpkg.

# Load POD table generated earlier in the notebook (WDID is read as text)
POD_df = pd.read_csv(f'data/output/PODtable_{huc_code}.csv', dtype={'WDID': str})

# Filter to selected HUC2
huc_boundary = wbd_layer[wbd_layer[huc_feature] == huc_code]

# Merge reservoir attributes: GRAND columns are attached to the reservoir table via DAM_ID.
grand_attrs = grand_df.rename(columns={'GRAND_ID': 'DAM_ID'})[
    ['DAM_ID', 'RIVER', 'CAP_MCM', 'DAM_HGT_M', 'AREA_SKM', 'MAIN_USE']
]
resops_updated = resops_df.merge(grand_attrs, on='DAM_ID', how='left')

# Convert POD table to GDF
pod_gdf = gpd.GeoDataFrame(
    POD_df,
    geometry=gpd.points_from_xy(POD_df.LONGITUDE, POD_df.LATITUDE),
    crs="EPSG:4326"
)

# Convert reservoir CSV to GDF
resops_gdf = gpd.GeoDataFrame(
    resops_updated,
    geometry=gpd.points_from_xy(resops_updated.LONGITUDE, resops_updated.LATITUDE),
    crs="EPSG:4326"
)

# Reproject the point layers to the CRS of the HUC boundary (the boundary itself
# already is in that CRS, so it needs no reprojection).
target_crs = huc_boundary.crs

pod_gdf    = pod_gdf.to_crs(target_crs)
resops_gdf = resops_gdf.to_crs(target_crs)
gage_shp   = gage_shp.to_crs(target_crs)

# Dissolve the boundary once and reuse it for all three point-in-polygon tests.
# NOTE(review): GeoPandas 1.0 deprecates unary_union in favour of union_all();
# switch once the minimum supported GeoPandas version is known.
huc_geom = huc_boundary.unary_union

# Select points inside the HUC boundary
pod_selected    = pod_gdf[pod_gdf.within(huc_geom)]
resops_selected = resops_gdf[resops_gdf.within(huc_geom)]
gage_selected   = gage_shp[gage_shp.within(huc_geom)]

# Output file (the folder is created if it is missing)
output_gpkg_path = f"data/output/POI_{huc_code}.gpkg"
os.makedirs(os.path.dirname(output_gpkg_path), exist_ok=True)

# Write layers
pod_selected.to_file(output_gpkg_path, layer="DIVERSION_POINTS", driver="GPKG")
resops_selected.to_file(output_gpkg_path, layer="RESERVOIR_POINTS", driver="GPKG")
gage_selected.to_file(output_gpkg_path, layer="GAGE_POINTS", driver="GPKG")

print(f"Selected POI layers saved to: {output_gpkg_path}")


In [None]:
# --- Linking setup ---
# Loads the clipped POD / reservoir points, brings them into the flowline CRS and
# prepares the join keys and result containers used by the tiered POD linking below.

POD_data = gpd.read_file(f'data/output/POI_{huc_code}.gpkg', layer='DIVERSION_POINTS')
res_data = gpd.read_file(f'data/output/POI_{huc_code}.gpkg', layer='RESERVOIR_POINTS')


target_crs = flow_layer.crs
POD_data = POD_data.to_crs(target_crs)
res_data = res_data.to_crs(target_crs)

# Converted on flow_layer itself because that frame is also passed to the
# reservoir-linking helpers later in this cell; unparseable values become NaN.
flow_layer['gnis_id'] = pd.to_numeric(flow_layer['gnis_id'], errors='coerce')
flow_layer['streamorde'] = pd.to_numeric(flow_layer['streamorde'], errors='coerce')

# Working copies; gnis_id / streamorde are already numeric at this point.
flow = flow_layer.copy()

pods = POD_data.copy()
pods["WDID"] = pods["WDID"].astype(str)

# Keep the untouched GNIS id text so the 'TBD' placeholder can still be recognised.
pods["Source GNIS ID Raw"] = pods["SOURCE_GNIS_ID"]

# Numeric GNIS id for join
pods["SOURCE_GNIS_ID_num"] = pd.to_numeric(pods["SOURCE_GNIS_ID"], errors="coerce")
flow["gnis_id_num"] = flow["gnis_id"]

# Normalized names for Tier 2 join.
# Missing names must not take part in the name join: astype(str) turns a missing
# value into the text 'nan' / 'None', which would otherwise make every POD without
# a water source "match" every unnamed flowline. Missing values therefore get an
# empty key (empty POD keys are filtered out before the Tier 2 merge).
pods["WATER_SOURCE_str"] = pods["WATER_SOURCE"].astype(str).str.strip()
pods["ws_norm"] = [
    normalize_name(ws_text) if pd.notna(ws_raw) and ws_text.lower() != "tbd" else ""
    for ws_raw, ws_text in zip(pods["WATER_SOURCE"], pods["WATER_SOURCE_str"])
]
flow["gnis_name_norm"] = flow["gnis_name"].map(
    lambda gnis_name: normalize_name(str(gnis_name)) if pd.notna(gnis_name) else ""
)

# Results containers, keyed by WDID
res_comid = {}
res_gnis_name = {}
res_gnis_id = {}

# Counters
# NOTE(review): this notebook-level name is not the value reported for reservoirs;
# the summary print reads hf.res_fallback_counter.
res_fallback_counter = 0
fallback3_count = 0
fallback4_count = 0
fallback3_ids = []
no_link_count = 0
no_link_ids = []
no_link_records = []

feature_name = "comid"  # same value as in the config cell

# -------------------------
# Tier 1: GNIS ID join
# Applies to PODs whose SOURCE_GNIS_ID parses as a number and whose raw value is
# not the 'tbd' placeholder. Each such POD is linked to the nearest flowline that
# carries the same GNIS id; when no flowline has that id, it is linked to the
# nearest flowline of any kind.
# -------------------------
def _record_t1_links(rows):
    """Store comid / GNIS name / GNIS id of every row in `rows`, keyed by WDID."""
    for wdid, comid, gnis_name, gnis_id in zip(
        rows["WDID"], rows["comid"], rows["gnis_name"], rows["gnis_id"]
    ):
        res_comid[wdid] = int(comid) if pd.notna(comid) else None
        res_gnis_name[wdid] = gnis_name
        res_gnis_id[wdid] = gnis_id


t1_raw_is_tbd = pods["Source GNIS ID Raw"].astype(str).str.strip().str.lower() == "tbd"
t1_mask = pods["SOURCE_GNIS_ID_num"].notna() & ~t1_raw_is_tbd
pods_t1 = (
    pods.loc[t1_mask, ["WDID", "geometry", "SOURCE_GNIS_ID_num"]]
    .rename(columns={"geometry": "geom_pod"})
    .copy()
)

if not pods_t1.empty:
    flow_t1 = flow[["comid", "gnis_id", "gnis_id_num", "gnis_name", "geometry"]].rename(
        columns={"geometry": "geom_seg"}
    )
    cand_t1 = pods_t1.merge(
        flow_t1,
        left_on="SOURCE_GNIS_ID_num",
        right_on="gnis_id_num",
        how="left"
    )

    # Candidate pairs that actually found a flowline with the same GNIS id.
    matched = cand_t1[cand_t1["comid"].notna()].copy()
    if not matched.empty:
        # Per POD, keep the candidate flowline closest to the point.
        matched["dist"] = matched["geom_seg"].distance(matched["geom_pod"])
        best = matched.loc[matched.groupby("WDID")["dist"].idxmin()]
        _record_t1_links(best)

    # PODs whose GNIS id does not occur on any flowline: nearest flowline anywhere.
    unmatched_wdids = set(pods_t1["WDID"]) - set(matched["WDID"])
    if unmatched_wdids:
        pods_unmatched = pods.loc[pods["WDID"].isin(unmatched_wdids), ["WDID", "geometry"]].copy()
        nearest_any = gpd.sjoin_nearest(
            pods_unmatched,
            flow[["comid", "gnis_id", "gnis_name", "geometry"]],
            how="left",
            distance_col="dist"
        )
        _record_t1_links(nearest_any)

# Everything not linked yet moves on to Tier 2.
remaining_wdids = set(pods["WDID"]) - set(res_comid.keys())
pods_remain = pods[pods["WDID"].isin(remaining_wdids)].copy()

# -------------------------
# Tier 2: normalized name join
# Applies to the remaining PODs whose WATER_SOURCE is not 'tbd' and whose normalized
# name is non-empty. Each is linked to the nearest flowline with the same normalized
# name. PODs that find no name match are remembered in t2_no_match_wdids and are
# handled by Tier 3 (streamorde>=3), where they are counted in fallback3_count.
# -------------------------
t2_mask = pods_remain["WATER_SOURCE_str"].str.lower().ne("tbd")
pods_t2 = (
    pods_remain.loc[t2_mask, ["WDID", "geometry", "ws_norm"]]
    .rename(columns={"geometry": "geom_pod"})
    .copy()
)

# An empty normalized name cannot be joined on.
pods_t2 = pods_t2[pods_t2["ws_norm"].astype(str).str.len() > 0].copy()

t2_no_match_wdids = set()
if not pods_t2.empty:
    flow_t2 = flow[["comid", "gnis_id", "gnis_name", "gnis_name_norm", "geometry"]].rename(
        columns={"geometry": "geom_seg"}
    )
    cand_t2 = pods_t2.merge(
        flow_t2,
        left_on="ws_norm",
        right_on="gnis_name_norm",
        how="left"
    )

    matched2 = cand_t2[cand_t2["comid"].notna()].copy()
    if not matched2.empty:
        # Per POD, keep the same-named flowline closest to the point.
        matched2["dist"] = matched2["geom_seg"].distance(matched2["geom_pod"])
        best2 = matched2.loc[matched2.groupby("WDID")["dist"].idxmin()]

        for wdid, comid, gnis_name, gnis_id in zip(
            best2["WDID"], best2["comid"], best2["gnis_name"], best2["gnis_id"]
        ):
            res_comid[wdid] = int(comid) if pd.notna(comid) else None
            res_gnis_name[wdid] = gnis_name
            res_gnis_id[wdid] = gnis_id

    # Tier 2 candidates that found no flowline with their name.
    t2_no_match_wdids = set(pods_t2["WDID"]) - set(matched2["WDID"])

# Remaining after Tier 2 (go to Tier 3 iteration)
remaining_wdids = set(pods["WDID"]) - set(res_comid.keys())
pods_t3 = pods[pods["WDID"].isin(remaining_wdids)].copy()

# Flowlines eligible for Tier 3: stream order 3 or higher (missing orders count as 0).
flow_stream3 = flow.loc[flow["streamorde"].fillna(0).ge(3)].copy()

# -------------------------
# Tier 3: per-POD iteration
# Handles every POD still unlinked after Tiers 1 and 2, i.e.
#   - WATER_SOURCE == 'tbd'
#   - OR Tier 2 produced no name match
# Each POD is linked to the nearest flowline with streamorde>=3. When no such
# flowline exists the POD is recorded as unlinked (fallback 4).
# -------------------------
t2_no_match_wdids = set(t2_no_match_wdids)
pods_t3_idx = list(pods_t3.index)
stream3_missing = flow_stream3.empty  # does not change while the loop runs

for idx in tqdm(pods_t3_idx, total=len(pods_t3_idx), desc="Tier 3 (streamorde>=3) POD linking"):
    row = pods.loc[idx]
    wdid, geom = row["WDID"], row.geometry

    # fallback3 is counted for a 'tbd' water source and for a Tier 2 name miss.
    water_source_text = str(row.get("WATER_SOURCE", "")).strip().lower()
    if water_source_text == "tbd" or wdid in t2_no_match_wdids:
        fallback3_count += 1
        fallback3_ids.append(wdid)

    if stream3_missing:
        # Nothing to snap to: keep a record explaining why the POD stays unlinked.
        fallback4_count += 1
        no_link_count += 1
        no_link_ids.append(wdid)
        no_link_records.append({
            "WDID": wdid,
            "geometry": geom,
            "SOURCE_GNIS_ID": row.get("SOURCE_GNIS_ID", None),
            "Source GNIS ID Raw": row.get("Source GNIS ID Raw", None),
            "WATER_SOURCE": row.get("WATER_SOURCE", None),
            "reason": "no_streamorde>=3_segment"
        })
        res_comid[wdid] = res_gnis_name[wdid] = res_gnis_id[wdid] = None
        continue

    # Nearest flowline among the streamorde>=3 subset.
    seg = flow_stream3.loc[flow_stream3.geometry.distance(geom).idxmin()]

    res_comid[wdid] = seg[feature_name]
    res_gnis_name[wdid] = seg["gnis_name"]
    res_gnis_id[wdid] = seg["gnis_id"]

# Write the linked values back onto the PODs. WATER_SOURCE and SOURCE_GNIS_ID are
# replaced by the name / id of the linked flowline.
pods["SOURCE_COMID"] = pods["WDID"].map(res_comid)
pods["WATER_SOURCE"] = pods["WDID"].map(res_gnis_name)
pods["SOURCE_GNIS_ID"] = pods["WDID"].map(res_gnis_id)

# Helper columns are not part of the output layer.
helper_columns = ["Source GNIS ID Raw", "SOURCE_GNIS_ID_num", "WATER_SOURCE_str", "ws_norm"]
pods = pods.drop(columns=helper_columns, errors="ignore")
POD_data = pods
# --- Reservoir linking ---
# Each reservoir point is snapped to a flowline by the helpers from
# extended_hydrofabric_functions; the flowline's `feature_name` value becomes SOURCE_COMID.

# The fallback total printed below lives on the helper module, so it keeps growing when
# this cell is re-run without reloading the module. Reset it before linking.
# NOTE(review): assumes the helpers keep this counter as a plain module-level integer -
# confirm in extended_hydrofabric_functions.
hf.res_fallback_counter = 0

closest_features_res = []

for _, point in tqdm(res_data.iterrows(),total=len(res_data), desc="Linking Reservoir IDs") :
    closest_segment_test = closest_segment_func2(point, flow_layer)
    # presumably moves to a downstream segment when the closest one is a lake/pond
    # feature - TODO confirm against the helper
    closest_segment = walk_downstream_past_lakepond(closest_segment_test, flow_layer)

    closest_feature_value = closest_segment[feature_name] if closest_segment is not None else None
    closest_features_res.append(closest_feature_value)

res_data['SOURCE_COMID'] = closest_features_res
res_data['POI_NATIVE_ID'] = res_data['NID_ID']

# Gage linking is currently disabled.
#gage_data['SOURCE_COMID'] = np.nan
#gage_data.loc[gage_data['hl_reference'] == 'type_gages', 'Source_' + feature_name] = gage_data['hy_id']

#gage_data['POI_NativeID'] = np.nan
#gage_data.loc[gage_data['hl_reference'] == 'type_gages', 'POI_NativeID'] = gage_data['hl_link']

print(f"\nNumber of PODs that used fallback 3: {fallback3_count}")
print(f"\nNumber of PODs that used fallback 4: {fallback4_count}")
print(f"\nNumber of Reservoirs that used fallback 3: {hf.res_fallback_counter}")

# --- Write the enhanced fabric ---
# Start from a fresh copy of the reference fabric, then append the linked point layers.
enhanced_fabric_path = f'data/output/enhanced_reference_{huc_code}.gpkg'
shutil.copyfile(reference_fabric_path, enhanced_fabric_path)

# Replacing the 'event' (gage) layer is currently disabled:
# with fiona.Env():
#     fiona.remove(enhanced_fabric_path, layer='event')

#gage_data.to_file(enhanced_fabric_path, layer='event', driver='GPKG', mode='a')
for layer_name, layer_gdf in (
    ('DIVERSION_POINTS', POD_data),
    ('RESERVOIR_POINTS', res_data),
):
    layer_gdf.to_file(enhanced_fabric_path, layer=layer_name, driver='GPKG', mode='a')