In [3]:
# pip install pandas
# pip install geopandas

This block imports the required Python libraries:

pandas for handling tabular data (such as time series).
geopandas for working with geospatial data (GeoPackage format).
sqlite3 for connecting and interacting with an SQLite database.


In [4]:
# Import necessary libraries
import pandas as pd  # Pandas is used for handling tabular data efficiently
import geopandas as gpd  # Geopandas is used for handling geospatial data
import sqlite3  # SQLite3 is used to connect and interact with an SQLite database

This block loads three geospatial layers from a GeoPackage (.gpkg) file using geopandas:

DIVERSION_POINTS: Contains the locations of Points of Diversion (POD), where water is withdrawn.
event: Represents gage stations or hydrological monitoring points.
RESERVOIR_POINTS: Includes information about reservoirs and their operations.
Each layer is stored as a GeoDataFrame, allowing spatial analysis and attribute queries.

In [5]:
# Read geospatial layers from a GeoPackage file.
# All three layers are read from the same file, so the path is named once;
# pointing the notebook at a different GeoPackage is then a single edit.
gpkg_path = 'data/enhanced_reference_14.gpkg'

# POD (Point of Diversion) layer contains locations where water is diverted
pod_layer = gpd.read_file(gpkg_path, layer='DIVERSION_POINTS')

# Event layer contains gage locations or monitoring points
gage_layer = gpd.read_file(gpkg_path, layer='event')

# Reservoir operations layer contains details about reservoirs
res_layer = gpd.read_file(gpkg_path, layer='RESERVOIR_POINTS')

DataSourceError: data/enhanced_reference_14.gpkg: No such file or directory

### Config

Set the HUC4 code, the path to the relational database, and the start and end dates of the simulation window.

In [6]:
# HUC4 region code; keep it identical to the one used when the relational DB was built
huc4_code = "14"

# Location of the relational SQLite database for that HUC4 region
db_path = "data/relational_db_{}.db".format(huc4_code)

# Reservoir-operations simulation window, as month/day/year strings (edit as needed)
start_date, end_date = "10/1/2000", "10/1/2025"

This block establishes a connection to the SQLite database and queries key identifiers. First, it retrieves the POI_TYPE_ID for "USGS_GAGE" from the POI_TYPE table and the VARIABLE_ID for "GAGE_FLOW" from the VARIABLES table, both of which are needed to query the related records.

Next, the script iterates through each gage point in the gage_layer, extracting its hy_id (hydrofabric segment ID) and using it to look up the corresponding POI_ID in the POI table. Once the POI_ID is found, it queries the POI_VALUES table to retrieve the historical flow data (CFS), which is converted into a pandas.DataFrame and restricted to the simulation window for easy processing. The script also retrieves the POI_NATIVE_ID (a unique identifier) and the POI_FLOW_COMID (hydrofabric segment ID) from the POI table.

In each iteration, the script structures key variables for each point for integration into a water management model. These include the Gage unique ID, hydrofabric segment, and historical Flow records. 

### <span style="color:red">There is a specified block in the code that can be used to implement MODEL parametrizing code.</span> 

Use the specified variables in each iteration to inject data into the model using model-specific functions.



In [7]:

# Parse the simulation window once. The explicit format removes any
# month/day ambiguity in the strings set in the config cell.
window_start = pd.to_datetime(start_date, format="%m/%d/%Y")
window_end   = pd.to_datetime(end_date,   format="%m/%d/%Y")

# Connect to the relational DB defined in the previous cell
conn = sqlite3.connect(db_path)

# try/finally guarantees the connection is closed even when a lookup or a
# query raises part-way through the cell.
try:
    cursor = conn.cursor()

    # ------------------------------------------------------------------
    # 1. Look up the POI_TYPE_ID for USGS gages
    # ------------------------------------------------------------------
    cursor.execute("""
        SELECT POI_TYPE_ID
        FROM POI_TYPE
        WHERE POI_TYPE_NAME = 'USGS_GAGE'
    """)
    poi_type_result = cursor.fetchone()

    if poi_type_result is None:
        raise RuntimeError("No POI_TYPE_ID found for 'USGS_GAGE' in the POI_TYPE table.")
    gage_poi_type_id = poi_type_result[0]

    # ------------------------------------------------------------------
    # 2. Look up VARIABLE_ID(s) for gage variables
    #    (change names if your VARIABLES table uses different labels)
    # ------------------------------------------------------------------
    variable_names = ["GAGE_FLOW"]
    variable_ids = {}

    for var_name in variable_names:
        cursor.execute("""
            SELECT VARIABLE_ID
            FROM VARIABLES
            WHERE VARIABLE_NAME = ?
        """, (var_name,))
        result = cursor.fetchone()
        if result is not None:
            variable_ids[var_name] = result[0]
        else:
            print(f"WARNING: No VARIABLE_ID found for '{var_name}' in VARIABLES")

    if not variable_ids:
        raise RuntimeError("No gage VARIABLE_IDs were found. Check the VARIABLES table.")

    # ------------------------------------------------------------------
    # 3. Loop over gage points from the GAGE layer
    # ------------------------------------------------------------------
    for idx, row in gage_layer.iterrows():
        # Hydrofabric COMID associated with this gage
        # NOTE: change 'hy_id' to match your actual column name
        source_comid = row["hy_id"]

        # A missing ID can never match a POI row, so skip it up front
        if pd.isna(source_comid):
            print(f"Gage {idx}: missing hy_id - skipped")
            continue

        # sqlite3 natively binds only None/int/float/str/bytes; unwrap NumPy
        # scalars so the comparison in the WHERE clause behaves as intended.
        if hasattr(source_comid, "item"):
            source_comid = source_comid.item()

        # ------------------------------------------------------------------
        # 3a. Look up POI_ID, native ID and segment COMID for this gage
        #     (one query instead of a second round-trip by POI_ID later)
        # ------------------------------------------------------------------
        cursor.execute("""
            SELECT POI_ID, POI_NATIVE_ID, POI_FLOW_COMID
            FROM POI
            WHERE POI_FLOW_COMID = ? AND POI_TYPE_ID = ?
        """, (source_comid, gage_poi_type_id))
        poi_record = cursor.fetchone()

        if poi_record is None:
            # No matching POI row for this gage – skip
            print(f"No POI row found for gage COMID {source_comid}")
            continue

        poiid, poi_native_id, segment_comid = poi_record

        # ------------------------------------------------------------------
        # 3b. Get gage time-series (GAGE_FLOW) from POI_VALUES
        # ------------------------------------------------------------------
        timeseries_dict = {}

        for var_name, var_id in variable_ids.items():
            cursor.execute("""
                SELECT LOCAL_DATE_TIME, DATA_VALUE
                FROM POI_VALUES
                WHERE POI_ID = ? AND VARIABLE_ID = ?
                ORDER BY LOCAL_DATE_TIME
            """, (poiid, var_id))
            records = cursor.fetchall()

            if not records:
                # No time series for this variable / gage
                continue

            df = pd.DataFrame(records, columns=["LocalDateTime", var_name])

            # 1) Parse the datetime column
            # If you know the exact format from the DB, be explicit; e.g. "%Y-%m-%d %H:%M:%S"
            df["LocalDateTime"] = pd.to_datetime(df["LocalDateTime"], errors="coerce")

            # 2) Quick sanity check BEFORE masking
            print(f"\n=== Gage {poiid}, variable {var_name} ===")
            print("dtype:", df["LocalDateTime"].dtype)
            print("min date:", df["LocalDateTime"].min())
            print("max date:", df["LocalDateTime"].max())

            # If everything is NaT, parsing failed
            if df["LocalDateTime"].isna().all():
                print("All LocalDateTime are NaT -> parsing problem.")
                continue

            # 3) Apply simulation window (both end points included)
            mask = df["LocalDateTime"].between(window_start, window_end, inclusive="both")
            df = df.loc[mask]

            if df.empty:
                print(f" -> No rows in window {window_start} to {window_end}")
                continue

            # Non-inplace set_index: avoids mutating a filtered slice
            timeseries_dict[var_name] = df.set_index("LocalDateTime")

        if not timeseries_dict:
            print(f"No gage time-series found in window for COMID {source_comid}")
            continue

        # Combine all variable DataFrames on datetime index
        timeseries = pd.concat(timeseries_dict.values(), axis=1, join="outer").reset_index()

        # Optional: small sanity print
        print(f"Gage {idx}: POI_NATIVE_ID={poi_native_id}, segment_comid={segment_comid}")
        print(timeseries.head())

        ##########################################################################
        # MODEL HOOK BLOCK – use these variables for your WMM / hydrologic model
        #
        # 1. `poi_native_id`  – Native ID of the USGS gage (POI_NATIVE_ID)
        # 2. `segment_comid`  – Hydrofabric segment ID where the gage occurs
        # 3. `timeseries`     – Pandas DataFrame with columns:
        #                           LocalDateTime, GAGE_FLOW
        #
        # -> Here is where you call your model-specific constructor / API:
        #
        #        gage_id          = poi_native_id,
        #        segment_comid    = segment_comid,
        #        ts_df            = timeseries,
        #
        ##########################################################################
finally:
    # Close DB when finished (also runs when an error is raised above)
    conn.close()


OperationalError: unable to open database file

In [8]:
# ------------------------------------------------------------------
# SUMMARY TABLE FOR ALL GAGES
# Columns:
#   GageID          – Native POI ID of the gage
#   SegmentCOMID    – Hydrofabric segment COMID
#   StartDate       – First available datetime in DB
#   EndDate         – Last available datetime in DB
#
# Relies on `gage_layer`, `gage_poi_type_id`, `db_path` and `huc4_code`
# from the cells above.
# ------------------------------------------------------------------

# Distinct, non-missing hydrofabric IDs; .tolist() yields native Python
# values that sqlite3 can bind as parameters.
gage_comids = gage_layer["hy_id"].dropna().unique().tolist()

if len(gage_comids) == 0:
    raise RuntimeError("No COMIDs found in gage_layer['hy_id'].")

# One "?" per COMID so the IN (...) list stays fully parameterized
placeholders = ",".join(["?"] * len(gage_comids))

sql = f"""
    SELECT 
        p.POI_NATIVE_ID      AS GageID,
        p.POI_FLOW_COMID     AS SegmentCOMID,
        MIN(v.LOCAL_DATE_TIME) AS StartDate,
        MAX(v.LOCAL_DATE_TIME) AS EndDate
    FROM POI p
    JOIN POI_VALUES v
        ON p.POI_ID = v.POI_ID
    WHERE p.POI_TYPE_ID = ?
      AND p.POI_FLOW_COMID IN ({placeholders})
    GROUP BY 
        p.POI_ID,
        p.POI_NATIVE_ID,
        p.POI_FLOW_COMID
    ORDER BY p.POI_FLOW_COMID
"""

params = [gage_poi_type_id] + gage_comids

# The connection used by the previous cell is closed at the end of that
# cell, so this cell opens (and always closes) its own connection.
summary_conn = sqlite3.connect(db_path)
try:
    gage_summary_df = pd.read_sql_query(sql, summary_conn, params=params)
finally:
    summary_conn.close()

# Convert to datetime; unparseable values become NaT instead of raising
gage_summary_df["StartDate"] = pd.to_datetime(gage_summary_df["StartDate"], errors="coerce")
gage_summary_df["EndDate"]   = pd.to_datetime(gage_summary_df["EndDate"], errors="coerce")

print("\n=== GAGE DATA SUMMARY (window-agnostic) ===")
print(gage_summary_df.head())

# Persist the full summary next to the other data files
output_path = f"data/gage_table_{huc4_code}.csv"
gage_summary_df.to_csv(output_path, index=False)


NameError: name 'gage_layer' is not defined