In [None]:
import json
import os
from pathlib import Path

import folium
import geopandas as gpd
import pandas as pd
from matplotlib import pyplot as plt
from shapely.geometry import Point

In [None]:
# Root directory of the estuary dataset. Defaults to the original external-volume
# location; set the ESTUARY_BASE environment variable to point the notebook at a
# different copy of the data without editing this cell.
BASE = Path(os.environ.get("ESTUARY_BASE", "/Volumes/x10pro/estuary/"))

In [None]:
# Site codes listed in geos/skipped_regions.csv; later cells exclude these sites.
skipped_regions_table = pd.read_csv(BASE / "geos/skipped_regions.csv")
skipped_regions = skipped_regions_table["Site code"].tolist()

In [None]:
# Load the site geometries, drop every site listed in `skipped_regions`, and index
# the remaining rows by their "Site code".
gdf = (
    gpd.read_file(BASE / "geos/ca_data_w_usgs.geojson")
    .loc[lambda sites: ~sites["Site code"].isin(skipped_regions)]
    .copy()
    .set_index("Site code")
)
gdf.head()

In [None]:
# Build one row per grid file in ca_grids/: the first geometry stored in the file,
# tagged with the site code parsed from the file name and the site name from `gdf`.
grid_rows = []
grid_crs = None  # CRS of the most recently read grid file

# Sorted so the row order does not depend on the filesystem's listing order.
for pth in sorted(Path(BASE / "ca_grids/").iterdir()):
    try:
        gid = int(pth.stem)
    except ValueError:
        # Entry is not named after a site code (e.g. a hidden file such as
        # ".DS_Store"); int() would otherwise abort the whole cell here.
        continue
    if gid in skipped_regions:
        continue
    tp_df = gpd.read_file(pth)
    grid_crs = tp_df.crs
    grid_rows.append(
        {
            "Site code": gid,
            "geometry": tp_df.geometry.iloc[0],
            # Raises KeyError if the grid's site code is missing from `gdf`.
            "Site name": gdf.at[gid, "Site name"],
        }
    )

rect_df = gpd.GeoDataFrame(grid_rows, geometry="geometry", crs=grid_crs)
rect_df.head()

In [None]:
# Collect the sites whose geometry intersects at least one grid rectangle and whose
# intersecting rectangles all carry that same site code.
mm = []
for site_code, site_geometry in gdf.geometry.items():
    hit_codes = rect_df.loc[rect_df.intersects(site_geometry), "Site code"]
    if hit_codes.empty:
        continue
    if hit_codes.eq(site_code).all():
        mm.append(site_code)

# Display the sites whose rectangle is not in that cleanly matched set.
unmatched_codes = rect_df.loc[~rect_df["Site code"].isin(mm), "Site code"]
gdf.loc[unmatched_codes]

In [None]:
# Load the EMPA grab events and keep only usable rows: latitude above -87 and a
# tide value other than "Not recorded".
# NOTE(review): the -87 cutoff presumably removes sentinel coordinates — confirm.
df = pd.read_csv(BASE / "ca_all/empa/grab_events.csv")
# .copy() so the column assignments below write to an independent frame, not a slice.
df = df[(df.latitude > -87) & (df.tide != "Not recorded")].copy()

# Positive longitudes are flipped to negative.
# NOTE(review): presumably a missing minus sign for western-hemisphere sites — confirm.
df.loc[df.longitude > 0, "longitude"] *= -1

# Step 1: clean the timestamp text and parse it as a naive datetime (no tz yet).
# Values that do not match the format become NaT.
collected_text = (
    df["samplecollectiondate"]
    .str.replace("\u202f", " ", regex=False)  # replace narrow no-break space
    .str.replace("\u200b", "", regex=False)  # remove zero-width space, just in case
)
collected_naive = pd.to_datetime(
    collected_text,
    format="%m/%d/%y %I:%M %p",  # 12-hour time with AM/PM
    errors="coerce",
)

# Step 2: reduce to the calendar day and pin it to 12:00 while still naive, then
# localize to Pacific. Localizing the raw wall-clock time first can raise on times
# that do not exist or are ambiguous around DST changes, and adding 12 hours to a
# tz-aware midnight lands on 11:00 or 13:00 local on DST-transition days.
# Noon is always a valid, unambiguous local time.
collected_local_noon = (collected_naive.dt.normalize() + pd.Timedelta(hours=12)).dt.tz_localize(
    "America/Los_Angeles"
)

# Step 3: convert to UTC for storage
df["samplecollectiondate"] = collected_local_noon.dt.tz_convert("UTC")
df.head()

In [None]:
# One representative location per (siteid, stationno) pair: the median longitude
# and median latitude over all of that station's grab events.
rpoints = {
    station_key: Point(events.longitude.median(), events.latitude.median())
    for station_key, events in df.groupby(["siteid", "stationno"])
}

len(rpoints)

In [None]:
# Give every grab event the representative point of its (siteid, stationno) station.
geometry = [rpoints[station_key] for station_key in zip(df.siteid, df.stationno, strict=False)]

# Wrap the events in a GeoDataFrame using those points, in lon/lat (EPSG:4326).
empa_gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

empa_gdf.head()

In [None]:
# One point per EMPA station, built from the representative points in `rpoints`.
station_records = [
    {"siteid": site_id, "stationno": station_no, "geometry": point}
    for (site_id, station_no), point in rpoints.items()
]
all_sensors = gpd.GeoDataFrame(station_records, crs="wgs84")

# Join each station to its nearest site after reprojecting both to ESRI:54008,
# keeping only pairs within max_distance=1700 and recording the distance in "dist".
# NOTE(review): 1700 is in the projection's units (presumably metres) — confirm.
stations_projected = all_sensors.to_crs("ESRI:54008")
sites_projected = gdf[["geometry", "Site name"]].to_crs("ESRI:54008")
nearests = gpd.sjoin_nearest(
    stations_projected,
    sites_projected,
    how="inner",
    max_distance=1700,
    distance_col="dist",
)

# Sort so the closest station of each site comes first, then keep a single row per
# (siteid, Site code) pair.
nearests = nearests.sort_values(by=["Site code", "dist", "siteid"])
nearests = nearests.drop_duplicates(["siteid", "Site code"])

# Manually reviewed and filtered
nearests = nearests[nearests.siteid != "SC-CARP"]

nearests

In [None]:
# Map each "Site code" to the siteid of the first `nearests` row it appears in;
# later rows for an already-mapped site code are ignored.
matching_sites = {}
for _, nearest_row in nearests.iterrows():
    matching_sites.setdefault(nearest_row["Site code"], nearest_row.siteid)

# Reverse lookup: siteid -> Site code.
revmatching_sites = {site_id: site_code for site_code, site_id in matching_sites.items()}

# Persist the Site code -> siteid mapping as JSON.
with open(BASE / "geos/ca_empa_matching_sites.json", "w") as out_file:
    out_file.write(json.dumps(matching_sites))

matching_sites

In [None]:
# Keep only the grab events whose siteid was matched to a site.
matched_siteids = list(matching_sites.values())
filt_empa_gdf = empa_gdf.loc[empa_gdf["siteid"].isin(matched_siteids)].copy()
filt_empa_gdf.head()

In [None]:
# Distinct tide values present in the matched grab events.
filt_empa_gdf["tide"].unique()

In [None]:
# Print every (estuary, sample date) group whose rows do not all share the tide
# value of the group's first row.
for (estuary, sample_date), events in filt_empa_gdf.groupby(["estuaryname", "samplecollectiondate"]):
    tides = events.tide
    if tides.ne(tides.iloc[0]).any():
        print(estuary, sample_date)
        print(tides.tolist())

In [None]:
# Flag grab events whose tide value is exactly "closed".
filt_empa_gdf["mouth_closed"] = filt_empa_gdf["tide"].eq("closed")

# One label per (estuary, sample date): closed if any event in the group is closed.
empa_labels = filt_empa_gdf.groupby(["estuaryname", "samplecollectiondate"], as_index=False).agg(
    mouth_closed=("mouth_closed", "any")
)

# siteid of the first grab event of each estuary, in `filt_empa_gdf` row order.
first_event_per_estuary = filt_empa_gdf[["estuaryname", "siteid"]].drop_duplicates("estuaryname")
siteid_by_estuary = dict(
    zip(first_event_per_estuary["estuaryname"], first_event_per_estuary["siteid"], strict=False)
)

# Attach that siteid and its matched "Site code" to every label row.
empa_labels["Site code"] = -1
empa_labels["siteid"] = ""
for label_idx, estuary in empa_labels["estuaryname"].items():
    estuary_siteid = siteid_by_estuary[estuary]
    empa_labels.loc[label_idx, "Site code"] = revmatching_sites[estuary_siteid]
    empa_labels.loc[label_idx, "siteid"] = estuary_siteid

print(len(empa_labels))

empa_labels.head()

In [None]:
# Numeric status for plotting: open=1, closed=0
empa_labels["mouth_status"] = 1 - empa_labels["mouth_closed"].astype(int)

# Single figure with one axes
fig, ax = plt.subplots(figsize=(8, 4))

# One scatter series per estuary, drawn in chronological order
for estuary, estuary_labels in empa_labels.groupby("estuaryname"):
    chronological = estuary_labels.sort_values("samplecollectiondate")
    ax.scatter(
        chronological["samplecollectiondate"],
        chronological["mouth_status"],
        marker="o",
        label=estuary,
        s=5,
    )

ax.set_yticks([0, 1])
ax.set_yticklabels(["Closed", "Open"])
ax.set_ylabel("Mouth Status")
ax.set_title("Estuary Mouth Status Over Time")
ax.legend(
    title="Estuary",
    loc="upper center",
    bbox_to_anchor=(0.5, -0.25),  # place the legend below the x-axis
    ncol=3,  # lay the entries out in three columns
)
ax.grid(True, axis="y", alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Per-estuary summary: number of labels and the fraction of them that are closed,
# rounded to two decimals.
estuary_summary = empa_labels.groupby("estuaryname", as_index=False).agg(
    total_count=("mouth_closed", "size"),
    closed_pct=("mouth_closed", "mean"),
)
estuary_summary.round(2)

In [None]:
# Text label derived from the numeric status: 1 -> "open", anything else -> "closed".
empa_labels["label"] = empa_labels["mouth_status"].eq(1).map({True: "open", False: "closed"})
empa_labels["instrument"] = "empa"

# Export frame: drop the working columns and rename the date and site-code columns.
working_columns = ["mouth_closed", "siteid", "estuaryname", "mouth_status"]
export_names = {"samplecollectiondate": "acquired", "Site code": "region"}
empa_labels_save = empa_labels.drop(columns=working_columns).rename(columns=export_names)

empa_labels_save.to_csv(BASE / "geos" / "empa_labels.csv", index=False)

In [None]:
# siteid <-> estuaryname lookups built from the grab events; when a key repeats,
# the last row seen wins. `revlookup` is not used in this cell but is kept defined.
lookup = dict(zip(df.siteid, df.estuaryname, strict=False))
revlookup = dict(zip(df.estuaryname, df.siteid, strict=False))

# Build the station table once (one row per (siteid, stationno) in `rpoints`);
# both exports below are derived from it instead of rebuilding it twice.
all_station_sensors = gpd.GeoDataFrame(
    [
        {"siteid": s, "estuaryname": lookup[s], "stationno": ss, "geometry": p}
        for (s, ss), p in rpoints.items()
    ],
    crs="wgs84",
)

# Export 1: only the stations whose siteid was matched to a site. The matched ids
# are collected once instead of rebuilding a list for every station.
matched_siteids = set(matching_sites.values())
sensors = all_station_sensors[all_station_sensors["siteid"].isin(matched_siteids)].reset_index(
    drop=True
)
sensors.to_file(BASE / "geos/filtered_empa_sites.geojson")

# Export 2: every station. `sensors` is left bound to the full table, as before.
sensors = all_station_sensors
sensors.to_file(BASE / "geos/empa_sites.geojson")

In [None]:
# Stations whose siteid was NOT matched to any site.
matched_siteids = set(matching_sites.values())
sensors = gpd.GeoDataFrame(
    [
        {"siteid": site_id, "estuaryname": lookup[site_id], "stationno": station_no, "geometry": point}
        for (site_id, station_no), point in rpoints.items()
        if site_id not in matched_siteids
    ],
    crs="wgs84",
)

# Sites that have no matched station.
# NOTE(review): this reads from a hard-coded user directory rather than BASE — confirm
# whether the file should be loaded from BASE / "geos" instead.
aaa = gpd.read_file("/Users/kyledorman/data/estuary/geos/ca_data.geojson")
matched_site_codes = list(matching_sites.keys())
aaa = aaa.loc[~aaa["Site code"].isin(matched_site_codes)]

# Center the map on the middle of the sites' bounding box
bounds = aaa.total_bounds  # [minx, miny, maxx, maxy]
min_x, min_y, max_x, max_y = bounds
m = folium.Map(location=[(min_y + max_y) / 2, (min_x + max_x) / 2], zoom_start=8)

# Sites: black circle markers (folium expects [lat, lon], i.e. [y, x])
for _, row in aaa.iterrows():
    site_marker = folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=6,
        color="black",
        fill=True,
        fill_color="black",
        popup=f"Site: {row['Site code']}",
    )
    site_marker.add_to(m)

# Stations: red icon markers
for _, row in sensors.iterrows():
    sensor_marker = folium.Marker(
        location=[row.geometry.y, row.geometry.x],
        icon=folium.Icon(color="red", icon="circle", prefix="fa"),
        popup=f"Sensor: {row['estuaryname']}",
    )
    sensor_marker.add_to(m)

# Show the map
m

In [None]:
# Stations whose siteid WAS matched to a site.
matched_siteids = set(matching_sites.values())
sensors = gpd.GeoDataFrame(
    [
        {"siteid": site_id, "estuaryname": lookup[site_id], "stationno": station_no, "geometry": point}
        for (site_id, station_no), point in rpoints.items()
        if site_id in matched_siteids
    ],
    crs="wgs84",
)

# Sites that have a matched station.
# NOTE(review): this reads from a hard-coded user directory rather than BASE — confirm
# whether the file should be loaded from BASE / "geos" instead.
aaa = gpd.read_file("/Users/kyledorman/data/estuary/geos/ca_data.geojson")
matched_site_codes = list(matching_sites.keys())
aaa = aaa.loc[aaa["Site code"].isin(matched_site_codes)]

# The markers below use the geometries' x/y directly as lon/lat, so both frames are
# assumed to be in EPSG:4326; reproject with .to_crs(epsg=4326) first if they are not.

# Center the map on the middle of the sites' bounding box
bounds = aaa.total_bounds  # [minx, miny, maxx, maxy]
min_x, min_y, max_x, max_y = bounds
m = folium.Map(location=[(min_y + max_y) / 2, (min_x + max_x) / 2], zoom_start=8)

# Sites: black circle markers (folium expects [lat, lon], i.e. [y, x])
for _, row in aaa.iterrows():
    site_marker = folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=6,
        color="black",
        fill=True,
        fill_color="black",
        popup=f"Site: {row['Site code']}",
    )
    site_marker.add_to(m)

# Stations: red icon markers
for _, row in sensors.iterrows():
    sensor_marker = folium.Marker(
        location=[row.geometry.y, row.geometry.x],
        icon=folium.Icon(color="red", icon="circle", prefix="fa"),
        popup=f"Sensor: {row['estuaryname']}",
    )
    sensor_marker.add_to(m)

# Show the map
m