In [58]:
# ==============================
# User Parameters
# ==============================

TRACK_FILE = "track01.kml"
DATA_FILE = "data.csv"
SEARCH_RADIUS = 5                      # search radius
ONLY_UNMAPPED = True                   # << map only markers without ref:hmdb
EXPORT_RESULTS = False                 # << toggle CSV export
SHOW_MAP_IN_NOTEBOOK = False           # display inside Jupyter
OPEN_MAP_IN_BROWSER = True             # open in browser window
EXPORT_GEOJSON_COMBINED = False        # toggle ON/OFF
EXPORT_KML = False

# ==============================
# Imports
# ==============================

import xml.etree.ElementTree as ET
import pandas as pd
from shapely.geometry import LineString, Point
from shapely.ops import transform
from geopy.distance import geodesic
import pyproj
from IPython.display import display
import webbrowser
import folium
from folium.plugins import MarkerCluster
from folium import FeatureGroup, LayerControl, Map, CircleMarker, GeoJson
import geopandas as gpd
import html
import functools

# ==============================
# Load GPS Route from KML
# ==============================

def load_kml_route(path):
    tree = ET.parse(path)
    root = tree.getroot()
    ns = {"kml": "http://www.opengis.net/kml/2.2"}

    coords = []
    for c in root.findall(".//kml:coordinates", ns):
        text = c.text.strip()
        for row in text.split():
            lon, lat, *_ = map(float, row.split(","))
            coords.append((lon, lat))

    return LineString(coords)


# ==============================
# Load Route + Data
# ==============================

route = load_kml_route(TRACK_FILE)
print(f"Route loaded: {len(route.coords)} points")

df = pd.read_csv(DATA_FILE, low_memory=False)
print(f"Loaded dataset: {len(df)} rows")


# ==============================
# Filtering: Unmapped / All
# ==============================

if ONLY_UNMAPPED:
    markers = df[df["ref:hmdb"].isna() | (df["ref:hmdb"].astype(str).str.strip() == "")]
    print(f"Filtering to UNMAPPED markers only: {len(markers)} rows")
else:
    markers = df.copy()
    print(f"Using ALL markers: {len(markers)} rows")


# ==============================
# Coordinate Cleanup
# ==============================

markers = markers.copy()  # avoid SettingWithCopyWarning explicitly

LAT_COL = "thc:Latitude"
LON_COL = "thc:Longitude"

markers.loc[:, LAT_COL] = pd.to_numeric(markers[LAT_COL], errors='coerce')
markers.loc[:, LON_COL] = pd.to_numeric(markers[LON_COL], errors='coerce')

markers = markers.dropna(subset=[LAT_COL, LON_COL]).copy()
print(f"Valid coordinate markers: {len(markers)}")

markers.loc[:, "geometry"] = markers.apply(
    lambda r: Point(r[LON_COL], r[LAT_COL]),
    axis=1
)


# ==============================
# Distance Filter & Optional CSV Export
# ==============================

# Project to a metric CRS so distance is in meters
proj = pyproj.Transformer.from_crs(
    "EPSG:4326",
    "EPSG:3857",
    always_xy=True
).transform

route_proj = transform(proj, route)

def fast_distance_miles(pt):
    pt_proj = transform(proj, pt)
    return route_proj.distance(pt_proj) / 1609.34   # meters ‚Üí miles

near = markers[markers["geometry"].apply(
    lambda p: fast_distance_miles(p) <= SEARCH_RADIUS
)]

tag = "unmapped" if ONLY_UNMAPPED else "all"
output_csv = f"near_route_{tag}_{SEARCH_RADIUS}mi.csv"

if EXPORT_RESULTS:
    near.to_csv(output_csv, index=False)
    print(f"Found {len(near)} {tag} markers within {SEARCH_RADIUS} miles of the route.")
    print(f"Saved results ‚Üí {output_csv}")
else:
    print(f"Found {len(near)} {tag} markers within {SEARCH_RADIUS} miles of the route.")
    print("EXPORT_RESULTS=False ‚Üí CSV not written.")

# ==============================
# Folium Interactive Map
# ==============================

# Center map on route midpoint
c_lon, c_lat = route.coords[len(route.coords) // 2]
m = folium.Map(location=[c_lat, c_lon], zoom_start=9)

# Feature groups
fg_route    = FeatureGroup(name="Route Line", show=True)
fg_unmapped = FeatureGroup(name="Unmapped (HMDB missing)", show=True)
fg_mapped   = FeatureGroup(name="Mapped (HMDB present)", show=True)

# Draw route line
GeoJson(route, name="Route").add_to(fg_route)

# Marker clusters
clu_unmapped = MarkerCluster(name="Unmapped Cluster")
clu_mapped   = MarkerCluster(name="Mapped Cluster")

# ---------------------------------
# Add markers to the map
# ---------------------------------
for _, row in near.iterrows():

    lat, lon = row[LAT_COL], row[LON_COL]
    name = row.get("name", "Unnamed Marker")

    # HMDB detection
    val = row.get("ref:hmdb")
    is_mapped = pd.notna(val) and str(val).strip().lower() not in ["", "nan", "none"]
    color = "green" if is_mapped else "red"

    popup_html = f"""
        <b>{name}</b><br>
        County: {row.get('addr:county', '')}<br>
        HMDB: {val if is_mapped else '<i>None</i>'}
    """

    marker = CircleMarker(
        location=[lat, lon],
        radius=6,
        color=color,
        fill=True,
        fill_opacity=0.85,
        popup=popup_html,
        tooltip=name
    )

    # Add to correct cluster
    if is_mapped:
        marker.add_to(clu_mapped)
    else:
        marker.add_to(clu_unmapped)

# Attach clusters to feature groups (correct placement)
clu_unmapped.add_to(fg_unmapped)
clu_mapped.add_to(fg_mapped)

# Add feature groups to map
fg_route.add_to(m)
fg_unmapped.add_to(m)
fg_mapped.add_to(m)

# Layer toggle control
LayerControl(collapsed=False).add_to(m)

# Auto-zoom to bounds
if len(near) > 0:
    bounds = [[row[LAT_COL], row[LON_COL]] for _, row in near.iterrows()]
    m.fit_bounds(bounds)

# ==============================
# Legend (Top-Right, Always Visible)
# ==============================

legend_html = """
<div style="
    position: fixed;
    top: 10px;
    left: 60px;                      /* distance from zoom buttons       */
    z-index: 999999;
    background: rgba(255,255,255,0.88);
    padding: 6px 10px;               /* smaller box                      */
    border-radius: 6px;
    box-shadow: 0 1px 4px rgba(0,0,0,0.25);
    font-size:12px;                  /* smaller text                     */
    line-height:14px;
">
    <b style="font-size:13px;">Legend</b><br>
    <span style="color:blue; font-size:14px;">‚ñ¨</span> Route<br>
    <span style="color:red; font-size:14px;">‚óè</span> Unmapped<br>
    <span style="color:green; font-size:14px;">‚óè</span> Mapped
</div>
"""

m.get_root().html.add_child(folium.Element(legend_html))


# ============================================================
# INTERACTIVE EXPORT BUTTONS ‚Äî KML FOR UNMAPPED OR ALL MARKERS
# ============================================================

from IPython.display import display
import ipywidgets as widgets
import datetime


def export_kml(unmapped_only=True):
    tag    = "unmapped" if unmapped_only else "all"
    stamp  = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    filename = f"THC_markers_{tag}_{SEARCH_RADIUS}mi_{stamp}.kml"

    # Filter rows based on user choice
    df_kml = (
        markers[markers['mapped_status'] == "unmapped"].copy()
        if unmapped_only
        else markers.copy()
    )

    # Build XML list
    kml = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<kml xmlns="http://www.opengis.net/kml/2.2"><Document>',
        f"<name>THC Markers ({tag}, {SEARCH_RADIUS}mi)</name>"
    ]

    for _, r in df_kml.iterrows():
        lat, lon = r[LAT_COL], r[LON_COL]
        name     = r.get("name", "Unknown")
        ref_tx   = r.get("ref_us_tx_thc", "")
        hmdb     = r.get("ref_hmdb", "")

        kml.append(f"""
        <Placemark>
            <name>{name}</name>
            <description><![CDATA[
                <b>{name}</b><br>
                THC ID: {ref_tx}<br>
                HMDB: {hmdb if hmdb else "<i>None</i>"}
            ]]></description>
            <Point><coordinates>{lon},{lat},0</coordinates></Point>
        </Placemark>
        """)

    kml.append("</Document></kml>")

    # Write file
    with open(filename, "w", encoding="utf-8") as f:
        f.write("".join(kml))

    print(f"üìç KML export complete ‚Üí {filename}")
    return filename

# ===========================
# Export Buttons UI
# ===========================

btn_unmapped = widgets.Button(
    description="Export Unmapped KML",
    button_style="danger"
)

btn_all = widgets.Button(
    description="Export All KML",
    button_style="success"
)


def on_unmapped_click(b):
    export_kml(unmapped_only=True)

def on_all_click(b):
    export_kml(unmapped_only=False)


btn_unmapped.on_click(on_unmapped_click)
btn_all.on_click(on_all_click)

display(widgets.HBox([btn_unmapped, btn_all]))

# ==============================
# Save HTML only if browser launch requested
# ==============================

map_filename = f"near_route_map_{tag}_{SEARCH_RADIUS}mi.html"

if OPEN_MAP_IN_BROWSER:
    m.save(map_filename)
    print(f"Map saved ‚Üí {map_filename}")
    webbrowser.open(map_filename)

# Display in notebook
if SHOW_MAP_IN_NOTEBOOK:
    display(m)


# ==============================
# Export Route + Markers to GeoJSON
# ==============================

if EXPORT_GEOJSON_COMBINED:

    KEEP_FIELDS = ["name", "ref:US-TX:thc", "ref:hmdb", "mapped_status"]
    slim_markers = near[KEEP_FIELDS + ["geometry"]].copy()

    # Convert markers ‚Üí GeoDataFrame
    gdf_markers = gpd.GeoDataFrame(
        near.copy(), geometry="geometry", crs="EPSG:4326"
    )

    # Convert route ‚Üí GeoDataFrame
    gdf_route = gpd.GeoDataFrame(
        [{"name": "travel_route", "geometry": route}],
        crs="EPSG:4326"
    )

    # Combine datasets
    combined = pd.concat([gdf_route, gdf_markers], ignore_index=True)
    combined = gpd.GeoDataFrame(combined, geometry="geometry", crs="EPSG:4326")

    output_geojson = f"combined_route_markers_{tag}_{SEARCH_RADIUS}mi.geojson"
    combined.to_file(output_geojson, driver="GeoJSON")

    print(f"\nüó∫ GeoJSON Exported ‚Üí {output_geojson}")
    print("Drag into https://geojson.io or QGIS to view route + markers together!\n")


# ==============================
# Export Clean Google-Friendly KML
# ==============================

def xml_safe(text):
    if text is None:
        return ""
    return html.escape(str(text))   # replaces &,<,>,",'


kml_filename = f"THC_markers_route_{SEARCH_RADIUS}mi.kml"

# Prepare export dataframe
export = near.copy()
export["mapped_status"] = export["ref:hmdb"].apply(
    lambda x: "mapped" if pd.notna(x) and str(x).strip() != "" else "unmapped"
)

export = export.rename(columns={
    "ref:US-TX:thc": "ref_tx_thc",
    "ref:hmdb":      "ref_hmdb"
})

export = export[["name", "ref_tx_thc", "ref_hmdb", "mapped_status", "geometry"]].copy()


# ------------------ Write KML File ------------------

with open(kml_filename, "w", encoding="utf-8") as f:

    # Header + styles
    f.write(
        """<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document>

    <name>Texas Historical Markers</name>

    <!-- marker styles -->
    <Style id="mapped">
        <IconStyle><color>ff00ff00</color><scale>1.2</scale></IconStyle>
    </Style>

    <Style id="unmapped">
        <IconStyle><color>ff0000ff</color><scale>1.2</scale></IconStyle>
    </Style>
"""
    )

    # Route Folder
    if isinstance(route, LineString):
        coords = " ".join(f"{x},{y},0" for x, y in route.coords)

        f.write(
            f"""
    <Folder>
        <name>Route Line</name>
        <Placemark>
            <name>Route</name>
            <Style><LineStyle><color>ff0000ff</color><width>4</width></LineStyle></Style>
            <LineString>
                <tessellate>1</tessellate>
                <coordinates>{coords}</coordinates>
            </LineString>
        </Placemark>
    </Folder>
"""
        )

    # Markers Folder
    f.write("    <Folder><name>Markers</name>\n")

    for _, r in export.iterrows():
        if not isinstance(r.geometry, Point):
            continue

        lat, lon = r.geometry.y, r.geometry.x
        name   = xml_safe(r["name"])
        thc    = xml_safe(r["ref_tx_thc"])
        hmdb   = xml_safe(r["ref_hmdb"])
        status = r["mapped_status"]
        style  = "mapped" if status == "mapped" else "unmapped"

        desc = xml_safe(
            f"Name: {name} | THC: {thc} | HMDB: {hmdb} | Status: {status}"
        )

        f.write(
            f"""
        <Placemark>
            <name>{name}</name>
            <styleUrl>#{style}</styleUrl>
            <description>{desc}</description>
            <Point><coordinates>{lon},{lat},0</coordinates></Point>
        </Placemark>
"""
        )

    f.write("    </Folder>\n</Document>\n</kml>")


print(
    f"""
üìç Clean Google-Compatible KML Exported ‚Üí {kml_filename}

Open with:
‚Ä¢ Google MyMaps
‚Ä¢ Google Earth
‚Ä¢ OsmAnd / Maps.Me / Gaia
"""
)


Route loaded: 3048 points
Loaded dataset: 16974 rows
Filtering to UNMAPPED markers only: 5762 rows
Valid coordinate markers: 3812
Found 45 unmapped markers within 5 miles of the route.
EXPORT_RESULTS=False ‚Üí CSV not written.


HBox(children=(Button(button_style='danger', description='Export Unmapped KML', style=ButtonStyle()), Button(b‚Ä¶

Map saved ‚Üí near_route_map_unmapped_5mi.html

üìç Clean Google-Compatible KML Exported ‚Üí THC_markers_route_5mi.kml

Open with:
‚Ä¢ Google MyMaps
‚Ä¢ Google Earth
‚Ä¢ OsmAnd / Maps.Me / Gaia



In [59]:
# ======================
# User Parameters
# ======================

TRACK_FILE = "track01.kml"
DATA_FILE  = "data.csv"
SEARCH_RADIUS = 5
ONLY_UNMAPPED = True
EXPORT_RESULTS = True
SHOW_MAP_IN_NOTEBOOK = True
OPEN_MAP_IN_BROWSER = True
EXPORT_GEOJSON_COMBINED = False
EXPORT_KML = True

# ======================
# Core Imports (always needed)
# ======================
import xml.etree.ElementTree as ET
import pandas as pd
from shapely.geometry import LineString, Point
from shapely.ops import transform
import pyproj
import folium
from folium.plugins import MarkerCluster


############################
# Load Route from KML
############################
def load_kml_route(path):
    tree = ET.parse(path)
    coords=[]
    for c in tree.findall(".//{http://www.opengis.net/kml/2.2}coordinates"):
        coords.extend([(float(x),float(y)) for x,y,*_ in (row.split(",") for row in c.text.split())])
    return LineString(coords)

route = load_kml_route(TRACK_FILE)
print(f"Route points: {len(route.coords)}")

df = pd.read_csv(DATA_FILE)
print(f"Loaded {len(df)} rows")


############################
# Filtering / Preprocessing
############################
markers = df.copy()

if ONLY_UNMAPPED:
    markers = markers[markers["ref:hmdb"].fillna("").str.strip() == ""]
    print(f"Unmapped only ‚Üí {len(markers)}")
else:
    print(f"All markers ‚Üí {len(markers)}")

LAT_COL="thc:Latitude"; LON_COL="thc:Longitude"

markers[LAT_COL] = pd.to_numeric(markers[LAT_COL], errors='coerce')
markers[LON_COL] = pd.to_numeric(markers[LON_COL], errors='coerce')
markers = markers.dropna(subset=[LAT_COL,LON_COL])

markers["geometry"] = [Point(xy) for xy in zip(markers[LON_COL],markers[LAT_COL])]
print(f"Valid coords: {len(markers)}")

############################
# Distance filtering (fast)
############################
proj = pyproj.Transformer.from_crs("EPSG:4326","EPSG:3857",always_xy=True).transform
rproj = transform(proj, route)

# Distance vectorized faster than lambda apply
def dist(pt): return transform(proj,pt).distance(rproj) / 1609.34
near = markers[[dist(g)<=SEARCH_RADIUS for g in markers.geometry]]

tag="unmapped" if ONLY_UNMAPPED else "all"
print(f"Within {SEARCH_RADIUS}mi ‚Üí {len(near)}")

if EXPORT_RESULTS:
    near.to_csv(f"near_{tag}.csv",index=False)


############################
# Generate Map
############################
mid=route.coords[len(route.coords)//2]
m=folium.Map(location=[mid[1],mid[0]],zoom_start=9)

fgR=folium.FeatureGroup("Route")
folium.GeoJson(route).add_to(fgR)
fgR.add_to(m)

fgU=folium.FeatureGroup("Unmapped"); cluU=MarkerCluster().add_to(fgU)
fgM=folium.FeatureGroup("Mapped");   cluM=MarkerCluster().add_to(fgM)

for _,r in near.iterrows():
    is_mapped = pd.notna(r.get("ref:hmdb","")).and_(str(r["ref:hmdb"]).strip()!="")
    color="green" if is_mapped else "red"
    folium.CircleMarker(
        [r[LAT_COL],r[LON_COL]],
        radius=6,color=color,fill_opacity=.85,
        tooltip=r.get("name","Unnamed")
    ).add_to(cluM if is_mapped else cluU)

fgU.add_to(m); fgM.add_to(m)
folium.LayerControl(collapsed=False).add_to(m)

if len(near)>0:
    m.fit_bounds([[r[LAT_COL],r[LON_COL]] for _,r in near.iterrows()])


############################
# Legend (unchanged)
############################
m.get_root().html.add_child(folium.Element("""
<div style="position:fixed;top:10px;left:60px;z-index:9999;
background:rgba(255,255,255,.88);padding:6px 10px;border-radius:6px;
font-size:12px;box-shadow:0 1px 4px rgba(0,0,0,.25);">
<b style="font-size:13px;">Legend</b><br>
<span style="color:blue">‚ñ¨</span> Route<br>
<span style="color:red">‚óè</span> Unmapped<br>
<span style="color:green">‚óè</span> Mapped
</div>
"""))


############################
# Save HTML
############################
fname=f"map_{tag}_{SEARCH_RADIUS}mi.html"
m.save(fname)
print(f"Saved ‚Üí {fname}")

if OPEN_MAP_IN_BROWSER:
    import webbrowser
    webbrowser.open(fname)


#####


Route points: 3048
Loaded 16974 rows
Unmapped only ‚Üí 5762


  df = pd.read_csv(DATA_FILE)


Valid coords: 3812
Within 5mi ‚Üí 45


ModuleNotFoundError: No module named 'pandas.io.formats.csvs'

In [60]:
import pandas as pd
print(pd.__version__)
print(pd.__file__)


2.3.3
/home/joe/.local/share/pipx/venvs/notebook/lib/python3.12/site-packages/pandas/__init__.py
