# In [None]:
import os
import sqlite3
import numpy as np
import pandas as pd

# All input locations are resolved relative to the current working directory.
PROJECT_ROOT = os.getcwd()
_OUTPUT_DIR = os.path.join(PROJECT_ROOT, "Output")

# CSV read as the map-matching input, and the SQLite file opened for the link/trajs tables.
MM_PATH = os.path.join(_OUTPUT_DIR, "mapmatching", "mapmatching.csv")
DB_PATH = os.path.join(_OUTPUT_DIR, "database", "unified_database.db")

# =========================================================
# 1. Load mapmatching and database
# =========================================================
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
df_mm = pd.read_csv(MM_PATH, low_memory=False)

# Table names that contain "link" but must not be picked as the link table
# (presumably lookup/mapping tables -- confirm against the DB schema).
_LINK_TABLE_EXCLUSIONS = ("segmentid_to_link", "tmc_to_link", "zone_name_to_link")

conn = sqlite3.connect(DB_PATH)
try:
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", conn)
    table_names = tables["name"].tolist()
    print("Tables in DB:", table_names)

    # -------- Automatically find link table --------
    # First table whose lower-cased name contains "link" and none of the
    # excluded fragments.
    link_table = None
    for t in table_names:
        tl = t.lower()
        if "link" in tl and not any(x in tl for x in _LINK_TABLE_EXCLUSIONS):
            link_table = t
            break

    print("Using link table:", link_table)

    # Fail with an actionable message instead of running "SELECT * FROM None".
    if link_table is None:
        raise RuntimeError(
            f"No link table found in {DB_PATH!r}; tables present: {table_names}"
        )

    # Load full link table. The name is quoted as an SQL identifier so that
    # names with spaces, keywords or embedded quotes still work.
    quoted_link_table = '"' + link_table.replace('"', '""') + '"'
    df_link = pd.read_sql(f"SELECT * FROM {quoted_link_table}", conn)
finally:
    # Always release the connection, even when a query above fails.
    conn.close()

print("Columns in link table:", df_link.columns.tolist())

# -------- Automatically detect length column --------
# A column qualifies when its lower-cased name contains any of these fragments;
# the first qualifying column (in table column order) is used.
length_keywords = ("length", "len", "dist", "distance")
length_col_candidates = list(
    filter(
        lambda col: any(key in col.lower() for key in length_keywords),
        df_link.columns,
    )
)

if length_col_candidates:
    length_col = length_col_candidates[0]
else:
    # No match: downstream code falls back to route_dis.
    length_col = None
print("Detected length column:", length_col)

# =========================================================
# 2. Prepare mapmatching data
# =========================================================
def _as_link_key(ids):
    """Return *ids* as strings suitable for joining on link id.

    A float column whose non-missing values are all whole numbers is converted
    to nullable integers first, so 123.0 becomes "123" rather than "123.0".
    (pandas reads an integer CSV column as float when it contains blanks,
    which would otherwise never match integer ids coming from the database.)
    """
    if pd.api.types.is_float_dtype(ids) and (ids.dropna() % 1 == 0).all():
        ids = ids.astype("Int64")
    return ids.astype(str)


df_mm["time_parsed"] = pd.to_datetime(df_mm["time"], errors="coerce")
# Drop waypoints without a parsable timestamp or without a link id (the latter
# would otherwise turn into a bogus link named "nan"). .copy() gives an
# independent frame so the column assignments below are unambiguous.
df_mm = df_mm.dropna(subset=["time_parsed", "link_id"]).copy()

df_mm["link_id"] = _as_link_key(df_mm["link_id"])
df_link["link_id"] = _as_link_key(df_link["link_id"])

# If we have a link length column, merge it; otherwise, we'll use route_dis later
if length_col is not None:
    # One row per link id, so the left merge cannot multiply waypoint rows.
    link_lengths = df_link[["link_id", length_col]].drop_duplicates(subset="link_id")
    # If df_mm already has a column named like length_col, rename that one
    # (suffix "_mm") and keep the link-table column under its original name,
    # which is the name the later steps look up.
    df_mm = df_mm.merge(
        link_lengths, on="link_id", how="left", suffixes=("_mm", None)
    )
else:
    print("No length column found; using route_dis as effective distance.")

# Chronological order per agent; the segment step relies on this ordering.
df_mm = df_mm.sort_values(["agent_id", "time_parsed"])

# =========================================================
# 3. Compute speeds for waypoint_mapmatched
#    (per segment: agent + link)
# =========================================================
# A segment is one uninterrupted run of consecutive waypoints of the same agent
# on the same link. Grouping by (agent, link) alone would merge separate visits
# to a link (e.g. a return trip) and count the time in between as travel time.
# Relies on df_mm being sorted by agent_id and time_parsed.
_prev_agent = df_mm["agent_id"].shift()
_prev_link = df_mm["link_id"].shift()
_starts_new_visit = (df_mm["agent_id"] != _prev_agent) | (df_mm["link_id"] != _prev_link)
# Running counter that increases by one at every change of agent or link.
_visit_id = _starts_new_visit.cumsum().rename("_visit")

group = df_mm.groupby(["agent_id", "link_id", _visit_id], sort=False)

# First and last waypoint timestamps of each visit.
enter_time = group["time_parsed"].first()
exit_time  = group["time_parsed"].last()
travel_time_s = (exit_time - enter_time).dt.total_seconds()

if length_col is not None:
    # Length merged from the link table (identical for all rows of a link).
    seg_len = group[length_col].first()
else:
    # If no link length is available, use difference in route_dis
    seg_len = group["route_dis"].max() - group["route_dis"].min()

# One row per visit; all series share the same group index, so .values align.
df_seg = pd.DataFrame({
    "agent_id": enter_time.index.get_level_values("agent_id"),
    "link_id": enter_time.index.get_level_values("link_id"),
    "enter_time": enter_time.values,
    "exit_time": exit_time.values,
    "travel_time_s": travel_time_s.values,
    "length_eff": seg_len.values
})

# Clean
# Conversion factor from metres per second to miles per hour.
MPS_TO_MPH = 2.23694
# Speeds at or above this value (mph) are discarded as unrealistic.
MAX_PLAUSIBLE_MPH = 120

# Keep only segments with positive duration and positive length. .copy() makes
# the result an independent frame so adding a column below does not raise
# SettingWithCopyWarning.
df_seg = df_seg[(df_seg["travel_time_s"] > 0) & (df_seg["length_eff"] > 0)].copy()

# m/s -> mph. NOTE(review): this assumes length_eff is in metres. If the source
# column is in feet or km, every value is off by a constant factor and the
# fixed cutoff below removes a different share of segments -- confirm the units.
df_seg["speed_mph"] = (df_seg["length_eff"] / df_seg["travel_time_s"]) * MPS_TO_MPH

# Filter unrealistic values
df_seg = df_seg[(df_seg["speed_mph"] > 0) & (df_seg["speed_mph"] < MAX_PLAUSIBLE_MPH)].copy()

print("Waypoint map-matched speeds:", len(df_seg))

# =========================================================
# 4. Load speeds from trajs
# =========================================================
conn = sqlite3.connect(DB_PATH)
try:
    df_tp = pd.read_sql("SELECT * FROM trajs", conn)
finally:
    # Release the connection even if the query fails (e.g. missing table).
    conn.close()

# Non-numeric entries become NaN and are removed by the range filter below
# (comparisons with NaN are False).
df_tp["CrossingSpeedMph"] = pd.to_numeric(df_tp["CrossingSpeedMph"], errors="coerce")
# Same plausibility window as the waypoint speeds: 0 < speed < 120 mph.
# .copy() keeps df_tp an independent frame for any later column assignments.
df_tp = df_tp[(df_tp["CrossingSpeedMph"] > 0) & (df_tp["CrossingSpeedMph"] < 120)].copy()

print("Trajs speeds:", len(df_tp))

# =========================================================
# 5. Final statistics for the table
# =========================================================
def _summary_row(stats):
    """Relabel a ``Series.describe()`` result as one row of the report table."""
    return {
        "Count": int(stats["count"]),
        "Mean (mph)": stats["mean"],
        "Median (mph)": stats["50%"],
        "Std Dev (mph)": stats["std"],
        "Min (mph)": stats["min"],
        "Max (mph)": stats["max"],
        "Q25 (mph)": stats["25%"],
        "Q75 (mph)": stats["75%"],
    }


# Quartiles and median are requested explicitly so the "25%", "50%" and "75%"
# entries read by _summary_row are present.
wp_stats = df_seg["speed_mph"].describe(percentiles=[0.25, 0.5, 0.75])
tp_stats = df_tp["CrossingSpeedMph"].describe(percentiles=[0.25, 0.5, 0.75])

print("\nWaypoint Map-Matched Speed Stats:")
print(wp_stats)

print("\nTrajs Speed Stats:")
print(tp_stats)

# (Optional) build table similar to "Corridor Speed Distribution Analysis":
# one row per data source, one column per statistic, rounded to two decimals.
rows_by_source = {
    "Waypoint Map-Matched": _summary_row(wp_stats),
    "Trajs": _summary_row(tp_stats),
}
table = pd.DataFrame.from_dict(rows_by_source, orient="index").round(2)

print("\n=== Corridor Speed Distribution Analysis (using map-matched waypoint) ===")
print(table.to_string())
