# In [None]:
import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Every path below is resolved against the directory the script (or notebook
# kernel) was started from.
PROJECT_ROOT = os.getcwd()

# =========================================================
# 1. DATA LOADING
# =========================================================

# Waypoint raw
WP_RAW_PATH = os.path.join(
    PROJECT_ROOT, "data_cleaning_fusion_datasets", "waypoint", "waypoint.csv"
)
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df_wp_raw = pd.read_csv(WP_RAW_PATH, low_memory=False)

# Trajs raw (note: column names are not descriptive)
TP_RAW_PATH = os.path.join(
    PROJECT_ROOT, "data_cleaning_fusion_datasets", "trip path", "trajs.csv"
)
df_tp_raw = pd.read_csv(TP_RAW_PATH, low_memory=False)

# Printed so the positional column lookup used later can be checked by eye.
print("Raw trajs.csv columns:")
print(df_tp_raw.columns.tolist())

# Waypoint and trajs cleaned from the unified DB
DB_PATH = os.path.join(PROJECT_ROOT, "Output", "database", "unified_database.db")

# sqlite3.connect() creates an empty database when the file is missing, which
# would later surface as a confusing "no such table" error; fail early instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"Unified database not found: {DB_PATH}")

conn = sqlite3.connect(DB_PATH)
try:
    df_wp = pd.read_sql("SELECT * FROM waypoint", conn)
    df_tp = pd.read_sql("SELECT * FROM trajs", conn)
finally:
    # Close even if a query fails. (Using the connection as a context manager
    # would only commit/rollback, not close it.)
    conn.close()

# df_seg: waypoint map-matched by segment, with 'speed_mph'
# It is not built here, so it must already be defined in this namespace.
# Explicit raises are used because `assert` is removed when Python runs with -O.
if "df_seg" not in globals():
    raise NameError("df_seg must exist and contain column 'speed_mph'")
if "speed_mph" not in getattr(df_seg, "columns", ()):
    raise ValueError("df_seg must exist and contain column 'speed_mph'")

# =========================================================
# 2. HELPER FUNCTIONS
# =========================================================

def clean_speed(series, *, min_speed=0, max_speed=120):
    """Return the finite, in-range numeric values of *series*.

    Values are converted to numbers; anything that cannot be parsed, is
    missing, or is infinite is discarded, and only values strictly between
    ``min_speed`` and ``max_speed`` are kept.

    Args:
        series: Speed values as a pandas Series or any list-like that
            ``pd.Series`` accepts (list, tuple, NumPy array, ...).
        min_speed: Exclusive lower bound, in the same unit as *series*.
        max_speed: Exclusive upper bound, in the same unit as *series*.

    Returns:
        A pandas Series holding the surviving values. Rows keep the index
        labels they had in the input.
    """
    # Wrapping in pd.Series makes plain lists/arrays work: for those inputs
    # pd.to_numeric returns an ndarray, which has no .replace().
    s = pd.to_numeric(pd.Series(series), errors="coerce")
    # Treat +/-inf like missing data so one dropna() removes every unusable row.
    s = s.replace([np.inf, -np.inf], np.nan).dropna()
    return s[(s > min_speed) & (s < max_speed)]

# =========================================================
# 3. SPEED SERIES
# =========================================================

# ---- Waypoint speeds at each pipeline stage ----
wp_input_speed, wp_clean_speed, wp_mm_speed = (
    clean_speed(df_wp_raw["speed_mph"]),  # raw CSV
    clean_speed(df_wp["speed_mph"]),      # DB table (only duplicates removed)
    clean_speed(df_seg["speed_mph"]),     # map-matched, one value per segment
)

# ---- Trajs speeds: raw CSV input and cleaned DB table ----
# The raw file has no descriptive header, so the speed column is picked by
# position: per the tool specification, 0-based column 14 is CrossingSpeedKph.
RAW_SPEED_COL = df_tp_raw.columns[14]
tp_input_speed_kph = pd.to_numeric(df_tp_raw[RAW_SPEED_COL], errors="coerce")
# Convert kph to mph before filtering so the bounds are applied in mph.
tp_input_speed = clean_speed(tp_input_speed_kph * 0.621371)

# The DB table already stores the cleaned + map-matched speed in mph.
tp_clean_speed = clean_speed(df_tp["CrossingSpeedMph"])

# Report how many usable samples each series retained.
print("Counts:")
print(
    "\n".join(
        f"{label} {len(speeds)}"
        for label, speeds in (
            ("  Waypoint input:     ", wp_input_speed),
            ("  Waypoint cleaned:   ", wp_clean_speed),
            ("  Waypoint mapmatched:", wp_mm_speed),
            ("  Trajs input:        ", tp_input_speed),
            ("  Trajs cleaned:      ", tp_clean_speed),
        )
    )
)

# =========================================================
# 4. 4-PANEL FIGURE
# =========================================================

# 2x2 grid: histograms on the top row, boxplots on the bottom row;
# waypoint data in the left column, trajs data in the right column.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# ---------- PANEL 1: WAYPOINT histogram ----------
ax = axes[0, 0]
bins = 100

# density=True normalises each histogram to unit area, so series with
# different sample counts can be overlaid; alpha keeps the overlaps visible.
ax.hist(wp_input_speed, bins=bins, density=True, alpha=0.35, label="Input")
ax.hist(wp_clean_speed, bins=bins, density=True, alpha=0.35, label="Cleaned")
ax.hist(wp_mm_speed,    bins=bins, density=True, alpha=0.35, label="Map-Matched")

ax.set_title("WAYPOINT: Speed Distribution (Input → Cleaned → Map-Matched)", fontsize=12)
ax.set_xlabel("Speed (mph)")
ax.set_ylabel("Density")
ax.set_xlim(0, 140)
ax.legend()

# ---------- PANEL 2: TRAJS histogram (Input vs Cleaned) ----------
ax = axes[0, 1]
bins_tp = 80

ax.hist(tp_input_speed, bins=bins_tp, density=True, alpha=0.35, label="Input")
ax.hist(tp_clean_speed, bins=bins_tp, density=True, alpha=0.35, label="Cleaned + Map-Matched")

ax.set_title("Trajs: Speed Distribution (Input vs Cleaned + Map-Matched)", fontsize=12)
ax.set_xlabel("Speed (mph)")
ax.set_ylabel("Density")
ax.set_xlim(0, 140)
ax.legend()

# ---------- PANEL 3: WAYPOINT boxplots ----------
ax = axes[1, 0]
bp = ax.boxplot(
    [wp_input_speed, wp_clean_speed, wp_mm_speed],
    patch_artist=True,  # draw boxes as patches so they can be filled below
)
# Label the boxes through the axis instead of boxplot's `labels=` keyword:
# Matplotlib 3.9 deprecated it (renamed to `tick_labels`), whereas
# set_xticklabels behaves the same on old and new releases.
ax.set_xticklabels(["Input", "Cleaned", "Map-Matched"])

colors = ["lightcoral", "lightblue", "lightgreen"]
for patch, c in zip(bp["boxes"], colors):
    patch.set_facecolor(c)
    patch.set_alpha(0.7)

ax.set_title("WAYPOINT: Speed Statistics Comparison", fontsize=12)
ax.set_ylabel("Speed (mph)")
ax.set_ylim(0, 140)

# Summary statistics shown in the annotation box (pandas' default .std(),
# i.e. the sample standard deviation).
mu_in,  sd_in  = wp_input_speed.mean(),  wp_input_speed.std()
mu_cl,  sd_cl  = wp_clean_speed.mean(),  wp_clean_speed.std()
mu_mm,  sd_mm  = wp_mm_speed.mean(),     wp_mm_speed.std()

text_wp = (f"Input: μ={mu_in:.1f}, σ={sd_in:.1f}\n"
           f"Cleaned: μ={mu_cl:.1f}, σ={sd_cl:.1f}\n"
           f"Map-Matched: μ={mu_mm:.1f}, σ={sd_mm:.1f}")
# transAxes places the box in axes-fraction coordinates (top centre),
# independent of the data range.
ax.text(0.5, 0.95, text_wp, transform=ax.transAxes,
        ha="center", va="top",
        bbox=dict(boxstyle="round", facecolor="w", alpha=0.8), fontsize=9)

# ---------- PANEL 4: TRAJS boxplots ----------
ax = axes[1, 1]
bp2 = ax.boxplot(
    [tp_input_speed, tp_clean_speed],
    patch_artist=True,
)
# Same version-independent labelling as in panel 3.
ax.set_xticklabels(["Input", "Cleaned + Map-Matched"])

colors2 = ["lightcoral", "lightblue"]
for patch, c in zip(bp2["boxes"], colors2):
    patch.set_facecolor(c)
    patch.set_alpha(0.7)

ax.set_title("Trajs: Speed Statistics Comparison", fontsize=12)
ax.set_ylabel("Speed (mph)")
ax.set_ylim(0, 140)

mu_ti, sd_ti = tp_input_speed.mean(),  tp_input_speed.std()
mu_tc, sd_tc = tp_clean_speed.mean(),  tp_clean_speed.std()

text_tp = (f"Input: μ={mu_ti:.1f}, σ={sd_ti:.1f}\n"
           f"Cleaned + Map-Matched: μ={mu_tc:.1f}, σ={sd_tc:.1f}")
ax.text(0.5, 0.95, text_tp, transform=ax.transAxes,
        ha="center", va="top",
        bbox=dict(boxstyle="round", facecolor="w", alpha=0.8), fontsize=9)

plt.tight_layout()

# Write the figure next to the project data, creating the folder on first use.
out_dir = os.path.join(PROJECT_ROOT, "figures_from_tool_db_only")
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "speed_distributions_waypoint_trajs_input_clean_mm_FINAL.png")
# Save before show(): some backends clear the figure once the window closes.
plt.savefig(out_path, dpi=300)
plt.show()

print("Figure saved to:", out_path)
