In [None]:
# Statistical Summary Comparison

import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================================
# 1. CONFIG: database location, output folder, column names
# =========================================
# Names of the speed column in each table (adjust if your DB differs).
WAYPOINT_SPEED_COL = "speed_mph"
TRAJS_SPEED_COL = "CrossingSpeedMph"

# All paths hang off the current working directory; replace os.getcwd()
# with a fixed path if the script is launched from somewhere else.
PROJECT_ROOT = os.getcwd()
DB_PATH = os.path.join(
    PROJECT_ROOT, "Output", "database", "unified_database.db"
)
FIG_DIR = os.path.join(PROJECT_ROOT, "figures_from_tool_db_only")

# Make sure the figure folder exists before any plot is saved.
os.makedirs(FIG_DIR, exist_ok=True)

# =========================================
# 2. Load the waypoint and trajs tables
# =========================================
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"Database not found at: {DB_PATH}")

# Close the connection even when a query raises (for example when one of
# the tables is missing); otherwise the handle would stay open.
conn = sqlite3.connect(DB_PATH)
try:
    df_wp = pd.read_sql("SELECT * FROM waypoint", conn)
    df_tp = pd.read_sql("SELECT * FROM trajs", conn)
finally:
    conn.close()

print("waypoint rows:", len(df_wp))
print("trajs rows   :", len(df_tp))

# Fail early with a clear message if a configured speed column is absent.
if WAYPOINT_SPEED_COL not in df_wp.columns:
    raise KeyError(f"Column '{WAYPOINT_SPEED_COL}' not found in waypoint table.")
if TRAJS_SPEED_COL not in df_tp.columns:
    raise KeyError(f"Column '{TRAJS_SPEED_COL}' not found in trajs table.")

# Keep only the speed columns, dropping missing values, as NumPy arrays.
s_wp = df_wp[WAYPOINT_SPEED_COL].dropna().to_numpy()
s_tp = df_tp[TRAJS_SPEED_COL].dropna().to_numpy()

# (Optional) subsample so the plots are not saturated. Sampling is without
# replacement; no seed is set in this section, so the drawn subset may
# differ from run to run.
max_points = 500000
if len(s_wp) > max_points:
    s_wp = np.random.choice(s_wp, max_points, replace=False)
if len(s_tp) > max_points:
    s_tp = np.random.choice(s_tp, max_points, replace=False)

# =========================================
# 3. Histogram / PDF ("Speed Distribution Comparison")
# =========================================
plt.figure(figsize=(8, 5))
bins = 50

# Use one set of bin edges for both samples. Passing the integer bin count
# to each hist() call would derive edges from each sample's own range, so
# the overlaid bars would have different widths and not be comparable.
bin_edges = np.histogram_bin_edges(np.concatenate([s_wp, s_tp]), bins=bins)

plt.hist(s_wp, bins=bin_edges, density=True, alpha=0.5, label="Waypoint")
plt.hist(s_tp, bins=bin_edges, density=True, alpha=0.5, label="Trajs")

plt.xlabel("Speed (mph)")
plt.ylabel("Density")
plt.title("Speed Distribution Comparison (Waypoint vs Trajs)")
plt.legend()
plt.tight_layout()

out_path = os.path.join(FIG_DIR, "speed_distribution_hist_wp_vs_trajs_from_db.png")
plt.savefig(out_path, dpi=300)
plt.close()
print("Saved:", out_path)

# =========================================
# 4. Box plot ("Speed Distribution Box Plot")
# =========================================
plt.figure(figsize=(6, 5))
plt.boxplot([s_wp, s_tp])
# Label the boxes through the x ticks instead of boxplot's `labels=`
# keyword, which is deprecated in recent Matplotlib releases (renamed to
# `tick_labels`). Boxes are drawn at positions 1..N by default.
plt.xticks([1, 2], ["Waypoint", "Trajs"])
plt.ylabel("Speed (mph)")
plt.title("Speed Distribution Box Plot (DB speeds)")
plt.tight_layout()

out_path = os.path.join(FIG_DIR, "speed_boxplot_wp_vs_trajs_from_db.png")
plt.savefig(out_path, dpi=300)
plt.close()
print("Saved:", out_path)

# =========================================
# 5. CDF (Cumulative Distribution Function)
# =========================================
plt.figure(figsize=(8, 5))

for data, label in [(s_wp, "Waypoint"), (s_tp, "Trajs")]:
    data_sorted = np.sort(data)
    n_obs = len(data_sorted)
    # Empirical CDF: the i-th smallest value (1-based) has cumulative
    # probability i/n. This ends at exactly 1 and maps a single
    # observation to 1, whereas linspace(0, 1, n) would assign 0 to the
    # smallest value.
    y = np.arange(1, n_obs + 1) / n_obs
    plt.plot(data_sorted, y, label=label)

plt.xlabel("Speed (mph)")
plt.ylabel("Cumulative Probability")
plt.title("CDF of Speeds (Waypoint vs Trajs, from DB)")
plt.legend()
plt.tight_layout()

out_path = os.path.join(FIG_DIR, "speed_cdf_wp_vs_trajs_from_db.png")
plt.savefig(out_path, dpi=300)
plt.close()
print("Saved:", out_path)

# =========================================
# 6. Statistical summary (table + bar chart)
# =========================================
def summary_stats(x):
    """Return descriptive statistics for a 1-D collection of numbers.

    Args:
        x: Sized sequence or array of numeric values.

    Returns:
        A dict with the keys ``count``, ``mean``, ``median``, ``std``
        (sample standard deviation, ``ddof=1``), ``min``, ``max``, ``q25``
        and ``q75`` (25th and 75th percentiles), in that order.

        For empty input every statistic except ``count`` is NaN. For a
        single value ``std`` is NaN, because the sample standard deviation
        is undefined for one observation.
    """
    n = len(x)
    if n == 0:
        # np.min / np.max raise on zero-size input; report NaN instead so
        # an empty sample still produces a well-formed summary row.
        return {
            "count": 0,
            "mean": np.nan,
            "median": np.nan,
            "std": np.nan,
            "min": np.nan,
            "max": np.nan,
            "q25": np.nan,
            "q75": np.nan,
        }

    # One percentile call computes both quartiles.
    q25, q75 = np.percentile(x, [25, 75])
    return {
        "count": n,
        "mean": np.mean(x),
        "median": np.median(x),
        # ddof=1 divides by n - 1; skip the call for n == 1 to avoid a
        # divide-by-zero RuntimeWarning and return NaN directly.
        "std": np.std(x, ddof=1) if n > 1 else np.nan,
        "min": np.min(x),
        "max": np.max(x),
        "q25": q25,
        "q75": q75,
    }

# One summary dict per data source, stacked into a two-row table.
st_wp, st_tp = summary_stats(s_wp), summary_stats(s_tp)

stats_df = pd.DataFrame([st_wp, st_tp], index=["Waypoint", "Trajs"])
print("\nSpeed summary stats from DB:")
print(stats_df)

# Grouped bar chart of mean, median and std: each source's bars are shifted
# half a bar width to either side of the metric's tick position.
metrics = ["mean", "median", "std"]
x = np.arange(len(metrics))
width = 0.35

plt.figure(figsize=(7, 5))
for offset, stats, label in (
    (-width / 2, st_wp, "Waypoint"),
    (width / 2, st_tp, "Trajs"),
):
    heights = [stats[m] for m in metrics]
    plt.bar(x + offset, heights, width, label=label)
plt.xticks(x, metrics)
plt.ylabel("Speed (mph)")
plt.title("Statistical Summary Comparison (from DB)")
plt.legend()
plt.tight_layout()

out_path = os.path.join(FIG_DIR, "speed_stats_summary_wp_vs_trajs_from_db.png")
plt.savefig(out_path, dpi=300)
plt.close()
print("Saved:", out_path)
