In [None]:
# CLEANING IMPACT SUMMARY AND CORRIDOR SPEED DISTRIBUTION ANALYSIS

import os
import sqlite3
import numpy as np
import pandas as pd

# =========================================
# 0. PATH CONFIGURATION
# =========================================
# The project root is taken from the current working directory, so the script
# is expected to be launched from the root folder of the project.
PROJECT_ROOT = os.getcwd()

# Raw (pre-cleaning) CSV exports share a common parent folder.
_RAW_DATA_DIR = os.path.join(PROJECT_ROOT, "data_cleaning_fusion_datasets")
RAW_WP_PATH = os.path.join(_RAW_DATA_DIR, "waypoint", "waypoint.csv")
RAW_TP_PATH = os.path.join(_RAW_DATA_DIR, "trip path", "trajs.csv")

# Unified SQLite database file.
DB_PATH = os.path.join(PROJECT_ROOT, "Output", "database", "unified_database.db")

# Folder that receives the generated tables; created up front if missing.
OUTPUT_TABLE_DIR = os.path.join(PROJECT_ROOT, "tables_from_tool")
os.makedirs(OUTPUT_TABLE_DIR, exist_ok=True)

# Names of the speed columns in the database tables (project-specific).
WAYPOINT_SPEED_COL, TRAJS_SPEED_COL = "speed_mph", "CrossingSpeedMph"

# =========================================
# 1. Load RAW INPUTS (before cleaning)
# =========================================
# Verify every required file up front, in a fixed order, so a missing input is
# reported before any data is loaded.
for _required_path, _missing_message in (
    (RAW_WP_PATH, f"No encuentro el waypoint crudo en: {RAW_WP_PATH}"),
    (RAW_TP_PATH, f"No encuentro el trajs crudo en: {RAW_TP_PATH}"),
    (DB_PATH, f"No encuentro la base: {DB_PATH}"),
):
    if not os.path.exists(_required_path):
        raise FileNotFoundError(_missing_message)

# NOTE(review): in the visible code the raw frames are only used for their row
# counts; they are kept as full DataFrames in case later cells rely on them.
df_wp_raw, df_tp_raw = (
    pd.read_csv(_raw_path, low_memory=False)
    for _raw_path in (RAW_WP_PATH, RAW_TP_PATH)
)

# Number of records present before any cleaning step.
wp_input, tp_input = len(df_wp_raw), len(df_tp_raw)

print("Input records:")
print("  Waypoint:", wp_input)
print("  Trajs   :", tp_input)

# =========================================
# 2. Load FINAL data from unified_database.db
# =========================================
conn = sqlite3.connect(DB_PATH)
try:
    df_wp = pd.read_sql("SELECT * FROM waypoint", conn)
    df_tp = pd.read_sql("SELECT * FROM trajs", conn)
finally:
    # Always release the connection, even when a query fails
    # (for example because one of the tables is missing).
    conn.close()

# Number of records that remain in the database tables.
wp_final = len(df_wp)
tp_final = len(df_tp)

print("\nFinal corridor records (desde unified_database.db):")
print("  Waypoint:", wp_final)
print("  Trajs   :", tp_final)

# =========================================
# 3. Compute the impact of each cleaning stage
# =========================================
# The duplicate / outlier / error removal counts are not known yet, so they
# are fixed at 0 and the WHOLE difference between input and final records is
# attributed to "Map Matching Filter".
#
# If flag columns are found later (for example in the DB or in a CSV), just
# change these values or compute them from the data.

# ------- ADJUSTABLE: if the per-stage totals are known, set them here -------
# Order in each tuple: removed as duplicates, as outliers, as errors.
wp_dup, wp_out, wp_err = 0, 0, 0   # waypoint records
tp_dup, tp_out, tp_err = 0, 0, 0   # trajs records
# ----------------------------------------------------------------------------

# Total number of records dropped between the raw input and the final tables.
wp_removed_total = wp_input - wp_final
tp_removed_total = tp_input - tp_final

# Removals already explained by the adjustable counts above.
wp_known_removed = wp_dup + wp_out + wp_err
tp_known_removed = tp_dup + tp_out + tp_err

# Whatever is left is assigned to the map-matching filter; floored at 0 so the
# reported count is never negative.
wp_mapfilter = max(wp_removed_total - wp_known_removed, 0)
tp_mapfilter = max(tp_removed_total - tp_known_removed, 0)

# =========================================
# 4. Construir tabla "Cleaning Impact Summary"
# =========================================
def fmt_change(count, base):
    """Format a removal count together with its share of ``base``.

    Example output: ``-310,512 (1.59%)``.

    Args:
        count: Number of records removed by a cleaning step.
        base: Reference total used for the percentage.

    Returns:
        str: The count with thousands separators, prefixed with "-" when it
        is positive, followed by the percentage with two decimals. The
        percentage is reported as 0.00 when ``base`` is 0.
    """
    # Guard against division by zero when there is no reference total.
    pct = 100.0 * count / base if base != 0 else 0.0
    prefix = "-" if count > 0 else ""
    return "{}{:,} ({:.2f}%)".format(prefix, count, pct)

def _share_of_input(final_count, input_count):
    """Return ``final_count`` as a percentage of ``input_count``.

    Yields 0.0 when ``input_count`` is 0 (for example a header-only raw CSV)
    instead of raising ZeroDivisionError, matching the zero-base handling of
    ``fmt_change``.
    """
    return 100 * final_count / input_count if input_count else 0.0


# One row per cleaning action; each row reports the waypoint and trip-path
# impact as preformatted text.
rows = [
    {
        "Cleaning Action": "Input Records",
        "Waypoint Impact": f"{wp_input:,}",
        "Trip Path Impact": f"{tp_input:,}",
    },
    {
        "Cleaning Action": "Duplicate Removal",
        "Waypoint Impact": fmt_change(wp_dup, wp_input),
        "Trip Path Impact": fmt_change(tp_dup, tp_input),
    },
    {
        "Cleaning Action": "Outlier Removal",
        "Waypoint Impact": fmt_change(wp_out, wp_input),
        "Trip Path Impact": fmt_change(tp_out, tp_input),
    },
    {
        "Cleaning Action": "Error Data Removal",
        "Waypoint Impact": fmt_change(wp_err, wp_input),
        "Trip Path Impact": fmt_change(tp_err, tp_input),
    },
    {
        "Cleaning Action": "Map Matching Filter",
        "Waypoint Impact": fmt_change(wp_mapfilter, wp_input),
        "Trip Path Impact": fmt_change(tp_mapfilter, tp_input),
    },
    {
        "Cleaning Action": "Final Corridor Records",
        "Waypoint Impact": f"{wp_final:,} ({_share_of_input(wp_final, wp_input):.2f}%)",
        "Trip Path Impact": f"{tp_final:,} ({_share_of_input(tp_final, tp_input):.2f}%)",
    },
]

cleaning_table = pd.DataFrame(rows)

print("\n=== CLEANING IMPACT SUMMARY ===")
print(cleaning_table.to_string(index=False))

# Persist the summary next to the other generated tables.
cleaning_csv_path = os.path.join(OUTPUT_TABLE_DIR, "cleaning_impact_summary.csv")
cleaning_table.to_csv(cleaning_csv_path, index=False)
print(f"\nTabla de cleaning guardada en: {cleaning_csv_path}")

# =========================================
# 5. "Corridor Speed Distribution Analysis"
# =========================================
def speed_summary(series):
    """Compute descriptive statistics for a collection of speed values.

    Values are coerced to numbers; entries that cannot be parsed, missing
    values and +/- infinity are discarded before the statistics are computed.

    Args:
        series: A pandas Series, or any list-like that ``pd.Series`` accepts
            (plain lists and tuples are wrapped automatically).

    Returns:
        dict: "Count" (int, number of usable values) plus the mean, median,
        standard deviation (``Series.std`` with its default settings),
        minimum, maximum and the 25th/75th percentiles, keyed by the column
        labels used in the output table.
    """
    # pd.to_numeric returns a bare ndarray for list input, which has no
    # .replace(); wrapping first keeps the pipeline below valid for any
    # list-like while leaving Series input untouched.
    values = series if isinstance(series, pd.Series) else pd.Series(series)
    numeric = pd.to_numeric(values, errors="coerce")
    s = numeric.replace([np.inf, -np.inf], np.nan).dropna()
    return {
        "Count": int(s.size),
        "Mean (mph)": s.mean(),
        "Median (mph)": s.median(),
        "Std Dev (mph)": s.std(),
        "Min (mph)": s.min(),
        "Max (mph)": s.max(),
        "Q25 (mph)": s.quantile(0.25),
        "Q75 (mph)": s.quantile(0.75),
    }

# Stop early with a clear message when a configured speed column is missing
# from the table loaded from the database.
if WAYPOINT_SPEED_COL not in df_wp.columns:
    raise KeyError(f"No encontré la columna {WAYPOINT_SPEED_COL} en la tabla waypoint")
if TRAJS_SPEED_COL not in df_tp.columns:
    raise KeyError(f"No encontré la columna {TRAJS_SPEED_COL} en la tabla trajs")

wp_stats = speed_summary(df_wp[WAYPOINT_SPEED_COL])
tp_stats = speed_summary(df_tp[TRAJS_SPEED_COL])

# Column order of the exported table.
_SPEED_TABLE_COLUMNS = [
    "Count",
    "Mean (mph)",
    "Median (mph)",
    "Std Dev (mph)",
    "Min (mph)",
    "Max (mph)",
    "Q25 (mph)",
    "Q75 (mph)",
]

# One row per dataset, statistics rounded to two decimals.
speed_table = pd.DataFrame(
    [wp_stats, tp_stats],
    index=["Waypoint", "Trajs"],
).round(2)
speed_table = speed_table[_SPEED_TABLE_COLUMNS]

print("\n=== CORRIDOR SPEED DISTRIBUTION ANALYSIS ===")
print(speed_table.to_string())

# Persist the table (with its row labels) next to the other generated tables.
speed_csv_path = os.path.join(OUTPUT_TABLE_DIR, "corridor_speed_distribution.csv")
speed_table.to_csv(speed_csv_path)
print(f"\nTabla de velocidades guardada en: {speed_csv_path}")
