# In [None]:
# cleaning_impact_with_mapmatching

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ==============================
# 1. PATH CONFIGURATION
# ==============================
# Every path is resolved against the directory the script is launched from.
PROJECT_ROOT = os.getcwd()

# Folder holding both raw input datasets.
_RAW_DATA_DIR = os.path.join(PROJECT_ROOT, "data_cleaning_fusion_datasets")

# Raw waypoint CSV.
RAW_WP_PATH = os.path.join(_RAW_DATA_DIR, "waypoint", "waypoint.csv")
# Raw trip-path CSV (the folder name contains a space).
RAW_TP_PATH = os.path.join(_RAW_DATA_DIR, "trip path", "trajs.csv")
# Map-matching output CSV.
MM_PATH = os.path.join(PROJECT_ROOT, "Output", "mapmatching", "mapmatching.csv")

# ==============================
# 2. COUNTS (YOUR NUMBERS)
# ==============================
# The record counts are hard-coded literals; this section reads no files.
wp_input = 19_471_725      # raw waypoint
wp_clean = 19_161_213      # waypoint after de-duplication
wp_map   = 6_116_804       # waypoint map-matched (len(df_mm))

tp_input = 1_048_575       # raw trajs
tp_clean = 47_296          # trajs cleaned + mapped (final corridor)

# Category labels shared by the panels
sources = ["Waypoint", "Trajs"]

# --- Panel 1: counts ---
# Waypoint has 3 bars; trajs only 2 (input and cleaned)
counts_input  = np.array([wp_input, tp_input], dtype=float)
counts_clean  = np.array([wp_clean, tp_clean], dtype=float)
counts_map_wp = wp_map      # waypoint only

# ==============================
# 3. PERCENTAGES (Panel 2)
#    Based on the FINAL dataset:
#    - Waypoint -> map-matched
#    - Trajs    -> cleaned + mapped
# ==============================
final_wp = wp_map
final_tp = tp_clean

# Removed and retained shares are each derived directly from the counts
# (retained is final / input, not 100 - removed), and each is assigned once.
removed_pct_wp = (wp_input - final_wp) / wp_input * 100
retained_pct_wp = final_wp / wp_input * 100

removed_pct_tp = (tp_input - final_tp) / tp_input * 100
retained_pct_tp = final_tp / tp_input * 100

# Same ordering as `sources`: index 0 is Waypoint, index 1 is Trajs.
removed_pct  = np.array([removed_pct_wp, removed_pct_tp])
retained_pct = np.array([retained_pct_wp, retained_pct_tp])


# ==============================
# 4. FILE SIZES (Panel 3)
# ==============================
# Divisor used to express byte counts in the "GB" unit shown on the plot.
_BYTES_PER_GB = 1024**3

# Actual on-disk size of the two raw input files (waypoint first, then trajs).
wp_input_size, tp_input_size = (
    os.path.getsize(raw_path) / _BYTES_PER_GB
    for raw_path in (RAW_WP_PATH, RAW_TP_PATH)
)

# Actual on-disk size of the map-matched waypoint file.
wp_clean_size = os.path.getsize(MM_PATH) / _BYTES_PER_GB

# There is no separate CSV for the cleaned trajs, so its size is estimated by
# scaling the raw size by the fraction of records that were kept.
tp_clean_size = tp_input_size * (tp_clean / tp_input)

# Same ordering as `sources`: index 0 is Waypoint, index 1 is Trajs.
sizes_input = np.array([wp_input_size, tp_input_size])
sizes_clean = np.array([wp_clean_size, tp_clean_size])

# ==============================
# 5. DRAW THE FIGURE
# ==============================
fig, axes = plt.subplots(1, 3, figsize=(15, 4), constrained_layout=True)
x = np.arange(len(sources))
width = 0.25   # bar width

# -------- Panel 1: Record counts --------
ax = axes[0]

# Third series exists only for waypoint; NaN for Trajs (no visible bar is drawn).
mm_counts = np.array([counts_map_wp, np.nan])

# One (x positions, heights, legend label) entry per bar series, drawn left to
# right: input, cleaned, then map-matched.
for bar_x, bar_heights, bar_label in (
    (x - width, counts_input, "Input Data"),
    (x, counts_clean, "Cleaned Data"),
    (x + width, mm_counts, "Waypoint Map-Matched"),
):
    ax.bar(bar_x, bar_heights, width, label=bar_label)

# Log scale on the y axis.
ax.set_yscale("log")
ax.set_ylabel("Number of Records")
ax.set_xlabel("Data Source")
ax.set_title("Input vs Cleaned Data: Record Counts")
ax.set_xticks(x)
ax.set_xticklabels(sources)
ax.legend(fontsize=8)

def fmt_count(val):
    """Format a record count as a short label for a bar annotation.

    Args:
        val: Numeric count (int, float, or NumPy scalar); may be NaN.

    Returns:
        "" for NaN; "<x.x>M" for values that display as at least one
        million; "<n>K" for values of at least one thousand; otherwise the
        value truncated to an integer and rendered as a string.
    """
    if np.isnan(val):
        return ""
    # Values just under one million would round to "1000K" in the thousands
    # branch, so anything whose rounded thousands reach 1000 is shown in
    # millions instead.
    if val >= 1e6 or round(val / 1e3) >= 1000:
        return f"{val/1e6:.1f}M"
    if val >= 1e3:
        return f"{val/1e3:.0f}K"
    return str(int(val))

# Write each bar's formatted count just above its top (5% higher, which is a
# constant visual gap on the log axis).
for idx, (pos, n_input, n_clean) in enumerate(zip(x, counts_input, counts_clean)):
    ax.text(pos - width, n_input * 1.05, fmt_count(n_input),
            ha="center", va="bottom", fontsize=7)
    ax.text(pos, n_clean * 1.05, fmt_count(n_clean),
            ha="center", va="bottom", fontsize=7)
    if idx == 0:  # waypoint only: the map-matched series has no Trajs value
        n_mapped = mm_counts[idx]
        ax.text(pos + width, n_mapped * 1.05, fmt_count(n_mapped),
                ha="center", va="bottom", fontsize=7)

# -------- Panel 2: Removed vs Retained --------
ax = axes[1]

# Two bars per source, centred on the tick: removed on the left, retained on
# the right.
for bar_x, bar_heights, bar_label in (
    (x - width/2, removed_pct, "Removed %"),
    (x + width/2, retained_pct, "Retained %"),
):
    ax.bar(bar_x, bar_heights, width, label=bar_label)

ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Data Source")
ax.set_title("Data Removal and Retention Percentages\n(Final Corridor / Map-Matched)")
ax.set_xticks(x)
ax.set_xticklabels(sources)
ax.set_ylim(0, 100)
ax.legend(fontsize=8)

# Percentage labels sit one percentage point above each bar.
for pos, pct_removed, pct_retained in zip(x, removed_pct, retained_pct):
    ax.text(pos - width/2, pct_removed + 1, f"{pct_removed:.1f}%",
            ha="center", va="bottom", fontsize=7)
    ax.text(pos + width/2, pct_retained + 1, f"{pct_retained:.1f}%",
            ha="center", va="bottom", fontsize=7)

# -------- Panel 3: File sizes --------
ax = axes[2]

# Two bars per source, centred on the tick: input size on the left, final
# (cleaned / map-matched) size on the right.
for bar_x, bar_heights, bar_label in (
    (x - width/2, sizes_input, "Input Size"),
    (x + width/2, sizes_clean, "Cleaned / Map-Matched Size"),
):
    ax.bar(bar_x, bar_heights, width, label=bar_label)

ax.set_ylabel("File Size (GB)")
ax.set_xlabel("Data Source")
ax.set_title("Input vs Cleaned Data: File Sizes")
ax.set_xticks(x)
# This panel uses its own tick labels naming the final dataset of each source.
ax.set_xticklabels(["Waypoint\n(Map-Matched)", "Trajs\n(cleaned + mapped)"])
ax.legend(fontsize=8)

# Size labels sit 0.02 GB above each bar.
for pos, gb_input, gb_clean in zip(x, sizes_input, sizes_clean):
    ax.text(pos - width/2, gb_input + 0.02,
            f"{gb_input:.2f} GB", ha="center", va="bottom", fontsize=7)
    ax.text(pos + width/2, gb_clean + 0.02,
            f"{gb_clean:.2f} GB", ha="center", va="bottom", fontsize=7)

# Save the figure
out_dir = os.path.join(PROJECT_ROOT, "figures_from_tool_db_only")
out_path = os.path.join(out_dir, "cleaning_impact_with_mapmatching.png")

# Create the output folder if needed, then write the PNG before showing it.
os.makedirs(out_dir, exist_ok=True)
plt.savefig(out_path, dpi=300)
plt.show()

print("Figure saved to:", out_path)
