# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import time
from datetime import datetime
import sys

# =======================================
# CONFIGURACIÓN
# =======================================

# Directory that is searched recursively for the aggregated CSV files.
AGGREGATED_OUTPUT_PATH = "output_realtime_analysis/aggregated_data"
# Seconds the main loop waits between two refresh cycles.
REFRESH_INTERVAL = 5
# Maximum number of zones drawn in the bar chart (highest totals first).
TOP_N = 10

# Turn on matplotlib's interactive mode and create the single figure/axes
# pair that update_plot() clears and redraws on every refresh.
plt.ion()
fig, ax = plt.subplots(figsize=(12, 6))

# =======================================
# FUNCIONES
# =======================================

def find_all_csv_files():
    """Collect the CSV paths found anywhere below AGGREGATED_OUTPUT_PATH.

    Paths whose text contains ``_SUCCESS`` or ``_temporary`` are left out
    (presumably marker files and in-progress output of the writer job).

    Returns:
        list[str]: Matching paths, in the order ``glob`` reports them.
    """
    pattern = f"{AGGREGATED_OUTPUT_PATH}/**/*.csv"
    csv_paths = []

    for path in glob.glob(pattern, recursive=True):
        # Guard clauses: drop anything that is not a plain ".csv" path or
        # that lives in / is named after one of the excluded markers.
        if not path.endswith(".csv"):
            continue
        if "_SUCCESS" in path or "_temporary" in path:
            continue
        csv_paths.append(path)

    return csv_paths


def _sum_sales_by_zone(df):
    """Return ``{zone: total_sales}`` for one already-loaded CSV frame."""
    # Match header names case-insensitively and ignore stray whitespace.
    df = df.rename(columns=lambda name: str(name).strip().lower())

    if "pickup_zone" in df.columns:
        zones = df["pickup_zone"].astype(object)
        # A blank zone cell is read as NaN, which makes a poor dict key and
        # axis label; file those rows under the same "UNKNOWN" bucket that
        # is used when the column is missing altogether.
        zones = zones.where(zones.notna(), "UNKNOWN")
    else:
        zones = pd.Series("UNKNOWN", index=df.index, dtype=object)

    if "total_sales_zone" in df.columns:
        # Unparseable cells become NaN and are dropped below, so a single
        # bad value no longer discards the remaining rows of the file.
        sales = pd.to_numeric(df["total_sales_zone"], errors="coerce")
    else:
        # Without a sales column every row contributes 0.
        sales = pd.Series(0.0, index=df.index)

    has_amount = sales.notna()
    totals = sales[has_amount].groupby(zones[has_amount], sort=False).sum()
    return {zone: float(total) for zone, total in totals.items()}


def load_all_data(files=None):
    """Read every aggregated CSV and rebuild the per-zone sales totals.

    Nothing is cached between calls: each call starts from an empty mapping
    and re-reads all files.

    NOTE(review): totals are summed across all files, which assumes each
    file carries an increment rather than a running total -- confirm
    against the output mode of the job that writes them.

    Args:
        files: Optional iterable of CSV paths to read. When omitted, the
            paths returned by ``find_all_csv_files()`` are used.

    Returns:
        dict: Zone -> accumulated ``total_sales_zone`` as ``float``. Empty
        when there are no files or none of them holds usable rows.
    """
    if files is None:
        files = find_all_csv_files()

    sales_by_zone = {}

    for file in files:
        try:
            df = pd.read_csv(file)
        except (
            OSError,
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ):
            # Best effort: the writer may still be producing this file (or
            # may have removed it); skip it now and retry on the next call.
            continue

        if df.empty:
            continue

        # The file's totals are computed first and merged afterwards, so a
        # file contributes either all of its valid rows or nothing.
        for zone, total in _sum_sales_by_zone(df).items():
            sales_by_zone[zone] = sales_by_zone.get(zone, 0) + total

    return sales_by_zone


def update_plot(sales_by_zone):
    """Redraw the shared axes with the zones that have the highest totals.

    The module-level ``ax`` is cleared on every call. With data, a
    horizontal bar chart of at most ``TOP_N`` zones is drawn, largest
    total at the top. Without data, a centred waiting message is shown.

    Args:
        sales_by_zone: Mapping of zone -> accumulated sales value.
    """
    ax.clear()

    if sales_by_zone:
        top_zones = sorted(
            sales_by_zone.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:TOP_N]

        zones = [zone for zone, _ in top_zones]
        values = [total for _, total in top_zones]
        positions = np.arange(len(zones))

        ax.barh(positions, values)
        ax.set_yticks(positions)
        ax.set_yticklabels(zones)
        # Position 0 holds the largest total; inverting puts it at the top.
        ax.invert_yaxis()

        ax.set_title(
            f"Top {TOP_N} Zonas por Ventas Uber | {datetime.now().strftime('%H:%M:%S')}"
        )
        ax.set_xlabel("Ventas Acumuladas ($)")
        ax.grid(axis="x", linestyle="--", alpha=0.6)

        plt.tight_layout()
    else:
        # The hourglass emoji was stored mis-encoded ("‚è≥"); restored here.
        ax.text(
            0.5, 0.5,
            "⏳ Esperando datos del Consumer...",
            ha="center", va="center", fontsize=14
        )
        ax.set_title("Visualizador en Tiempo Real")

    # Both branches finish the same way: draw, then give the GUI a short
    # slice of time to process the update.
    plt.draw()
    plt.pause(0.1)


# =======================================
# MAIN LOOP
# =======================================

if __name__ == "__main__":

    # Without the output folder there is nothing to read; exit with an
    # error status instead of polling a path that does not exist.
    if not os.path.isdir(AGGREGATED_OUTPUT_PATH):
        print("❌ No existe la carpeta de salida")
        sys.exit(1)

    print("📊 Visualizador FULL REFRESH iniciado")
    print(f"📁 Leyendo TODO desde: {AGGREGATED_OUTPUT_PATH}")

    try:
        # Poll until interrupted: reload all data, log the zone count,
        # redraw, then wait REFRESH_INTERVAL seconds.
        while True:
            sales_by_zone = load_all_data()

            print(
                f"[{datetime.now().strftime('%H:%M:%S')}] "
                f"Zonas detectadas: {len(sales_by_zone)}"
            )

            update_plot(sales_by_zone)
            # Wait with plt.pause rather than time.sleep: pause keeps the
            # GUI event loop running, so the window stays responsive
            # between refreshes instead of freezing.
            plt.pause(REFRESH_INTERVAL)

    except KeyboardInterrupt:
        # Ctrl+C is the intended way to stop the visualiser.
        print("\n🛑 Visualizador detenido")
        plt.close()
