In [None]:
import os
import glob
import time
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# =====================================================
# 0. STARTUP – VISIBLE PROOF THAT THIS SCRIPT IS RUNNING
# =====================================================

# Banner printed first so it is obvious which visualizer version is executing.
print("\nüî•üî•üî• VISUALIZADOR REALTIME FINAL EJECUT√ÅNDOSE üî•üî•üî•\n")

# =====================================================
# 1. EXECUTION CONTEXT
# =====================================================

# The data directory below is relative, so it is resolved against this
# working directory; printing it makes a wrong launch location easy to spot.
CWD = os.getcwd()
print(f"üìç Directorio actual (cwd): {CWD}")

# Directory scanned for CSV files by find_csv_files().
RELATIVE_PATH = "output_realtime_analysis/aggregated_data"
ABS_PATH = os.path.abspath(RELATIVE_PATH)

print(f"üìÇ Ruta relativa configurada : {RELATIVE_PATH}")
print(f"üìÇ Ruta absoluta utilizada  : {ABS_PATH}\n")

# =====================================================
# 2. CONFIGURATION
# =====================================================

REFRESH_INTERVAL = 5   # seconds between two reloads of the CSV data
TOP_N = 10             # number of zones shown in the bar chart

# Interactive mode: drawing calls return immediately instead of blocking,
# which lets the main loop keep refreshing the same figure.
plt.ion()
# Single figure/axes pair shared by update_plot() for every redraw.
fig, ax = plt.subplots(figsize=(12, 6))

# =====================================================
# 3. UTILIDADES
# =====================================================

def find_csv_files(base_path=None):
    """
    Find every finished CSV file below a directory, searching recursively.

    Args:
        base_path: Directory to scan. Defaults to the module-level
            ``ABS_PATH`` when omitted, which is the original behaviour.

    Returns:
        A sorted list of paths to regular files ending in ``.csv``. Paths
        containing ``_SUCCESS`` or ``_temporary`` are excluded, as are
        directories whose own name ends in ``.csv``.
    """
    root = ABS_PATH if base_path is None else os.fspath(base_path)

    # Escape the directory part so that "[", "?" or "*" in its name are
    # matched literally instead of being read as glob wildcards.
    pattern = os.path.join(glob.escape(root), "**", "*.csv")
    files = glob.glob(pattern, recursive=True)

    # os.path.isfile drops directories named "*.csv": they match the pattern
    # but cannot be read as CSV. Sorting makes the order reproducible, since
    # glob returns entries in arbitrary, OS-dependent order.
    valid_files = sorted(
        f for f in files
        if os.path.isfile(f)
        and "_SUCCESS" not in f
        and "_temporary" not in f
    )

    print(f"\nüîç Buscando CSV con patr√≥n:")
    print(f"   {pattern}")
    print(f"üìÑ CSV encontrados: {len(valid_files)}")

    # Only preview the first few paths to keep the periodic log readable.
    for f in valid_files[:5]:
        print(f"   - {f}")
    if len(valid_files) > 5:
        print("   ...")

    return valid_files


def _read_zone_totals(file):
    """
    Read one CSV and return its sales summed per zone.

    Returns ``None`` when the file is empty or lacks the expected columns
    (a message is printed in both cases); otherwise a ``{zone: float}`` dict.
    Any other read/parse error propagates to the caller.
    """
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header at all; treat it like an empty
        # frame instead of reporting it as a read failure.
        df = pd.DataFrame()

    if df.empty:
        print(f"‚ö†Ô∏è CSV vac√≠o: {file}")
        return None

    # Normalise header names so matching ignores case and stray whitespace.
    df.columns = [str(c).strip().lower() for c in df.columns]

    if "pickup_zone" not in df.columns or "total_sales_zone" not in df.columns:
        print(f"‚ùå Columnas esperadas NO encontradas en: {file}")
        print(f"   Columnas reales: {df.columns.tolist()}")
        return None

    # Unparseable sales cells become NaN and are dropped below, rather than
    # aborting the whole file halfway through.
    sales = pd.to_numeric(df["total_sales_zone"], errors="coerce")

    # Rows without a zone are grouped under one explicit "UNKNOWN" key
    # instead of each NaN becoming its own dictionary entry.
    zones = df["pickup_zone"].astype(object)
    zones = zones.where(zones.notna(), "UNKNOWN")

    valid = sales.notna()
    # sort=False: zone labels may be of mixed types and need no ordering here.
    totals = sales[valid].groupby(zones[valid], sort=False).sum()
    return {zone: float(total) for zone, total in totals.items()}


def load_all_data(files=None):
    """
    Read every CSV and rebuild the per-zone sales totals from scratch.

    Args:
        files: Optional iterable of CSV paths. When omitted, the list comes
            from ``find_csv_files()``, which is the original behaviour.

    Returns:
        A dict mapping each ``pickup_zone`` value to the sum of its
        ``total_sales_zone`` values across all readable files. Rows with a
        missing zone are accumulated under ``"UNKNOWN"``; rows with a missing
        or non-numeric sales value are skipped. The dict is empty when there
        are no files.
    """
    sales_by_zone = {}

    if files is None:
        files = find_csv_files()

    if not files:
        print("‚ö†Ô∏è No se encontraron CSV todav√≠a.\n")
        return sales_by_zone

    print("\nüì• Leyendo CSV y acumulando datos...\n")

    for file in files:
        try:
            totals = _read_zone_totals(file)
        except Exception as e:
            # Best effort: a file may vanish or be half-written between
            # listing and reading, and one bad file must not stop the refresh.
            print(f"‚ùå Error leyendo {file}: {e}")
            continue

        if not totals:
            continue

        # Merge only after the whole file was processed, so a failing file
        # never contributes a partial sum.
        for zone, total in totals.items():
            sales_by_zone[zone] = sales_by_zone.get(zone, 0) + total

    print(f"\nüìä Total de zonas acumuladas: {len(sales_by_zone)}\n")
    return sales_by_zone


def update_plot(sales_by_zone):
    """
    Redraw the shared axes with the best-selling zones.

    Args:
        sales_by_zone: Mapping of zone label to accumulated sales. When it
            is empty, a "waiting for data" placeholder is drawn instead.

    The module-level ``ax`` is cleared and repainted on every call; at most
    ``TOP_N`` zones are shown as horizontal bars, largest at the top.
    """

    def _refresh():
        # Push the new artists to the screen and let the GUI process events.
        plt.draw()
        plt.pause(0.1)

    ax.clear()

    # Nothing to chart yet: show the placeholder and leave early.
    if not sales_by_zone:
        ax.text(
            0.5,
            0.5,
            "‚è≥ Esperando datos del Consumer...",
            ha="center",
            va="center",
            fontsize=14,
        )
        ax.set_title("Visualizador en Tiempo Real")
        _refresh()
        return

    # Rank zones by sales, highest first, and keep the leading TOP_N.
    ranking = list(sales_by_zone.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    top = ranking[:TOP_N]

    zones = []
    values = []
    for zone_name, total in top:
        zones.append(zone_name)
        values.append(total)

    positions = np.arange(len(zones))

    ax.barh(positions, values)
    ax.set_yticks(positions)
    ax.set_yticklabels(zones)
    # barh draws index 0 at the bottom; flip so the best zone is on top.
    ax.invert_yaxis()

    stamp = datetime.now().strftime('%H:%M:%S')
    ax.set_title(f"Top {TOP_N} Zonas por Ventas Uber | {stamp}")
    ax.set_xlabel("Ventas Acumuladas ($)")
    ax.grid(axis="x", linestyle="--", alpha=0.6)

    plt.tight_layout()
    _refresh()

# =====================================================
# 4. MAIN
# =====================================================

def main():
    """
    Run the real-time visualizer until the user interrupts it.

    Exits with status 1 when the data directory ``ABS_PATH`` does not exist.
    Otherwise it reloads all CSV data and redraws the chart every
    ``REFRESH_INTERVAL`` seconds until Ctrl+C is pressed.
    """
    if not os.path.isdir(ABS_PATH):
        print("\n‚ùå ERROR CR√çTICO")
        print(f"La carpeta NO EXISTE: {ABS_PATH}")
        # The bare exit() builtin is injected by the `site` module and is
        # not guaranteed to exist; SystemExit always is.
        raise SystemExit(1)

    print("\nüöÄ Iniciando visualizador en tiempo real...\n")

    try:
        while True:
            sales_by_zone = load_all_data()

            print(
                f"[{datetime.now().strftime('%H:%M:%S')}] "
                f"Zonas detectadas: {len(sales_by_zone)}"
            )

            update_plot(sales_by_zone)
            # plt.pause keeps the GUI event loop running while waiting, so
            # the window stays responsive; time.sleep would freeze it.
            plt.pause(REFRESH_INTERVAL)

    except KeyboardInterrupt:
        print("\nüõë Visualizador detenido por el usuario")
    finally:
        # Release the figure on every exit path, not only on Ctrl+C.
        plt.close(fig)


if __name__ == "__main__":
    main()



üî•üî•üî• VISUALIZADOR REALTIME FINAL EJECUT√ÅNDOSE üî•üî•üî•

üìç Directorio actual (cwd): /home/admin/Proyectos/ProyectoFInal
üìÇ Ruta relativa configurada : output_realtime_analysis/aggregated_data
üìÇ Ruta absoluta utilizada  : /home/admin/Proyectos/ProyectoFInal/output_realtime_analysis/aggregated_data

üìä Monitor Kafka ‚Äî seguimiento de eventos y latencia

‚ö†Ô∏è A√∫n no existe el archivo CSV. Esperando datos...
‚ö†Ô∏è A√∫n no existe el archivo CSV. Esperando datos...
