## 0) Parámetros (widgets) y Configuración
- Crea parámetros con `dbutils.widgets` para rutas y flags.
- Activa `Adaptive Query Execution (AQE)` vía `spark.conf`.
- Verifica configuraciones clave en `Environment` y plan SQL.

In [0]:
# Notebook parameters (they can also be set from the widget bar in the UI).
for widget_name, default_value in (
    ("base_path", "/Volumes/sesion_10/default/storage"),
    ("table_name", "lab_bp_events"),
):
    # The widget label is the same as the widget name.
    dbutils.widgets.text(widget_name, default_value, widget_name)
dbutils.widgets.dropdown("enable_aqe", "true", ["true", "false"], "enable_aqe")

# Read the current widget values into plain variables used by the later cells.
base_path, table_name, enable_aqe = (
    dbutils.widgets.get(key) for key in ("base_path", "table_name", "enable_aqe")
)

In [0]:
# Report the session's current AQE flag and auto-broadcast size limit.
for label, conf_key in (
    ("AQE enabled:", "spark.sql.adaptive.enabled"),
    ("Broadcast threshold (bytes):", "spark.sql.autoBroadcastJoinThreshold"),
):
    print(label, spark.conf.get(conf_key))

In [0]:
# AQE switches plus the auto-broadcast size limit (tune the limit to whether
# the small join side fits in memory). Dict order matches the order in which
# the settings are applied.
session_settings = {
    "spark.sql.adaptive.enabled": enable_aqe,
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    "spark.sql.autoBroadcastJoinThreshold": 20 * 1024 * 1024,  # 20MB
}
for conf_key, conf_value in session_settings.items():
    spark.conf.set(conf_key, conf_value)

# Echo the two headline settings back so the effective values are visible.
aqe_flag = spark.conf.get("spark.sql.adaptive.enabled")
print("AQE enabled:", aqe_flag)
broadcast_limit = spark.conf.get("spark.sql.autoBroadcastJoinThreshold")
print("Broadcast threshold (bytes):", broadcast_limit)

# Recursively remove the working folder so every run starts from a clean state.
dbutils.fs.rm(base_path, True)

# Delta locations used by the rest of the notebook.
delta_root = f"{base_path}/delta"
delta_path = f"{delta_root}/events"
dim_path = f"{delta_root}/dim_small"

## 1) Estructura y utilidades
- Separar el código en **secciones** y **funciones** reutilizables.
- Medir tiempos de forma estandarizada.

In [0]:
import time
from pyspark.sql.functions import *
from pyspark.sql.types import *

def tic() -> float:
    """Return a start mark to hand to :func:`toc` later.

    The mark comes from ``time.perf_counter()``, a monotonic clock, so the
    measured interval cannot be distorted by system clock adjustments. The
    value is only meaningful as an argument to :func:`toc`; it is not an
    epoch timestamp.
    """
    return time.perf_counter()


def toc(t0: float, label: str = "Elapsed") -> float:
    """Print and return the seconds elapsed since the mark ``t0``.

    Args:
        t0: Start mark previously returned by :func:`tic`.
        label: Text printed in front of the elapsed time.

    Returns:
        Elapsed time in seconds.
    """
    dt = time.perf_counter() - t0
    print(f"{label}: {dt:0.2f} s")
    return dt

## 2) Datos de ejemplo
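- Generamos ~50M registros sintéticos con una distribución simple (≈45 % de los eventos se asignan a Lima para introducir sesgo; el resto se reparte de forma uniforme).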

In [0]:
N = 50_000_000  # number of synthetic event rows
cities = ["Lima", "Arequipa", "Cusco", "Trujillo", "Piura"]
kinds = ["web", "app", "store"]

t0 = tic()

# One SQL expression per output column:
#   user_id    -> integer in [0, 100000)
#   event_date -> a date within the last 90 days
#   amount     -> integer in [0, 100)
#   city       -> 'Lima' with probability 0.45, otherwise one of the other four
#   channel    -> one of the three channels, picked uniformly
event_columns = [
    "id as event_id",
    "cast(rand()*100000 as int) as user_id",
    "date_sub(current_date(), cast(rand()*90 as int)) as event_date",
    "cast(rand()*100 as int) as amount",
    "CASE WHEN rand() < 0.45 THEN 'Lima' ELSE element_at(array('Arequipa','Cusco','Trujillo','Piura'), cast(floor(rand()*4)+1 as int)) END as city",
    "element_at(array('web','app','store'), cast(floor(rand()*3)+1 as int)) as channel",
]

# Only transformations are declared here (no action is triggered), so the
# timer below covers building the DataFrame definition, not producing rows.
events = spark.range(N).selectExpr(*event_columns)

toc(t0, "Generación Delta de events")

In [0]:
t0 = tic()

# Save the events DataFrame as Delta at delta_path, overwriting earlier output.
events_writer = events.write.format("delta").mode("overwrite")
events_writer.save(delta_path)

toc(t0, "Escritura Delta de events")

In [0]:
t0 = tic()

# Same write as before, but collapsed to a single partition via coalesce(1).
delta_path_coalesce = f"{delta_path}_coalesce"
single_partition_events = events.coalesce(1)
single_partition_events.write.format("delta").mode("overwrite").save(delta_path_coalesce)

toc(t0, "Overwrite Delta con 1 writer")

In [0]:
# Switch the Parquet compression codec to gzip (the default is snappy).
parquet_codec = "gzip"
spark.conf.set("spark.sql.parquet.compression.codec", parquet_codec)

In [0]:
t0 = tic()

# Single-partition overwrite again, this time to a separate "_coalesce_gzip"
# location.
delta_path_coalesce_gzip = f"{delta_path}_coalesce_gzip"
single_partition_events = events.coalesce(1)
single_partition_events.write.format("delta").mode("overwrite").save(delta_path_coalesce_gzip)

toc(t0, "Overwrite Delta con 1 writer y compresion gzip")

In [0]:
# Set the Parquet compression codec to snappy.
parquet_codec = "snappy"
spark.conf.set("spark.sql.parquet.compression.codec", parquet_codec)

In [0]:
t0 = tic()

# Spread the data over `writers` partitions before the overwrite.
writers = 16
delta_path_repartition = f"{delta_path}_repartition"
repartitioned_events = events.repartition(writers)
repartitioned_events.write.format("delta").mode("overwrite").save(delta_path_repartition)

toc(t0, "Overwrite Delta con 16 writer y compresion snappy por defecto")

In [0]:
t0 = tic()

# Small dimension table: one row per (city, channel) combination plus a random
# integer priority in [0, 100).
# The file brings pyspark names in with star imports only, so the `T.` / `F.`
# aliases do not exist; the bare names are used here instead.
dim_schema = StructType([
    StructField("city", StringType()),
    StructField("channel", StringType()),
])
dim_small = spark.createDataFrame(
    [(c, k) for c in cities for k in kinds],
    schema=dim_schema,
).withColumn("priority", (rand() * 100).cast("int"))

dim_small.write.format("delta").mode("overwrite").save(dim_path)

toc(t0, "Generación + escritura Delta de dim_small")

In [0]:
# Register both Delta locations as temp views so later cells can refer to them
# by name.
for view_name, source_path in (("events", delta_path), ("dim_small", dim_path)):
    spark.read.format("delta").load(source_path).createOrReplaceTempView(view_name)

In [0]:
t0 = tic()

# dim_small holds only len(cities) * len(kinds) rows, far below the 20MB
# spark.sql.autoBroadcastJoinThreshold configured earlier, so without a hint
# the planner would broadcast it and this cell would time a broadcast join.
# The "merge" join hint requests a shuffle sort-merge join instead, which is
# what the label below claims to measure.
dim_for_shuffle = spark.table("dim_small").hint("merge")

# `sum` is pyspark.sql.functions.sum (star-imported earlier in the file),
# not the Python builtin.
join_shuffle = (
    spark.table("events")
    .join(dim_for_shuffle, on=["city", "channel"], how="left")
    .groupBy("city", "channel")
    .agg(sum("amount").alias("total_amount"))
)
# count() is the action that actually executes the join.
join_shuffle.count()

toc(t0, "Join sin broadcast (shuffle)")

In [0]:
from pyspark.sql.functions import broadcast

t0 = tic()

# Mark the small dimension side with an explicit broadcast hint.
events_df = spark.table("events")
dim_b = broadcast(spark.table("dim_small"))

# `sum` is pyspark.sql.functions.sum (star-imported earlier in the file),
# not the Python builtin.
joined = events_df.join(dim_b, on=["city", "channel"], how="left")
join_broadcast = joined.groupBy("city", "channel").agg(
    sum("amount").alias("total_amount")
)
# count() is the action that actually executes the join.
join_broadcast.count()
toc(t0, "Join con broadcast")

In [0]:
part_path = f"{base_path}/delta/transactions"

# Number of partitions used for the write below. A high value is chosen on
# purpose so the output is split into many small files.
small_file_partitions = 1000

# Synthetic transactions: 50M rows with random customer, product and amount.
transactions = (
    spark.range(50_000_000)
    .withColumn("customer_id", (rand() * 1000).cast("int"))
    .withColumn("product_id", (rand() * 100).cast("int"))
    .withColumn("amount", (rand() * 100).cast("int"))
)

# Write the data to part_path. The original cell only defined the DataFrame,
# so the following cell's load of part_path had nothing to read.
(
    transactions.repartition(small_file_partitions)
    .write.format("delta")
    .mode("overwrite")
    .save(part_path)
)

In [0]:
# Expose the Delta data stored at part_path as the temp view "transactions".
transactions_on_disk = spark.read.format("delta").load(part_path)
transactions_on_disk.createOrReplaceTempView("transactions")

In [0]:
# Rebuild the catalog table from the "transactions" temp view on every run.
# CREATE OR REPLACE is used instead of CREATE ... IF NOT EXISTS: with the
# latter, a re-run keeps the table left by a previous run (already OPTIMIZEd
# and Z-ordered by later cells), so the "sin optimize" baseline below would no
# longer measure an unoptimized table.
spark.sql("""
    CREATE OR REPLACE TABLE sesion_10.default.transactions
    USING DELTA
    AS SELECT * FROM transactions
""")

In [0]:
t0 = tic()

# Baseline measurement: filtered aggregate, run before any OPTIMIZE statement.
baseline_sql = "SELECT SUM(amount) FROM sesion_10.default.transactions WHERE product_id=42"
spark.sql(baseline_sql).show()

toc(t0, "Consulta baseline (sin optimize)")

In [0]:
# Run OPTIMIZE on the transactions table.
optimize_stmt = "OPTIMIZE sesion_10.default.transactions;"
spark.sql(optimize_stmt)

In [0]:
t0 = tic()

# Same aggregate as the baseline cell, re-run after OPTIMIZE so the two
# timings can be compared.
spark.sql("SELECT SUM(amount) FROM sesion_10.default.transactions WHERE product_id=42").show()

# The label used to repeat the baseline text ("sin optimize"), which made the
# two measurements indistinguishable in the output.
toc(t0, "Consulta tras OPTIMIZE")

In [0]:
# Run OPTIMIZE again, this time with ZORDER BY on the two columns that the
# next query filters on.
zorder_stmt = "OPTIMIZE sesion_10.default.transactions ZORDER BY (customer_id, product_id);"
spark.sql(zorder_stmt)

In [0]:
t0 = tic()

# Lookup filtering on both Z-ordered columns; count() forces the query to run.
zorder_probe = spark.sql(
    "SELECT * FROM sesion_10.default.transactions WHERE customer_id=123 AND product_id=42"
)
zorder_probe.count()

toc(t0, "Consulta con ZORDER aplicado")

In [0]:
# Run VACUUM on the table with a retention window of 168 hours (7 days).
vacuum_stmt = "VACUUM sesion_10.default.transactions RETAIN 168 HOURS;"
spark.sql(vacuum_stmt)