### Objetivo del laboratorio
Ingerir un lote base y un lote incremental con duplicados/actualizaciones, deduplicar y consolidar una tabla Silver consistente.

**Capas:**
- **Bronze**: lee el CSV indicado por el parámetro `file` (p. ej., ventas_base) y lo escribe como Delta raw.

- **Silver**: limpieza + deduplicación + upsert con MERGE

Info: 
Clave de negocio (BK, Business Key): `id_venta`, `id_linea`

In [0]:
# Register the "file" text widget (empty default) and read its current value.
# widgets.text() only declares the widget, so its result is not kept.
dbutils.widgets.text("file", "")
file = dbutils.widgets.get("file")

# Fail fast: an empty value would otherwise only surface later as a
# missing "<source dir>/.csv" path.
if not file:
    raise ValueError(
        'Widget "file" is empty; set it to the name of the CSV to ingest (without the .csv extension).'
    )

In [0]:
# Catalog and schema names used to build three-part table identifiers.
# Nothing is created here; these are plain string constants.
catalog_name = "sesion_08"
schema_bronze, schema_silver, schema_gold = "bronze", "silver", "gold"

In [0]:
# Create the Bronze landing table only when it does not exist yet.
# IF NOT EXISTS (rather than OR REPLACE) preserves rows appended by earlier
# runs, so loading an incremental file does not wipe the base batch.
# All business columns are kept as strings; ingest_at is a timestamp column.
# NOTE(review): an already existing table with a different schema is left
# untouched by this statement.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_bronze}.ventas(
  id_venta string,
  id_linea string,
  id_tienda string,
  id_cliente string,
  id_producto string,
  fecha_venta string,
  cantidad string,
  precio_unitario string,
  updated_at string,
  ingest_at timestamp
)
"""
)

In [0]:
# Create the typed Silver table only when it does not exist yet.
# IF NOT EXISTS (rather than OR REPLACE) keeps the rows consolidated by
# earlier runs; replacing the table on every run would empty it, so a
# MERGE against it could never match an existing row.
# NOTE(review): an already existing table with a different schema is left
# untouched by this statement.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_silver}.ventas(
  id_venta string,
  id_linea integer,
  id_tienda integer,
  id_cliente integer,
  id_producto integer,
  fecha_venta date,
  cantidad integer,
  precio_unitario decimal(18,2),
  updated_at timestamp
)
"""
)

### Bronze

In [0]:
from pyspark.sql.functions import col, to_date, to_timestamp, row_number, current_timestamp, current_date, sha2, concat_ws, lit, expr
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from delta.tables import DeltaTable
import uuid

In [0]:
# Source location: the CSV to load is "<path_base>/<file>.csv".
path_base = "/Volumes/sesion_08/default/source"
path_ventas = "/".join([path_base, f"{file}.csv"])

In [0]:
# Read the CSV using its first row as the header, then add an ingest_at
# column holding the current timestamp.
csv_reader = spark.read.option("header", True)
ventas = csv_reader.csv(path_ventas).withColumn("ingest_at", current_timestamp())

In [0]:
# Expose the freshly read batch to SQL.
ventas.createOrReplaceTempView("ventas_bronze")

# Name the columns explicitly on both sides of the INSERT: SELECT * maps
# columns by position, so a CSV with reordered or extra columns would load
# values into the wrong fields or fail on a column-count mismatch.
bronze_columns = ", ".join([
    "id_venta",
    "id_linea",
    "id_tienda",
    "id_cliente",
    "id_producto",
    "fecha_venta",
    "cantidad",
    "precio_unitario",
    "updated_at",
    "ingest_at",
])

spark.sql(f"""
INSERT INTO {catalog_name}.{schema_bronze}.ventas ({bronze_columns})
SELECT {bronze_columns} FROM ventas_bronze
""")

In [0]:
# Show the current contents of the Bronze table.
bronze_preview = spark.table(f"{catalog_name}.{schema_bronze}.ventas")
bronze_preview.display()

In [0]:
# Build the typed batch from the raw read: cast the integer columns, parse
# the date/timestamp columns, fix the price precision, and drop the
# ingest_at column.
silver_ventas = ventas
for int_column in ("id_linea", "id_tienda", "id_cliente", "id_producto", "cantidad"):
    silver_ventas = silver_ventas.withColumn(int_column, col(int_column).cast("int"))

silver_ventas = (
    silver_ventas
    .withColumn("fecha_venta", to_date(col("fecha_venta")))
    .withColumn("precio_unitario", col("precio_unitario").cast("decimal(18,2)"))
    .withColumn("updated_at", to_timestamp(col("updated_at")))
    .drop("ingest_at")
)

In [0]:
# Rank rows within each (id_venta, id_linea) pair, newest updated_at first.
windows_dedup = Window.partitionBy("id_venta", "id_linea").orderBy(col("updated_at").desc())

# Keep only the top-ranked row per pair, then remove the helper column.
# The original dropped "depup" (typo), a silent no-op that left the
# "dedup" rank column in the result.
silver_ventas_dedup = (
    silver_ventas
    .withColumn("dedup", row_number().over(windows_dedup))
    .filter(col("dedup") == 1)
    .drop("dedup")
)

In [0]:
# Upsert the deduplicated batch ("in") into the Silver table ("m").
target = DeltaTable.forName(spark, f"{catalog_name}.{schema_silver}.ventas")

# Rows are matched on the (id_venta, id_linea) pair.
match_on_keys = (
    (col("m.id_venta") == col("in.id_venta")) &
    (col("m.id_linea") == col("in.id_linea"))
)

key_columns = ["id_venta", "id_linea"]
payload_columns = [
    "id_tienda",
    "id_cliente",
    "id_producto",
    "fecha_venta",
    "cantidad",
    "precio_unitario",
    "updated_at",
]

# Matched rows overwrite only the non-key columns; new rows insert all columns.
update_assignments = {name: col(f"in.{name}") for name in payload_columns}
insert_assignments = {name: col(f"in.{name}") for name in key_columns + payload_columns}

merge = (
    target.alias("m")
    .merge(silver_ventas_dedup.alias("in"), match_on_keys)
    # Update only when the incoming row is strictly newer than the stored one.
    .whenMatchedUpdate(
        condition="in.updated_at > m.updated_at",
        set=update_assignments,
    )
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)

In [0]:
# Show the Silver table after the merge.
silver_preview = spark.table(f"{catalog_name}.{schema_silver}.ventas")
silver_preview.display()

In [0]:
# Show the Silver table again (this cell issues the same query as the previous one).
silver_table_name = f"{catalog_name}.{schema_silver}.ventas"
spark.table(silver_table_name).display()