In [1]:
from smartpool_config import *

spark = create_spark("smartpool-ingestion")

# Echo the Spark version and the configured locations as a quick sanity check.
print("Spark OK:", spark.version)
print(
    *(
        f"{label}: {value}"
        for label, value in (
            ("BASE", BASE),
            ("BRONZE", BRONZE),
            ("SILVER", SILVER),
            ("GOLD", GOLD),
            ("STATE", STATE),
            ("JDBC", JDBC_URL),
        )
    ),
    sep="\n",
)


:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/jovyan/.ivy2.5.2/cache
The jars for the packages stored in: /home/jovyan/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-af1d6aef-95a3-4365-b176-b7e02c118292;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 123ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf  

Spark OK: 4.0.1
BASE: s3a://spark/medallion
BRONZE: s3a://spark/medallion/bronze
SILVER: s3a://spark/medallion/silver
GOLD: s3a://spark/medallion/gold
STATE: s3a://spark/medallion/_state
JDBC: jdbc:sqlserver://sqlserver:1433;databaseName=smartpool;encrypt=true;trustServerCertificate=true;


In [2]:
def jdbc_read(query: str):
    """Build a JDBC read of ``query`` using the configured URL, credentials and driver.

    The result of ``load()`` is returned as-is.
    """
    reader = spark.read.format("jdbc").options(
        url=JDBC_URL,
        query=query,
        user=JDBC_USER,
        password=JDBC_PASS,
        driver=JDBC_DRIVER,
    )
    return reader.load()

def state_path(table: str) -> str:
    """Return the location under ``STATE`` used for the watermark of ``table``."""
    return "{}/{}".format(STATE, table)

def read_watermark(table: str):
    """
    Load the stored watermark for ``table``.

    Returns a ``(last_updated_at_str, last_pk)`` tuple. ``last_updated_at_str`` is an
    ISO timestamp string with 7 fractional digits, e.g. ``2026-01-25T18:37:37.8426983``.
    When there is no Delta table at the state path the result is ``(None, 0)``.
    """
    location = state_path(table)

    # Guard clause: no state table yet means no watermark has been recorded.
    if not DeltaTable.isDeltaTable(spark, location):
        return None, 0

    state_df = spark.read.format("delta").load(location)
    first_rows = state_df.limit(1).collect()
    state_row = first_rows[0]
    return state_row["last_updated_at_str"], int(state_row["last_pk"])

def write_watermark(table: str, last_updated_at_str: str, last_pk: int):
    """
    Store ``(last_updated_at_str, last_pk)`` as the watermark of ``table``.

    The state path is overwritten with a one-row Delta table (schema overwrite
    enabled), and the stored values are echoed to stdout.
    """
    target = state_path(table)
    column_names = ["last_updated_at_str", "last_pk"]
    rows = [(last_updated_at_str, int(last_pk))]

    writer = spark.createDataFrame(rows, column_names).write.format("delta")
    writer = writer.mode("overwrite").option("overwriteSchema", "true")
    writer.save(target)

    print(f"[state:{table}] last_updated_at_str={last_updated_at_str}, last_pk={last_pk} -> {target}")

def ingest_incremental(table: str, pk: str, updated_col: str = "updated_at"):
    """
    Append new or changed rows of ``dbo.<table>`` to the table's Bronze Delta path.

    The stored watermark ``(last_updated_at_str, last_pk)`` selects the rows to read:
    rows whose ``updated_col`` is later than the watermark, or equal to it with a
    larger ``pk``. Without a stored watermark the whole table is read. After the
    append, the watermark is advanced to the maximum ``updated_col`` of the batch and
    the maximum ``pk`` among the rows carrying that timestamp.

    Parameters
    ----------
    table : str
        Table name; used for the ``dbo.<table>`` source, the Bronze path and the state path.
    pk : str
        Name of the key column used as tie-breaker between rows with equal timestamps.
    updated_col : str
        Name of the last-modified column compared against the watermark.

    Raises
    ------
    ValueError
        If rows were read but ``updated_col`` is NULL in all of them, so no watermark
        can be derived. Nothing is written in that case.

    Notes
    -----
    ``table``, ``pk`` and ``updated_col`` are interpolated into the SQL text, so they
    must be trusted identifiers, never user input.
    """
    bronze_path = f"{BRONZE}/{table}"
    last_ts_str, last_pk = read_watermark(table)

    if last_ts_str is None:
        print(f"[{table}] No hay state, FULL LOAD inicial")
        where = "1=1"
    else:
        print(f"[{table}] State detectado, last_updated_at_str={last_ts_str}, last_pk={last_pk}")
        where = f"""
            {updated_col} > CAST('{last_ts_str}' AS datetime2(7))
            OR ({updated_col} = CAST('{last_ts_str}' AS datetime2(7)) AND {pk} > {last_pk})
        """

    # The timestamp is also selected as a string so the watermark keeps all 7
    # fractional digits of datetime2(7); Spark timestamps only hold microseconds.
    ts_str_col = f"{updated_col}_str"
    query = f"""
        SELECT
            *,
            CONVERT(varchar(33), {updated_col}, 126) AS {ts_str_col}
        FROM dbo.{table}
        WHERE {where}
    """

    # A JDBC DataFrame is lazy: without caching, count(), the Bronze write and the
    # watermark aggregations would each re-run the query against the source. Rows
    # arriving in between could then push the watermark past data that was never
    # appended. Persisting makes every step below work on the same batch (barring
    # recomputation after a lost cached partition).
    df_new = jdbc_read(query).persist()
    try:
        cnt = df_new.count()
        print(f"[{table}] Nuevas filas leídas: {cnt}")

        if cnt == 0:
            print(f"[{table}] Nada que cargar, fin.")
            return

        # Derive the watermark BEFORE writing, so a failure here cannot leave Bronze
        # appended without a matching state update (which would duplicate rows on
        # the next run).
        max_ts_str = df_new.agg(F.max(ts_str_col).alias("max_ts")).collect()[0]["max_ts"]
        if max_ts_str is None:
            raise ValueError(
                f"[{table}] No se puede calcular el watermark: "
                f"{updated_col} es NULL en todas las filas leídas"
            )
        # Tie-breaker: highest key among the rows that share the maximum timestamp.
        max_pk = (
            df_new.filter(F.col(ts_str_col) == F.lit(max_ts_str))
            .agg(F.max(pk).alias("max_pk"))
            .collect()[0]["max_pk"]
        )
        new_last_pk = int(max_pk)

        # Append to Bronze (raw incremental), without the helper string column.
        (
            df_new.drop(ts_str_col)
            .write.format("delta")
            .mode("append")
            .save(bronze_path)
        )
        print(f"[{table}] Append Bronze OK -> {bronze_path}")

        write_watermark(table, max_ts_str, new_last_pk)
    finally:
        # Always release the cached batch, including on early return or error.
        df_new.unpersist()


In [18]:
# Incremental load of the pools dimension, keyed by pool_id.
ingest_incremental(
    table="pools_dim",
    pk="pool_id",
    updated_col="updated_at",
)


[pools_dim] State detectado, last_updated_at_str=2026-01-25T19:07:16.2612979, last_pk=6
[pools_dim] Nuevas filas leídas: 0
[pools_dim] Nada que cargar, fin.


In [19]:
# Incremental load of the maintenance events, keyed by id.
ingest_incremental(
    table="maintenance_events",
    pk="id",
    updated_col="updated_at",
)

                                                                                

[maintenance_events] State detectado, last_updated_at_str=2026-01-25T01:10:41.8562930, last_pk=14
[maintenance_events] Nuevas filas leídas: 2


                                                                                

[maintenance_events] Append Bronze OK -> s3a://spark/medallion/bronze/maintenance_events
[state:maintenance_events] last_updated_at_str=2026-01-25T19:07:21.8205643, last_pk=1003 -> s3a://spark/medallion/_state/maintenance_events


In [20]:
# Show the Bronze rows and the stored watermark for each ingested table.
for table_name in ("pools_dim", "maintenance_events"):
    print(f"=== BRONZE {table_name} ===")
    spark.read.format("delta").load(f"{BRONZE}/{table_name}").show()
    print(f"=== STATE {table_name} ===")
    spark.read.format("delta").load(f"{STATE}/{table_name}").show(truncate=False)


=== BRONZE pools_dim ===
+-------+--------------------+--------------------+-------------+---------+-------------+--------------------+
|pool_id|           pool_name|            location|volume_liters|is_heated|   owner_type|          updated_at|
+-------+--------------------+--------------------+-------------+---------+-------------+--------------------+
|      1| Piscina Casa Pueblo|Valdeganga (Albac...|        40000|    false|      private|2026-01-25 18:37:...|
|      2|  Piscina Villa Mila|Valdeganga (Albac...|        70000|    false|      private|2026-01-25 18:37:...|
|      3|Piscina Airbnb Rural|              Cuenca|        35000|    false|       airbnb|2026-01-25 18:37:...|
|      4|Piscina Hotel Centro|              Madrid|        60000|     true|        hotel|2026-01-25 18:37:...|
|      5|Piscina Polidepor...|         Ciudad Real|        80000|     true|sports_center|2026-01-25 18:37:...|
|      6|Piscina Prueba In...|            Albacete|        42000|     true|      privat

                                                                                

+----+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
|  id|pool_id|          event_time|intervention_type|product_type|product_amount|               notes|          updated_at|
+----+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
|   1|      1|2026-01-15 01:10:...|         chlorine|    dichloro|         250.0|Tratamiento de ch...|2026-01-25 01:10:...|
|   2|      1|2026-01-18 01:10:...|    ph_correction|       minus|         150.0|Ajuste de pH tras...|2026-01-25 01:10:...|
|   3|      1|2026-01-22 01:10:...|  filter_backwash|        NULL|          NULL|Lavado de filtro ...|2026-01-25 01:10:...|
|   4|      2|2026-01-17 01:10:...|         chlorine|    tricloro|         200.0|Mantenimiento rut...|2026-01-25 01:10:...|
|   5|      2|2026-01-21 01:10:...|           refill|        NULL|        1500.0|Relleno por evapo...|2026-01-25 01:10:...|
|   6|  

                                                                                

+---------------------------+-------+
|last_updated_at_str        |last_pk|
+---------------------------+-------+
|2026-01-25T19:07:21.8205643|1003   |
+---------------------------+-------+



In [None]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()