In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# =========
# MINIO / S3A
# =========
MINIO_ENDPOINT = "http://minio:9000"
MINIO_ACCESS_KEY = "minioadmin"
MINIO_SECRET_KEY = "minioadmin123"
MINIO_BUCKET = "spark"

BASE   = f"s3a://{MINIO_BUCKET}/medallion"
BRONZE = f"{BASE}/bronze"
SILVER = f"{BASE}/silver"
GOLD   = f"{BASE}/gold"

# =========
# JARS EXTRA
# =========
EXTRA_JARS = ",".join([
    "/opt/spark/jars/hadoop-aws-3.4.0.jar",
    "/opt/spark/jars/hadoop-common-3.4.0.jar",
    "/opt/spark/jars/aws-java-sdk-bundle-2.23.19.jar",
    "/opt/spark/jars/mssql-jdbc-12.10.2.jre11.jar",
    "/opt/spark/jars/delta-spark_2.13-4.0.0.jar",
    "/opt/spark/jars/delta-storage-4.0.0.jar",
    "/opt/spark/jars/antlr4-runtime-4.13.1.jar",
    "/opt/spark/jars/spark-sql-kafka-0-10_2.13-4.0.1.jar",
    "/opt/spark/jars/kafka-clients-3.9.1.jar",
])

builder = (
    SparkSession.builder
    .appName("test-smartpool")
    .master("local[*]")  # spark://spark-master:7077 más adelante

    # Delta
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

    # MinIO / S3A
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    # Jars extra
    .config("spark.jars", EXTRA_JARS)
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("WARN")

print("Spark OK:", spark.version)
print("BASE:", BASE)


:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/jovyan/.ivy2.5.2/cache
The jars for the packages stored in: /home/jovyan/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-502ee05e-e713-45fa-ac89-e7656071afd6;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 103ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf  

Spark OK: 4.0.1
BASE: s3a://spark/medallion


In [3]:
jdbc_url = (
    "jdbc:sqlserver://sqlserver:1433;"
    "databaseName=smartpool;"
    "encrypt=true;"
    "trustServerCertificate=true;"
)

df_pools = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "dbo.pools_dim")
    .option("user", "sa")
    .option("password", "Password1234%")  # pon aquí exactamente el SA_PASSWORD del docker-compose
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

df_pools.show()
df_pools.printSchema()


+-------+--------------------+--------------------+-------------+---------+-------------+
|pool_id|           pool_name|            location|volume_liters|is_heated|   owner_type|
+-------+--------------------+--------------------+-------------+---------+-------------+
|      1| Piscina Casa Pueblo|Valdeganga (Albac...|        40000|    false|      private|
|      2|  Piscina Villa Mila|Valdeganga (Albac...|        70000|    false|      private|
|      3|Piscina Airbnb Rural|              Cuenca|        35000|    false|       airbnb|
|      4|Piscina Hotel Centro|              Madrid|        60000|     true|        hotel|
|      5|Piscina Polidepor...|         Ciudad Real|        80000|     true|sports_center|
+-------+--------------------+--------------------+-------------+---------+-------------+

root
 |-- pool_id: integer (nullable = true)
 |-- pool_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- volume_liters: integer (nullable = true)
 |-- is_heated: b

In [4]:
bronze_pools_path = f"{BRONZE}/pools_dim"

(
    df_pools
    .write
    .format("delta")
    .mode("overwrite")
    .save(bronze_pools_path)
)

print("Escrito en:", bronze_pools_path)


26/01/25 01:32:57 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
26/01/25 01:32:59 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to medallion/bronze/pools_dim/part-00000-ae806f2f-718b-4fbf-a420-8d27393b56e2-c000.snappy.parquet. This is Unsupported
                                                                                

Escrito en: s3a://spark/medallion/bronze/pools_dim


In [5]:
df_pools_bronze = (
    spark.read
    .format("delta")
    .load(bronze_pools_path)
)

df_pools_bronze.show()
df_pools_bronze.printSchema()


26/01/25 01:33:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+--------------------+--------------------+-------------+---------+-------------+
|pool_id|           pool_name|            location|volume_liters|is_heated|   owner_type|
+-------+--------------------+--------------------+-------------+---------+-------------+
|      1| Piscina Casa Pueblo|Valdeganga (Albac...|        40000|    false|      private|
|      2|  Piscina Villa Mila|Valdeganga (Albac...|        70000|    false|      private|
|      3|Piscina Airbnb Rural|              Cuenca|        35000|    false|       airbnb|
|      4|Piscina Hotel Centro|              Madrid|        60000|     true|        hotel|
|      5|Piscina Polidepor...|         Ciudad Real|        80000|     true|sports_center|
+-------+--------------------+--------------------+-------------+---------+-------------+

root
 |-- pool_id: integer (nullable = true)
 |-- pool_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- volume_liters: integer (nullable = true)
 |-- is_heated: b

In [7]:
df_heated = df_pools_bronze.filter("is_heated = true")
df_heated.show()

+-------+--------------------+-----------+-------------+---------+-------------+
|pool_id|           pool_name|   location|volume_liters|is_heated|   owner_type|
+-------+--------------------+-----------+-------------+---------+-------------+
|      4|Piscina Hotel Centro|     Madrid|        60000|     true|        hotel|
|      5|Piscina Polidepor...|Ciudad Real|        80000|     true|sports_center|
+-------+--------------------+-----------+-------------+---------+-------------+



In [8]:
from pyspark.sql import functions as F

jdbc_url = (
    "jdbc:sqlserver://sqlserver:1433;"
    "databaseName=smartpool;"
    "encrypt=true;"
    "trustServerCertificate=true;"
)

df_maint = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "dbo.maintenance_events")
    .option("user", "sa")
    .option("password", "Password1234%")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

df_maint.show()
df_maint.printSchema()
print("Registros en maintenance_events:", df_maint.count())

+---+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
| id|pool_id|          event_time|intervention_type|product_type|product_amount|               notes|          updated_at|
+---+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
|  1|      1|2026-01-15 01:10:...|         chlorine|    dichloro|         250.0|Tratamiento de ch...|2026-01-25 01:10:...|
|  2|      1|2026-01-18 01:10:...|    ph_correction|       minus|         150.0|Ajuste de pH tras...|2026-01-25 01:10:...|
|  3|      1|2026-01-22 01:10:...|  filter_backwash|        NULL|          NULL|Lavado de filtro ...|2026-01-25 01:10:...|
|  4|      2|2026-01-17 01:10:...|         chlorine|    tricloro|         200.0|Mantenimiento rut...|2026-01-25 01:10:...|
|  5|      2|2026-01-21 01:10:...|           refill|        NULL|        1500.0|Relleno por evapo...|2026-01-25 01:10:...|
|  6|      2|202

In [9]:
bronze_maint_path = f"{BRONZE}/maintenance_events"

(
    df_maint
    .write
    .format("delta")
    .mode("overwrite")  # full load inicial
    .save(bronze_maint_path)
)

print("Maintenance events escritos en:", bronze_maint_path)

Maintenance events escritos en: s3a://spark/medallion/bronze/maintenance_events


In [10]:
df_maint_bronze = (
    spark.read
    .format("delta")
    .load(bronze_maint_path)
)

df_maint_bronze.show()
df_maint_bronze.printSchema()
print("Registros en Bronze:", df_maint_bronze.count())


+---+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
| id|pool_id|          event_time|intervention_type|product_type|product_amount|               notes|          updated_at|
+---+-------+--------------------+-----------------+------------+--------------+--------------------+--------------------+
|  1|      1|2026-01-15 01:10:...|         chlorine|    dichloro|         250.0|Tratamiento de ch...|2026-01-25 01:10:...|
|  2|      1|2026-01-18 01:10:...|    ph_correction|       minus|         150.0|Ajuste de pH tras...|2026-01-25 01:10:...|
|  3|      1|2026-01-22 01:10:...|  filter_backwash|        NULL|          NULL|Lavado de filtro ...|2026-01-25 01:10:...|
|  4|      2|2026-01-17 01:10:...|         chlorine|    tricloro|         200.0|Mantenimiento rut...|2026-01-25 01:10:...|
|  5|      2|2026-01-21 01:10:...|           refill|        NULL|        1500.0|Relleno por evapo...|2026-01-25 01:10:...|
|  6|      2|202

In [11]:
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

jdbc_url = (
    "jdbc:sqlserver://sqlserver:1433;"
    "databaseName=smartpool;"
    "encrypt=true;"
    "trustServerCertificate=true;"
)

bronze_maint_path = f"{BRONZE}/maintenance_events"
state_maint_path  = f"{STATE}/maintenance_events_state"


def delta_table_exists(path: str) -> bool:
    try:
        (
            spark.read
            .format("delta")
            .load(path)
            .limit(1)
            .collect()
        )
        return True
    except AnalysisException:
        return False


def load_last_event_time(path: str):
    """
    Devuelve el último event_time guardado en la tabla de estado,
    o None si aún no existe.
    """
    try:
        df_state = (
            spark.read
            .format("delta")
            .load(path)
        )
        row = (
            df_state
            .orderBy(F.col("last_event_time").desc())
            .limit(1)
            .collect()
        )
        if row:
            return row[0]["last_event_time"]
        return None
    except AnalysisException:
        return None


NameError: name 'STATE' is not defined