In [1]:
from smartpool_config import *

# Build the Spark session through the project helper; the string argument is
# presumably the application name (create_spark is defined outside this notebook).
spark = create_spark("smartpool-ingest-electricity-csv")

# Echo the runtime configuration so the reader can confirm which storage
# locations and database this run targets before anything is written.
print("Spark OK:", spark.version)
for label, location in (
    ("BASE", BASE),
    ("BRONZE", BRONZE),
    ("SILVER", SILVER),
    ("GOLD", GOLD),
    ("STATE", STATE),
    ("JDBC", JDBC_URL),
):
    print(f"{label}:", location)


:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/jovyan/.ivy2.5.2/cache
The jars for the packages stored in: /home/jovyan/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-52acdccb-0ddd-4e1c-89d9-63ce3456415f;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 101ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf  

Spark OK: 4.0.1
BASE: s3a://spark/medallion
BRONZE: s3a://spark/medallion/bronze
SILVER: s3a://spark/medallion/silver
GOLD: s3a://spark/medallion/gold
STATE: s3a://spark/medallion/_state
JDBC: jdbc:sqlserver://sqlserver:1433;databaseName=smartpool;encrypt=true;trustServerCertificate=true;


In [2]:
# Landing (CSV) -> Medallion
# Storage locations used by the electricity-price pipeline in this notebook.
# Landing prefix: the raw CSV files are read from here.
LANDING_ELEC = f"s3a://{MINIO_BUCKET}/landing/electricity_prices"
# Bronze Delta table: typed rows are appended here.
BRONZE_ELEC  = f"{BRONZE}/electricity_prices"
# Silver Delta table: validated, de-duplicated snapshot.
SILVER_ELEC  = f"{SILVER}/electricity_prices"
# State Delta table: holds the `last_date` watermark used for incremental loads.
STATE_ELEC   = f"{STATE}/electricity_prices_files"

In [3]:
def delta_exists(path: str) -> bool:
    """Report whether ``path`` currently holds a Delta table.

    Delegates to ``DeltaTable.isDeltaTable`` using the notebook-level ``spark``
    session.

    Args:
        path: Storage location to probe.

    Returns:
        The result of the Delta check, or ``False`` if the check itself raised.
    """
    try:
        found = DeltaTable.isDeltaTable(spark, path)
    except Exception:
        # Best-effort probe: any failure is reported as "not a Delta table".
        found = False
    return found

def read_last_date():
    """Return the stored ``last_date`` watermark, or ``None`` when there is none.

    Reads at most one row from the Delta table at ``STATE_ELEC``.

    Returns:
        The ``last_date`` value of the first row, or ``None`` if the state
        location is not a Delta table or the table contains no rows.
    """
    if not delta_exists(STATE_ELEC):
        return None

    rows = spark.read.format("delta").load(STATE_ELEC).limit(1).collect()
    # A state table that exists but is empty yields an empty list; indexing it
    # with [0] would raise IndexError, so treat that case as "no watermark yet".
    if not rows:
        return None
    return rows[0]["last_date"]

def write_last_date(last_date: str):
    """Persist ``last_date`` as the new watermark in the state Delta table.

    The table at ``STATE_ELEC`` is overwritten with a single-row,
    single-column (``last_date``) frame, and a confirmation line is printed.

    Args:
        last_date: Watermark value to store.
    """
    state_df = spark.createDataFrame([(last_date,)], ["last_date"])

    # Overwrite (not append) so the table only ever holds the latest watermark.
    writer = state_df.write.format("delta").mode("overwrite")
    writer = writer.option("overwriteSchema", "true")
    writer.save(STATE_ELEC)

    print(f"[state:electricity] last_date={last_date} -> {STATE_ELEC}")

In [4]:
# Watermark left by a previous run (None when no state has been written yet).
last_date = read_last_date()
print("last_date state:", last_date)

# Every column is read as a string (inferSchema=false); types are applied
# explicitly below. recursiveFileLookup walks the whole landing prefix and
# disables partition inference, so `date` is taken from the CSV content rather
# than from a date=YYYY-MM-DD directory name (the alternative would be a
# basePath + "date=*/" glob read).
landing_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "false")
    .option("recursiveFileLookup", "true")
)

# Attach lineage columns: originating file, ingestion timestamp and its date.
raw = (
    landing_reader.csv(LANDING_ELEC)
    .withColumn("source_file", F.input_file_name())
    .withColumn("ingest_ts", F.current_timestamp())
    .withColumn("ingest_date", F.to_date(F.col("ingest_ts")))
)

# Normalize/cast the expected columns: (source column, target type, output name).
cast_plan = [
    ("ts_utc", "string", "ts_utc"),
    ("date", "string", "date_str"),
    ("hour", "int", "hour"),
    ("price_eur_mwh", "double", "price_eur_mwh"),
    ("price_eur_kwh", "double", "price_eur_kwh"),
    ("region", "string", "region"),
    ("source", "string", "source"),
]
typed_columns = [F.col(src).cast(dtype).alias(out) for src, dtype, out in cast_plan]
lineage_columns = [F.col(name) for name in ("source_file", "ingest_ts", "ingest_date")]

# `date` is parsed from its string form last, so it ends up as the final column.
df = (
    raw.select(*typed_columns, *lineage_columns)
    .withColumn("date", F.to_date("date_str"))
    .drop("date_str")
)

# Incremental load: keep only dates strictly after the watermark (if one exists).
df_new = df.filter(F.col("date") > F.to_date(F.lit(last_date))) if last_date else df

print("Rows leídas landing:", df.count())
print("Rows nuevas (por date):", df_new.count())

df_new.orderBy(F.col("date").desc(), F.col("hour").asc()).show(10, truncate=False)


26/01/25 22:39:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


last_date state: None


                                                                                

Rows leídas landing: 720
Rows nuevas (por date): 720
+--------------------+----+-------------+-------------+------+---------+--------------------------------------------------------------------------------------------------------+--------------------------+-----------+----------+
|ts_utc              |hour|price_eur_mwh|price_eur_kwh|region|source   |source_file                                                                                             |ingest_ts                 |ingest_date|date      |
+--------------------+----+-------------+-------------+------+---------+--------------------------------------------------------------------------------------------------------+--------------------------+-----------+----------+
|2026-02-13T00:00:00Z|0   |101.301      |0.101301     |ES    |synthetic|s3a://spark/landing/electricity_prices/date=2026-02-13/prices_2026-02-13_2026-01-25T22:38:57.012Z_29.csv|2026-01-25 22:39:52.800108|2026-01-25 |2026-02-13|
|2026-02-13T01:00:00Z|1   |86.611  

In [5]:
# Append the new rows to Bronze, then advance the `last_date` watermark.
# isEmpty() answers "is there anything to load?" without counting every row.
if df_new.isEmpty():
    print("[BRONZE:electricity] Nada nuevo que cargar.")
else:
    # Append-only write, partitioned by date; mergeSchema lets added columns through.
    (df_new.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .partitionBy("date")
        .save(BRONZE_ELEC))

    max_date = df_new.agg(F.max("date").alias("max_date")).collect()[0]["max_date"]
    if max_date is None:
        # max() is NULL when no row has a date. Storing str(None) == "None"
        # would leave a truthy but unparseable watermark that breaks the
        # incremental filter on the next run, so the state is left untouched.
        print("[state:electricity] max(date) es NULL; no se actualiza last_date.")
    else:
        write_last_date(str(max_date))

    print("[BRONZE:electricity] OK ->", BRONZE_ELEC)

                                                                                

[state:electricity] last_date=2026-02-13 -> s3a://spark/medallion/_state/electricity_prices_files
[BRONZE:electricity] OK -> s3a://spark/medallion/bronze/electricity_prices


In [6]:
bronze = spark.read.format("delta").load(BRONZE_ELEC)

# Minimal data-quality gate: keep only rows whose key fields are present and
# whose values fall in the accepted ranges.
# NOTE(review): the `>= 0` rule drops rows with negative prices — confirm that
# negative prices can never be legitimate for this feed.
silver_src = (bronze
    .filter(F.col("region").isNotNull())
    .filter(F.col("ts_utc").isNotNull())
    .filter(F.col("date").isNotNull())
    .filter(F.col("hour").between(0, 23))
    .filter(F.col("price_eur_mwh").isNotNull())
    .filter(F.col("price_eur_mwh") >= 0)
)

# Dedupe: keep the most recent row per (region, ts_utc), ranking by ingest_ts
# and breaking ties with source_file.
w = Window.partitionBy("region", "ts_utc").orderBy(F.col("ingest_ts").desc(), F.col("source_file").desc())

silver = (silver_src
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)
    .drop("rn")
    .withColumn("silver_ingest_ts", F.current_timestamp())
)

# Write Silver as a full snapshot (overwrite), partitioned by date.
(silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("date")
    .save(SILVER_ELEC))

# Count what was actually persisted. Calling silver.count() here would re-run
# the Bronze read, the filters and the window a second time just to print a number.
silver_rows = spark.read.format("delta").load(SILVER_ELEC).count()
print("[SILVER:electricity] OK ->", SILVER_ELEC, "rows:", silver_rows)

26/01/25 22:40:03 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 24:>                                                         (0 + 4) / 4]

[SILVER:electricity] OK -> s3a://spark/medallion/silver/electricity_prices rows: 720


                                                                                

In [7]:
# Re-read the persisted Silver table and run a few sanity checks on it.
s = spark.read.format("delta").load(SILVER_ELEC)
print("Silver rows:", s.count())

# Shared expressions for the summaries below.
row_count = F.count("*").alias("rows")
newest_first = F.col("date").desc()

# Rows per day, most recent day first.
s.groupBy("date").agg(row_count).orderBy(newest_first).show(20, truncate=False)

# Expected: 24 rows per day and region (if region is unique).
s.groupBy("date", "region").agg(row_count).orderBy(newest_first).show(20, truncate=False)

# Peek at 10 records.
s.orderBy(newest_first, F.col("hour").asc()).show(10, truncate=False)


                                                                                

Silver rows: 720
+----------+----+
|date      |rows|
+----------+----+
|2026-02-13|24  |
|2026-02-12|24  |
|2026-02-11|24  |
|2026-02-10|24  |
|2026-02-09|24  |
|2026-02-08|24  |
|2026-02-07|24  |
|2026-02-06|24  |
|2026-02-05|24  |
|2026-02-04|24  |
|2026-02-03|24  |
|2026-02-02|24  |
|2026-02-01|24  |
|2026-01-31|24  |
|2026-01-30|24  |
|2026-01-29|24  |
|2026-01-28|24  |
|2026-01-27|24  |
|2026-01-26|24  |
|2026-01-25|24  |
+----------+----+
only showing top 20 rows
+----------+------+----+
|date      |region|rows|
+----------+------+----+
|2026-02-13|ES    |24  |
|2026-02-12|ES    |24  |
|2026-02-11|ES    |24  |
|2026-02-10|ES    |24  |
|2026-02-09|ES    |24  |
|2026-02-08|ES    |24  |
|2026-02-07|ES    |24  |
|2026-02-06|ES    |24  |
|2026-02-05|ES    |24  |
|2026-02-04|ES    |24  |
|2026-02-03|ES    |24  |
|2026-02-02|ES    |24  |
|2026-02-01|ES    |24  |
|2026-01-31|ES    |24  |
|2026-01-30|ES    |24  |
|2026-01-29|ES    |24  |
|2026-01-28|ES    |24  |
|2026-01-27|ES    |24  |
|

In [8]:
# Stop the Spark session created in the first cell; nothing after this point can use `spark`.
spark.stop()