In [0]:
#CELL 1 — Imports
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable
from datetime import datetime

print("✔ Libraries imported")

#CELL 2 — Paths
# raw_path: the source CSV to ingest.
# bronze_path: the location the Delta Bronze table is read from / written to.
raw_path = "/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv"
bronze_path = "/mnt/cti/bronze/ev_data"

# Echo both locations so the run output records what was used.
print(f"Raw: {raw_path}")
print(f"Bronze: {bronze_path}")

#CELL 3 — File Check
# Best-effort probe: read a single row of the raw CSV so a bad path shows up
# early. A failure is only printed (not re-raised), so execution continues.
# NOTE(review): every exception type is reported as "File missing", including
# ones unrelated to the path (permissions, cluster errors, ...).
print("Checking file...")

try:
    probe = spark.read.csv(raw_path, header=True).limit(1)
    probe.show()
except Exception as err:
    print("❌ File missing:", err)
else:
    print("✔ CSV file found.")

#CELL 4 — Read CSV
# Load the raw file, treating the first line as the header and letting Spark
# infer the column types.
csv_reader = spark.read.options(header="true", inferSchema="true")
df_raw = csv_reader.csv(raw_path)

display(df_raw)
df_raw.printSchema()

record_count = df_raw.count()
print("✔ Records loaded:", record_count)

#CELL 5 — Sample
# Print the first five rows without truncating long values.
df_raw.show(n=5, truncate=False)

#CELL 6 — No extraction needed
# The CSV is used as-is, so the "processed" frame is simply the raw frame.
df_processed = df_raw
print("✔ Using CSV data as-is.")

#CELL 7 — Sanitize Columns
def sanitize(df):
    """Return *df* with cleaned-up column names.

    Each column name has leading/trailing whitespace stripped and every
    remaining space replaced by an underscore. Column order and data are
    untouched.

    Args:
        df: A DataFrame-like object exposing ``columns`` and ``toDF``.

    Returns:
        A new DataFrame with the same columns under the cleaned names.
    """
    cleaned_names = [name.strip().replace(" ", "_") for name in df.columns]
    # Rename everything in one projection; chaining one rename call per
    # column would add a plan node per column and bloat wide schemas.
    return df.toDF(*cleaned_names)

df_clean = sanitize(df_processed)

#CELL 8 — Generate record_id and ingestion metadata
# record_id is the SHA-256 of every column rendered as a string (nulls become
# the literal "NULL") and joined with "||". Rows with identical content
# therefore share a record_id, which the following cells use as the
# deduplication / merge key.
# NOTE(review): a real "NULL" string, or a value containing "||", could hash
# to the same id as a different row — confirm this is acceptable. Changing
# the recipe would change ids already stored in Bronze.
cols = [
    F.coalesce(F.col(name).cast("string"), F.lit("NULL"))
    for name in df_clean.columns
]
row_fingerprint = F.sha2(F.concat_ws("||", *cols), 256)
ingested_at = F.current_timestamp()

# Metadata columns; year/month/day come from the ingestion time, not the data.
df_bronze_ready = (
    df_clean
    .withColumn("record_id", row_fingerprint)
    .withColumn("ingestion_timestamp", ingested_at)
    .withColumn("source", F.lit("EV_Range_Data"))
    .withColumn("file_name", F.lit(raw_path))
    .withColumn("year", F.year(ingested_at))
    .withColumn("month", F.month(ingested_at))
    .withColumn("day", F.dayofmonth(ingested_at))
)

#CELL 9 — Deduplicate the source batch
# Keep a single row per record_id, then report the batch size before and
# after so dropped duplicates are visible in the run output.
df_source = df_bronze_ready.dropDuplicates(["record_id"])

source_rows = df_bronze_ready.count()
print("Source before:", source_rows)
deduped_rows = df_source.count()
print("Source after dedup:", deduped_rows)

#CELL 10 — Repair existing Bronze table (only if exists)
# If a Delta table already exists at bronze_path, collapse rows sharing a
# record_id down to the one with the latest ingestion_timestamp.
# The table is rewritten ONLY when duplicates were actually found: an
# unconditional overwrite would rewrite the whole table and add a new table
# version on every run even when nothing changed.
if DeltaTable.isDeltaTable(spark, bronze_path):
    print("🔧 Cleaning existing Bronze table...")

    from pyspark.sql.window import Window

    df_target = spark.read.format("delta").load(bronze_path)

    # Rank rows within each record_id, newest ingestion first; rank 1 survives.
    w = Window.partitionBy("record_id").orderBy(F.col("ingestion_timestamp").desc())

    df_target_clean = (
        df_target
            .withColumn("rn", F.row_number().over(w))
            .filter("rn = 1")
            .drop("rn")
    )

    target_before = df_target.count()
    target_after = df_target_clean.count()
    print("Target before:", target_before)
    print("Target after dedup:", target_after)

    if target_after < target_before:
        (
            df_target_clean.write
                .format("delta")
                .mode("overwrite")
                .option("overwriteSchema", "true")
                .save(bronze_path)
        )

        print("✔ Bronze table repaired.")
    else:
        # Nothing to remove, so leave the existing files and history alone.
        print("✔ No duplicate record_id found — Bronze table left unchanged.")
else:
    print("✔ No existing Bronze table found — skipping repair.")

#CELL 11 — Single MERGE (Clean Source → Clean Target)
# First run: create the Bronze table, partitioned by year/month/day.
# Later runs: upsert the deduplicated batch into it, keyed on record_id.
# NOTE(review): record_id is derived from all source columns (cell 8), so a
# matched row presumably differs only in the metadata columns; the
# update-all clause then refreshes ingestion_timestamp/year/month/day on
# every re-ingest — confirm that is intended.
print("💾 Writing to Bronze...")

if not DeltaTable.isDeltaTable(spark, bronze_path):
    print("✔ Creating new Bronze table")

    bronze_writer = (
        df_source.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("year", "month", "day")
    )
    bronze_writer.save(bronze_path)

    print("✔ New Bronze table created.")
else:
    print("✔ MERGE into existing Bronze table")

    delta = DeltaTable.forPath(spark, bronze_path)

    # "t" is the existing table, "s" is the incoming batch.
    upsert = delta.alias("t").merge(
        df_source.alias("s"),
        "t.record_id = s.record_id",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print("✔ MERGE completed.")

#CELL 12 — Enable CDF
# Set the change-data-feed table property on the path-based Bronze table.
# This runs after the write above, on every execution.
enable_cdf_sql = f"""
    ALTER TABLE delta.`{bronze_path}`
    SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(enable_cdf_sql)
print("✔ CDF enabled.")

#CELL 13 — Summary
# Re-read the Bronze table and print the total row count plus row counts per
# source and per ingestion date partition.
df_verify = spark.read.format("delta").load(bronze_path)

total_records = df_verify.count()
print("Total records:", total_records)
for grouping in (["source"], ["year", "month", "day"]):
    df_verify.groupBy(*grouping).count().show()

#CELL 14 — Final Sample
# Display a handful of business columns next to the ingestion timestamp.
# NOTE(review): assumes the sanitized CSV header yields exactly these column
# names — confirm against the source file.
sample_columns = [
    "Model_Year", "Make", "Model", "Electric_Range",
    "latitude", "longitude", "ingestion_timestamp",
]
display(df_verify.select(*sample_columns))
print("✔ Bronze ingestion complete.")