In [0]:
# =========================
# WALMART ETL PIPELINE
# =========================

# Stage 1 - Extract: load the raw (bronze) Walmart table from the catalog.
bronze_df = spark.table("workspace.default.walmart")

# Render the raw rows in the notebook for a quick visual check.
display(bronze_df)


# Stage 2 - Transform to silver: keep only complete, unique rows.
# Rows containing a null in any column are dropped first, then exact
# duplicate rows are collapsed into one.
silver_df = bronze_df.na.drop().distinct()

display(silver_df)

# Persist the cleaned data as a Delta table, replacing any existing contents.
silver_df.write.saveAsTable(
    "workspace.mlops.walmart_silver",
    format="delta",
    mode="overwrite",
)


# Stage 3 - Transform to gold: feature table for analytics / ML.
# NOTE: `col` is imported here but not used in the code shown.
from pyspark.sql.functions import col

# Currently a straight pass-through of every silver column; the feature
# selection is still to be refined for ML.
gold_df = silver_df.select("*")

display(gold_df)

# Persist the gold data as a Delta table, replacing any existing contents.
gold_df.write.saveAsTable(
    "workspace.mlops.walmart_gold",
    format="delta",
    mode="overwrite",
)

# Final status message once all three layers have been written.
print("✅ ETL Pipeline Completed: Bronze → Silver → Gold")
