In [0]:
%sql
-- Create one volume per pipeline layer (bronze, silver, gold) in the
-- workspace.ecommerce schema. IF NOT EXISTS makes each statement a no-op when
-- the volume is already present, so this cell can be re-run safely.
-- NOTE(review): the raw CSV is later read from a separate `ecommerce_data`
-- volume that is not created here -- confirm it is provisioned elsewhere.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.bronze;
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.silver;
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.gold;

## Bronze Layer — ingest the raw CSV into Delta, adding an ingestion timestamp

In [0]:
from pyspark.sql.functions import current_timestamp

# Source file and Bronze Delta target for this load.
raw_csv_path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"
bronze_path = "/Volumes/workspace/ecommerce/bronze/ecommerce_events"

# Load the raw CSV: the first row supplies column names and Spark infers the
# column types from the data.
oct_df = spark.read.csv(raw_csv_path, header=True, inferSchema=True)

# Tag every row with the time it was ingested.
bronze_df = oct_df.withColumn("ingestion_time", current_timestamp())

# Persist to the Bronze Delta location, replacing any previous contents.
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path)
)


In [0]:
# Validate what actually landed in Bronze by reading the Delta table back.
# The in-memory `bronze_df` is still a lazy plan over the raw CSV, so running
# the checks on it would re-scan the CSV for every action below and would never
# touch the data that was written.
bronze_written_df = spark.read.format("delta").load(bronze_path)

# 1. Check row count
print("Bronze row count:", bronze_written_df.count())

# 2. Preview data
display(bronze_written_df.limit(10))

# 3. Check schema (as persisted, including the inferred column types)
bronze_written_df.printSchema()

## Silver Layer — filter invalid events and remove duplicates

In [0]:
from pyspark.sql.functions import col

# Read Bronze data
bronze_df = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/bronze/ecommerce_events"
)

# Deduplicate on the event's own columns only. `ingestion_time` is load
# metadata: if it were part of the key, the same event ingested at two
# different times would never be recognised as a duplicate. Rows from a single
# load share one timestamp, so for a single-batch Bronze table this matches a
# plain dropDuplicates().
dedup_columns = [c for c in bronze_df.columns if c != "ingestion_time"]

# Clean and validate data:
#   - keep only rows that have a user_id
#   - keep only the three recognised event types
#   - drop negative prices (rows with a missing price are kept)
#   - remove duplicate events
silver_df = (
    bronze_df
    .filter(col("user_id").isNotNull())
    .filter(col("event_type").isin("view", "cart", "purchase"))
    .filter((col("price").isNull()) | (col("price") >= 0))
    .dropDuplicates(dedup_columns)
)

# Silver Delta path
silver_path = "/Volumes/workspace/ecommerce/silver/ecommerce_events_clean"

# Write to Silver Delta table, replacing any previous contents
silver_df.write \
    .format("delta") \
    .mode("overwrite") \
    .save(silver_path)


In [0]:
# Validate the persisted Silver table rather than the lazy `silver_df` plan:
# every action on `silver_df` would re-run the filters and the de-duplication
# over Bronze, and would not prove anything about what was written.
silver_written_df = spark.read.format("delta").load(silver_path)

# 1. Compare row counts
print("Bronze rows:", bronze_df.count())
print("Silver rows:", silver_written_df.count())

# 2. Check invalid user_id
null_user_id_count = silver_written_df.filter(col("user_id").isNull()).count()
print("Null user_id count:", null_user_id_count)

# 3. Check invalid event types
silver_written_df.groupBy("event_type").count().show()

# 4. Check negative prices
negative_price_count = silver_written_df.filter(col("price") < 0).count()
print("Negative price count:", negative_price_count)

# 5. Preview cleaned data
display(silver_written_df.limit(10))

# Fail loudly if the invariants enforced by the Silver filters do not hold.
# Placed last so every diagnostic above is still shown when a check fails.
assert null_user_id_count == 0, "Silver contains rows with a null user_id"
assert negative_price_count == 0, "Silver contains rows with a negative price"

## Gold Layer — daily sales metrics aggregated from purchase events

In [0]:
from pyspark.sql.functions import col, to_date, count, countDistinct
# Imported under an alias so the Python builtin `sum` is not shadowed for the
# rest of the notebook.
from pyspark.sql.functions import sum as spark_sum

# Read Silver data
silver_df = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/silver/ecommerce_events_clean"
)

# Create business aggregates: one row per calendar date, purchases only.
# NOTE(review): `total_orders` counts purchase *events*; if one order can
# produce several purchase rows this over-counts orders -- confirm.
# NOTE(review): to_date() derives the date from `event_time` as Spark parsed
# it; confirm the session time zone matches the intended reporting day.
gold_df = (
    silver_df
    .filter(col("event_type") == "purchase")
    .withColumn("event_date", to_date("event_time"))
    .groupBy("event_date")
    .agg(
        spark_sum("price").alias("total_revenue"),  # null prices are ignored
        count("*").alias("total_orders"),
        countDistinct("user_id").alias("unique_customers")
    )
)

# Gold Delta path
gold_path = "/Volumes/workspace/ecommerce/gold/daily_sales_metrics"

# Write to Gold Delta table, replacing any previous contents
gold_df.write \
    .format("delta") \
    .mode("overwrite") \
    .save(gold_path)


In [0]:

# Validate the persisted Gold table. Running the checks on the lazy `gold_df`
# would re-aggregate Silver for every action and never inspect what was
# written.
gold_written_df = spark.read.format("delta").load(gold_path)

# 1. Preview metrics
display(gold_written_df.orderBy("event_date").limit(10))

# 2. Check for duplicate dates (the table is grouped by event_date, so each
#    date must appear exactly once)
duplicate_date_count = (
    gold_written_df.count()
    - gold_written_df.select("event_date").distinct().count()
)
print("Duplicate dates:", duplicate_date_count)

# 3. Basic sanity checks
gold_written_df.select(
    "event_date",
    "total_revenue",
    "total_orders",
    "unique_customers"
).summary().show()

# Placed last so the diagnostics above are still shown when the check fails.
assert duplicate_date_count == 0, "Gold contains more than one row per event_date"