In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
data = [
    (1, 101, "view", "2024-01-01 10:00:00", 0.0),
    (1, 101, "purchase", "2024-01-01 10:05:00", 299.0),
    (2, 102, "view", "2024-01-01 11:00:00", 0.0),
    (2, 102, "purchase", "2024-01-01 11:10:00", 499.0),
    (3, 103, "view", "2024-01-02 09:00:00", 0.0),
    (3, 103, "purchase", "2024-01-02 09:20:00", 199.0),
]

schema = StructType([
    StructField("user_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("event_type", StringType()),
    StructField("event_time", StringType()),
    StructField("price", DoubleType())
])

raw_df = spark.createDataFrame(data, schema) \
    .withColumn("event_time", F.to_timestamp("event_time"))

raw_df.show()


+-------+----------+----------+-------------------+-----+
|user_id|product_id|event_type|         event_time|price|
+-------+----------+----------+-------------------+-----+
|      1|       101|      view|2024-01-01 10:00:00|  0.0|
|      1|       101|  purchase|2024-01-01 10:05:00|299.0|
|      2|       102|      view|2024-01-01 11:00:00|  0.0|
|      2|       102|  purchase|2024-01-01 11:10:00|499.0|
|      3|       103|      view|2024-01-02 09:00:00|  0.0|
|      3|       103|  purchase|2024-01-02 09:20:00|199.0|
+-------+----------+----------+-------------------+-----+



In [0]:
bronze_df = raw_df.withColumn(
    "ingestion_ts",
    F.current_timestamp()
)

bronze_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.bronze_events")


In [0]:
silver_df = spark.table("ecommerce.bronze_events") \
    .filter((F.col("price") >= 0) & (F.col("price") < 10000)) \
    .dropDuplicates(["user_id", "product_id", "event_time"]) \
    .withColumn("event_date", F.to_date("event_time")) \
    .withColumn(
        "price_tier",
        F.when(F.col("price") == 0, "free")
         .when(F.col("price") < 200, "budget")
         .when(F.col("price") < 400, "mid")
         .otherwise("premium")
    )

silver_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.silver_events")


In [0]:
gold_df = spark.table("ecommerce.silver_events") \
    .groupBy("product_id") \
    .agg(
        F.count(F.when(F.col("event_type") == "view", True)).alias("views"),
        F.count(F.when(F.col("event_type") == "purchase", True)).alias("purchases"),
        F.sum(F.when(F.col("event_type") == "purchase", F.col("price"))).alias("revenue")
    ) \
    .withColumn(
        "conversion_rate",
        F.when(F.col("views") > 0,
               F.col("purchases") / F.col("views") * 100)
    )

gold_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.gold_product_metrics")


In [0]:
%sql
DROP TABLE IF EXISTS ecommerce.events_delta;


In [0]:
silver_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.events_delta")


In [0]:
updates_df = spark.createDataFrame(
    [
        (1, 101, "purchase", "2024-01-01 10:05:00", 319.0),  # price update
        (4, 104, "purchase", "2024-01-03 12:00:00", 599.0)  # new record
    ],
    schema
).withColumn("event_time", F.to_timestamp("event_time"))


In [0]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forName(
    spark,
    "ecommerce.events_delta"
)

delta_table.alias("t").merge(
    updates_df.alias("s"),
    """
    t.user_id = s.user_id AND
    t.product_id = s.product_id AND
    t.event_time = s.event_time
    """
).whenMatchedUpdate(
    set={
        "price": "s.price",
        "event_type": "s.event_type"
    }
).whenNotMatchedInsert(
    values={
        "user_id": "s.user_id",
        "product_id": "s.product_id",
        "event_time": "s.event_time",
        "event_type": "s.event_type",
        "price": "s.price",
        "event_date": "to_date(s.event_time)",
        "price_tier": "'premium'"
    }
).execute()


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
SELECT * FROM ecommerce.events_delta;


user_id,product_id,event_type,event_time,price,ingestion_ts,event_date,price_tier
1,101,view,2024-01-01T10:00:00.000Z,0.0,2026-01-14T17:37:07.491Z,2024-01-01,free
3,103,view,2024-01-02T09:00:00.000Z,0.0,2026-01-14T17:37:07.491Z,2024-01-02,free
3,103,purchase,2024-01-02T09:20:00.000Z,199.0,2026-01-14T17:37:07.491Z,2024-01-02,budget
2,102,purchase,2024-01-01T11:10:00.000Z,499.0,2026-01-14T17:37:07.491Z,2024-01-01,premium
2,102,view,2024-01-01T11:00:00.000Z,0.0,2026-01-14T17:37:07.491Z,2024-01-01,free
1,101,purchase,2024-01-01T10:05:00.000Z,319.0,2026-01-14T17:37:07.491Z,2024-01-01,mid
4,104,purchase,2024-01-03T12:00:00.000Z,599.0,,2024-01-03,premium
