In [0]:
# BRONZE: Raw ingestion

from pyspark.sql.functions import current_timestamp
# Bronze layer: land the 2019-Oct.csv file in Delta without transforming it.
# Column names come from the header row and column types are inferred by Spark;
# the only column added here is `ingestion_ts`, set from current_timestamp().
raw = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)
bronze = raw.withColumn("ingestion_ts", current_timestamp())
(
    bronze.write
    .format("delta")
    .mode("overwrite")  # each run replaces the previous bronze contents
    .save("/Volumes/workspace/ecommerce/bronze/events/")
)

In [0]:
from pyspark.sql import functions as F
# SILVER: Cleaned Data
# Silver layer: validate, de-duplicate and enrich the bronze events.
#
# Reads the bronze Delta table, keeps rows whose price lies strictly between
# MIN_PRICE and MAX_PRICE (a null price fails both comparisons, so those rows
# are dropped as well), removes repeated events, derives `event_date` and
# `price_tier`, and overwrites the silver Delta table.
MIN_PRICE = 0        # exclusive lower bound on price
MAX_PRICE = 10000    # exclusive upper bound on price

# Columns that together identify one event. De-duplicating on
# (user_session, event_time) alone would collapse distinct events that share a
# session and timestamp -- e.g. a view and a purchase, or events on two
# different products -- and dropDuplicates keeps an arbitrary survivor, so a
# purchase could silently disappear from the downstream revenue figures.
EVENT_KEY = ["user_session", "event_time", "event_type", "product_id", "user_id"]

bronze = spark.read.format("delta").load("/Volumes/workspace/ecommerce/bronze/events/")

price_col = F.col("price")
silver = (
    bronze
    .filter((price_col > MIN_PRICE) & (price_col < MAX_PRICE))
    .dropDuplicates(EVENT_KEY)
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn(
        "price_tier",
        # Branches are tested in order: < 10 is "budget", then < 50 is "mid",
        # everything else is "premium".
        F.when(price_col < 10, "budget")
        .when(price_col < 50, "mid")
        .otherwise("premium"),
    )
)

silver.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/silver/events/")

In [0]:
# GOLD: per-day, per-product performance metrics built from the silver table.
#
# Output columns (one row per event_date / category_code / product_id / brand):
#   views           - number of distinct users with a "view" event
#   purchases       - number of distinct users with a "purchase" event
#   revenue         - sum of price over "purchase" events
#   conversion_rate - purchases / views * 100, null when views is 0


def _value_for_event(event_name, value_column):
    """Return `value_column` on rows whose event_type equals `event_name`, null elsewhere."""
    return F.when(F.col("event_type") == event_name, F.col(value_column))


silver = (
    spark.read.format("delta").load("/Volumes/workspace/ecommerce/silver/events/")
    # Helper columns are null for non-matching events, so the aggregates below
    # (which skip nulls) only see the event type they are meant to measure.
    .withColumn("view_user_id", _value_for_event("view", "user_id"))
    .withColumn("purchase_user_id", _value_for_event("purchase", "user_id"))
    .withColumn("purchase_price", _value_for_event("purchase", "price"))
)

product_keys = ["event_date", "category_code", "product_id", "brand"]
product_totals = silver.groupBy(*product_keys).agg(
    F.countDistinct("view_user_id").alias("views"),
    F.countDistinct("purchase_user_id").alias("purchases"),
    F.sum("purchase_price").alias("revenue"),
)

# A `when` without `otherwise` yields null for unmatched rows, so products with
# zero views get a null conversion_rate rather than a division by zero.
purchase_pct = F.col("purchases") / F.col("views") * 100
product_perf = product_totals.withColumn(
    "conversion_rate",
    F.when(F.col("views") > 0, purchase_pct),
)

(
    product_perf.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/gold/products/")
)