In [0]:
print("=== DAY 6: MEDALLION ARCHITECTURE ===\n")

from pyspark.sql import functions as F

# Base path for Delta tables
base_path = "/Volumes/workspace/ecommerce/ecommerce_data"

=== DAY 6: MEDALLION ARCHITECTURE ===



##1: Design 3-layer architecture

In [0]:
print("--- TASK 1: 3-Layer Architecture ---")
print("Bronze: Raw data (as-is from CSV)")
print("Silver: Cleaned data (validated, deduplicated)")
print("Gold: Aggregated data (business metrics)")
print("✓ Architecture designed\n")

--- TASK 1: 3-Layer Architecture ---
Bronze: Raw data (as-is from CSV)
Silver: Cleaned data (validated, deduplicated)
Gold: Aggregated data (business metrics)
✓ Architecture designed



##2: Build Bronze layer (raw ingestion)

In [0]:
print("--- TASK 2: Building BRONZE Layer ---")

# Read raw CSV
raw_csv_path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"
raw = spark.read.csv(raw_csv_path, header=True, inferSchema=True)

print(f"Raw CSV loaded: {raw.count():,} rows")

# Save to Bronze using a working path
bronze_path = f"{base_path}/medallion/bronze/events"
raw.withColumn("ingestion_timestamp", F.current_timestamp()) \
   .withColumn("source_file", F.lit("2019-Oct.csv")) \
   .write.format("delta").mode("overwrite").save(bronze_path)

print(f"✓ BRONZE saved: {bronze_path}")
print(f"Bronze rows: {spark.read.format('delta').load(bronze_path).count():,}")

--- TASK 2: Building BRONZE Layer ---
Raw CSV loaded: 42,448,764 rows
✓ BRONZE saved: /Volumes/workspace/ecommerce/ecommerce_data/medallion/bronze/events
Bronze rows: 42,448,764


##3: Build Silver layer (cleaned data)

In [0]:
print("\n--- TASK 3: Building SILVER Layer (FIXED) ---")

# Read from Bronze
bronze = spark.read.format("delta").load(bronze_path)

# Apply PROPER cleaning and validation
silver = bronze.filter(
    (F.col("price").isNotNull()) &
    (F.col("price") > 0) &
    (F.col("price") < 100000)
).filter(
    F.col("event_type").isin(["view", "cart", "purchase"])
).filter(
    F.col("event_time").isNotNull()
).dropDuplicates(["user_id", "product_id", "event_time"]) \
    .withColumn("brand_cleaned", 
                F.when(F.col("brand").isNull(), "unknown").otherwise(F.col("brand"))) \
    .withColumn("category_cleaned",
                F.when(F.col("category_code").isNull(), "uncategorized").otherwise(F.col("category_code"))) \
    .withColumn("event_date", F.to_date("event_time")) \
    .withColumn("event_hour", F.hour("event_time")) \
    .withColumn("price_segment",
        F.when(F.col("price") < 50, "low")
         .when(F.col("price") < 200, "medium")
         .when(F.col("price") < 500, "high")
         .otherwise("premium")) \
    .drop("brand", "category_code") \
    .withColumnRenamed("brand_cleaned", "brand") \
    .withColumnRenamed("category_cleaned", "category_code")

# Save to Silver
silver_path = f"{base_path}/medallion/silver/events"
silver.write.format("delta").mode("overwrite").save(silver_path)

print(f"✓ SILVER saved: {silver_path}")
print(f"Silver rows: {spark.read.format('delta').load(silver_path).count():,}")

print("\nSample from cleaned Silver (no nulls in brand/category):")
display(silver.limit(5))


--- TASK 3: Building SILVER Layer (FIXED) ---
✓ SILVER saved: /Volumes/workspace/ecommerce/ecommerce_data/medallion/silver/events
Silver rows: 42,339,185

Sample from cleaned Silver (no nulls in brand/category):


event_time,event_type,product_id,category_id,price,user_id,user_session,ingestion_timestamp,source_file,brand,category_code,event_date,event_hour,price_segment
2019-10-13T06:26:09.000Z,view,5100395,2053013553341792533,37.32,555925739,ab17936d-8f29-4c12-afd0-4d32d7349fb7,2026-01-14T13:04:19.212Z,2019-Oct.csv,wonlex,electronics.clocks,2019-10-13,6,low
2019-10-13T06:27:55.000Z,view,1003317,2053013555631882655,952.15,524823618,ed04bca2-3539-496b-a415-58ffa6e5b5b5,2026-01-14T13:04:19.212Z,2019-Oct.csv,apple,electronics.smartphone,2019-10-13,6,premium
2019-10-13T06:29:09.000Z,view,1004857,2053013555631882655,130.71,559559345,42fee0bf-9cd7-49f7-b15e-90b3e36f50a8,2026-01-14T13:04:19.212Z,2019-Oct.csv,samsung,electronics.smartphone,2019-10-13,6,medium
2019-10-13T06:29:53.000Z,view,1002544,2053013555631882655,460.51,559245653,0c13818b-375a-4662-bd8f-e2b1866c5fb3,2026-01-14T13:04:19.212Z,2019-Oct.csv,apple,electronics.smartphone,2019-10-13,6,high
2019-10-13T06:31:36.000Z,view,3701328,2053013565983425517,1261.27,556917297,36e89b73-f98b-459d-93ea-852f3d20d68e,2026-01-14T13:04:19.212Z,2019-Oct.csv,irobot,appliances.environment.vacuum,2019-10-13,6,premium


##4: Build Gold layer (aggregates)

In [0]:
print("\n--- TASK 4: Building GOLD Layer (FIXED) ---")

# Read from Silver
silver_df = spark.read.format("delta").load(silver_path)

# GOLD 1: Product Performance
print("\n1. Product Performance Gold Table")
product_gold = silver_df.groupBy("product_id", "category_code") \
    .agg(
        F.count("*").alias("total_events"),
        F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("views"),
        F.sum(F.when(F.col("event_type") == "cart", 1).otherwise(0)).alias("cart_adds"),
        F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchases"),
        F.sum(F.when(F.col("event_type") == "purchase", F.col("price"))).alias("revenue"),
        F.countDistinct("user_id").alias("unique_users")
    ).withColumn("conversion_rate", 
                 F.when(F.col("views") > 0, 
                        F.round(F.col("purchases") / F.col("views") * 100, 2))
                  .otherwise(0)) \
     .withColumn("cart_to_purchase_rate",
                 F.when(F.col("cart_adds") > 0,
                        F.round(F.col("purchases") / F.col("cart_adds") * 100, 2))
                  .otherwise(0)) \
     .withColumn("avg_revenue_per_user",
                 F.when(F.col("unique_users") > 0,
                        F.round(F.col("revenue") / F.col("unique_users"), 2))
                  .otherwise(0))

product_gold_path = f"{base_path}/medallion/gold/product_performance"

# Save with schema evolution option
product_gold.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(product_gold_path)

print(f"✓ Product Gold saved: {product_gold_path}")

# Show sample results
print("\nGold table schema:")
product_gold.printSchema()

print("\nTop 5 products by revenue:")
top_products = spark.read.format("delta").load(product_gold_path) \
    .orderBy(F.desc("revenue")) \
    .limit(5)
display(top_products)


--- TASK 4: Building GOLD Layer (FIXED) ---

1. Product Performance Gold Table
✓ Product Gold saved: /Volumes/workspace/ecommerce/ecommerce_data/medallion/gold/product_performance

Gold table schema:
root
 |-- product_id: integer (nullable = true)
 |-- category_code: string (nullable = true)
 |-- total_events: long (nullable = false)
 |-- views: long (nullable = true)
 |-- cart_adds: long (nullable = true)
 |-- purchases: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- unique_users: long (nullable = false)
 |-- conversion_rate: double (nullable = true)
 |-- cart_to_purchase_rate: double (nullable = true)
 |-- avg_revenue_per_user: double (nullable = true)


Top 5 products by revenue:


product_id,category_code,total_events,views,cart_adds,purchases,revenue,conversion_rate,cart_to_purchase_rate,unique_users,avg_revenue_per_user
1005115,electronics.smartphone,355251,327673,15037,12541,12404835.950000076,3.83,83.4,171002,72.54
1005105,electronics.smartphone,215278,197865,10120,7293,10239248.68,3.69,72.07,114823,89.17
1004249,electronics.smartphone,230560,207359,14113,9088,6728639.860000001,4.38,64.39,96997,69.37
1005135,electronics.smartphone,108477,100196,5063,3218,5567806.640000001,3.21,63.56,62652,88.87
1004767,electronics.smartphone,436494,378548,36142,21804,5430222.719999998,5.76,60.33,175611,30.92


### VERIFICATION

In [0]:
print("\n" + "="*60)
print("DAY 6 COMPLETED!")
print("="*60)

print("\nCreated layers:")
print(f"1. BRONZE: {bronze_path}")
print(f"2. SILVER: {silver_path}")
print(f"3. GOLD - Products: {product_gold_path}")
print(f"4. GOLD - Users: {user_gold_path}")

#  sample from each layer
print("\nSample from each layer:")

print("\nBRONZE (raw):")
display(spark.read.format("delta").load(bronze_path).limit(3))

print("\nSILVER (cleaned):")
display(spark.read.format("delta").load(silver_path).limit(3))

print("\nGOLD - Products:")
display(spark.read.format("delta").load(product_gold_path).limit(3))


DAY 6 COMPLETED!

Created layers:
1. BRONZE: /Volumes/workspace/ecommerce/ecommerce_data/medallion/bronze/events
2. SILVER: /Volumes/workspace/ecommerce/ecommerce_data/medallion/silver/events
3. GOLD - Products: /Volumes/workspace/ecommerce/ecommerce_data/medallion/gold/product_performance
4. GOLD - Users: /Volumes/workspace/ecommerce/ecommerce_data/medallion/gold/user_summary

Sample from each layer:

BRONZE (raw):


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,ingestion_timestamp,source_file
2019-10-23T09:44:06.000Z,view,2800638,2053013563835941749,appliances.kitchen.refrigerators,,194.32,523560041,f2da82d4-3aca-4ebb-a5a7-2a176537da35,2026-01-14T13:04:19.212Z,2019-Oct.csv
2019-10-23T09:44:06.000Z,view,1306746,2053013558920217191,computers.notebook,lenovo,873.61,532561846,d2a62c6a-8529-4692-b562-f815296b5c34,2026-01-14T13:04:19.212Z,2019-Oct.csv
2019-10-23T09:44:06.000Z,view,4700549,2053013560899928785,auto.accessories.videoregister,neoline,143.89,550105456,d4d16d85-4c7a-48eb-b1dc-64a836985aff,2026-01-14T13:04:19.212Z,2019-Oct.csv



SILVER (cleaned):


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,ingestion_timestamp,source_file,event_date,event_hour,price_segment
2019-10-13T06:26:05.000Z,purchase,1004857,2053013555631882655,electronics.smartphone,samsung,130.71,559693152,de3b170f-89fc-479f-945c-5b1f3574a870,2026-01-14T13:04:19.212Z,2019-Oct.csv,2019-10-13,6,medium
2019-10-13T06:26:18.000Z,view,1003317,2053013555631882655,electronics.smartphone,apple,952.15,512691790,2484906b-0bd2-aae9-0706-437a660106b5,2026-01-14T13:04:19.212Z,2019-Oct.csv,2019-10-13,6,premium
2019-10-13T06:26:32.000Z,view,1307444,2053013558920217191,computers.notebook,asus,424.7,559393464,bf15f947-9418-40c9-a10d-82eb0a267e7f,2026-01-14T13:04:19.212Z,2019-Oct.csv,2019-10-13,6,high



GOLD - Products:


product_id,category_code,total_events,views,cart_adds,purchases,revenue,conversion_rate,cart_to_purchase_rate,unique_users,avg_revenue_per_user
45300050,kids.swing,336,333,0,3,136.91,0.9,0.0,208,0.66
1004529,electronics.smartphone,13868,13688,40,140,54104.63999999999,1.02,350.0,7774,6.96
26201108,uncategorized,1757,1719,0,38,3306.0,2.21,0.0,1178,2.81
