In [0]:
# --- Notebook parameters -------------------------------------------------
# Each widget is declared and then read straight away, so the definition of
# a parameter and the variable holding its value sit next to each other.

# Root folder under which the raw data and the per-layer outputs live.
dbutils.widgets.text(
    "source_path",
    "/Volumes/workspace/ecommerce",
    "Source Path",
)
source_path = dbutils.widgets.get("source_path")

# Which medallion layer this run should build.
dbutils.widgets.dropdown(
    "layer",
    "bronze",
    ["bronze", "silver", "gold"],
    "Layer",
)
layer = dbutils.widgets.get("layer")

print(f"Running ETL for layer: {layer} using source path: {source_path}")

from pyspark.sql.functions import current_timestamp, col, to_date, sum, count, countDistinct

# Namespace used for CREATE VOLUME when it cannot be derived from source_path.
_DEFAULT_VOLUME_NAMESPACE = "workspace.ecommerce"


def _volume_namespace(source_path):
    """Return the ``catalog.schema`` owning the volumes under ``source_path``.

    A path of the exact form ``/Volumes/<catalog>/<schema>`` (an optional
    trailing slash is tolerated) yields ``<catalog>.<schema>``. Any other
    shape, or a component containing characters other than letters, digits
    and underscores, yields the original hard-coded ``workspace.ecommerce``
    so nothing unquoted or unexpected is interpolated into SQL.
    """
    import re

    if isinstance(source_path, str) and source_path.startswith("/Volumes/"):
        parts = source_path.strip("/").split("/")
        if len(parts) == 3 and all(
            re.fullmatch(r"[A-Za-z0-9_]+", part) for part in parts[1:]
        ):
            return f"{parts[1]}.{parts[2]}"
    return _DEFAULT_VOLUME_NAMESPACE


def _ensure_volume(source_path, volume_name):
    """Create the volume backing ``{source_path}/{volume_name}`` if missing."""
    namespace = _volume_namespace(source_path)
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {namespace}.{volume_name}")


def _run_bronze(source_path):
    """Ingest the raw October CSV into the Bronze Delta table."""
    # Local alias avoids depending on file-level names such as ``sum`` and
    # ``count`` that shadow Python builtins.
    from pyspark.sql import functions as F

    print("Executing Bronze Layer...")
    _ensure_volume(source_path, "bronze")

    # Read raw CSV data (schema is inferred from the file contents).
    raw_df = spark.read.csv(
        f"{source_path}/ecommerce_data/2019-Oct.csv",
        header=True,
        inferSchema=True,
    )

    bronze_path = f"{source_path}/bronze/ecommerce_events"
    (
        raw_df
        .withColumn("ingestion_time", F.current_timestamp())
        .write.format("delta")
        .mode("overwrite")
        .save(bronze_path)
    )

    # Sanity checks run against the persisted table. Re-using the lazy
    # DataFrame would re-read the CSV for every action and re-evaluate
    # current_timestamp(), showing an ingestion_time that was never stored.
    bronze_df = spark.read.format("delta").load(bronze_path)
    print("Bronze row count:", bronze_df.count())
    display(bronze_df.limit(10))
    bronze_df.printSchema()


def _run_silver(source_path):
    """Clean the Bronze events and persist them as the Silver Delta table."""
    from pyspark.sql import functions as F

    print("Executing Silver Layer...")
    _ensure_volume(source_path, "silver")

    bronze_df = spark.read.format("delta").load(
        f"{source_path}/bronze/ecommerce_events"
    )

    # Keep rows with a user, a known event type and a non-negative price.
    # Rows whose price is NULL are deliberately retained.
    price = F.col("price")
    cleaned_df = (
        bronze_df
        .filter(F.col("user_id").isNotNull())
        .filter(F.col("event_type").isin("view", "cart", "purchase"))
        .filter(price.isNull() | (price >= 0))
        .dropDuplicates()
    )

    silver_path = f"{source_path}/silver/ecommerce_events_clean"
    cleaned_df.write.format("delta").mode("overwrite").save(silver_path)

    # Validate what was written. Each action on ``cleaned_df`` would repeat
    # the full filter + dropDuplicates shuffle over the Bronze table.
    silver_df = spark.read.format("delta").load(silver_path)
    print("Bronze rows:", bronze_df.count())
    print("Silver rows:", silver_df.count())
    print(
        "Null user_id count:",
        silver_df.filter(F.col("user_id").isNull()).count(),
    )
    silver_df.groupBy("event_type").count().show()
    print("Negative price count:", silver_df.filter(F.col("price") < 0).count())
    display(silver_df.limit(10))


def _run_gold(source_path):
    """Aggregate Silver purchase events into daily sales metrics (Gold)."""
    from pyspark.sql import functions as F

    print("Executing Gold Layer...")
    _ensure_volume(source_path, "gold")

    silver_df = spark.read.format("delta").load(
        f"{source_path}/silver/ecommerce_events_clean"
    )

    # One row per calendar date of purchase events. ``total_orders`` counts
    # purchase rows, not distinct orders.
    daily_df = (
        silver_df
        .filter(F.col("event_type") == "purchase")
        .withColumn("event_date", F.to_date("event_time"))
        .groupBy("event_date")
        .agg(
            F.sum("price").alias("total_revenue"),
            F.count("*").alias("total_orders"),
            F.countDistinct("user_id").alias("unique_customers"),
        )
    )

    gold_path = f"{source_path}/gold/daily_sales_metrics"
    daily_df.write.format("delta").mode("overwrite").save(gold_path)

    # Check the persisted aggregate instead of re-aggregating Silver for
    # each of the actions below.
    gold_df = spark.read.format("delta").load(gold_path)
    display(gold_df.orderBy("event_date").limit(10))
    print(
        "Duplicate dates:",
        gold_df.count() - gold_df.select("event_date").distinct().count(),
    )
    gold_df.select(
        "event_date",
        "total_revenue",
        "total_orders",
        "unique_customers",
    ).summary().show()


def run_layer(layer_name, source_path):
    """Run one layer of the e-commerce medallion ETL.

    Args:
        layer_name: ``"bronze"``, ``"silver"`` or ``"gold"`` (case-sensitive).
        source_path: Root folder containing ``ecommerce_data/2019-Oct.csv``
            and receiving the ``bronze/``, ``silver/`` and ``gold/`` Delta
            outputs. When it has the form ``/Volumes/<catalog>/<schema>``
            the layer's volume is created in that catalog and schema;
            otherwise ``workspace.ecommerce`` is used.

    Returns:
        None. Each layer overwrites its Delta output and prints or displays
        sanity checks computed from the data that was actually written.

    Raises:
        ValueError: If ``layer_name`` is not one of the known layers. This is
            raised before any Spark work is started.
    """
    runners = {
        "bronze": _run_bronze,
        "silver": _run_silver,
        "gold": _run_gold,
    }
    # The isinstance guard keeps unhashable inputs on the ValueError path
    # instead of failing inside the dict lookup.
    runner = runners.get(layer_name) if isinstance(layer_name, str) else None
    if runner is None:
        raise ValueError(f"Unknown layer: {layer_name}")
    runner(source_path)

# Kick off the ETL for the layer selected in the notebook widgets.
run_layer(layer_name=layer, source_path=source_path)

Running ETL for layer: silver using source path: /Volumes/workspace/ecommerce
Executing Silver Layer...
Bronze rows: 42448764
Silver rows: 42418544
Null user_id count: 0
+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  742773|
|      cart|  898443|
|      view|40777328|
+----------+--------+

Negative price count: 0


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,ingestion_time
2019-10-13T06:27:52.000Z,view,14100052,2053013557670314521,electronics.audio.acoustic,yamaha,117.09,520838884,0425c94c-c4bf-4a47-b60c-8e037619c153,2026-01-16T13:03:41.849Z
2019-10-13T06:27:18.000Z,view,26403418,2053013563651392361,,,123.81,518923889,13efbbea-3a9d-4fbc-b4ad-b93d742a3a73,2026-01-16T13:03:41.849Z
2019-10-13T06:27:32.000Z,view,1004856,2053013555631882655,electronics.smartphone,samsung,131.64,523993629,02400a23-1627-406a-bea2-a633bb8cc976,2026-01-16T13:03:41.849Z
2019-10-13T06:27:37.000Z,view,34700109,2061717937420501730,,,6.04,559431257,b05396aa-b436-4a4b-829a-208046aacdac,2026-01-16T13:03:41.849Z
2019-10-13T06:28:37.000Z,cart,1004720,2053013555631882655,electronics.smartphone,huawei,126.9,547139919,50b7f361-4483-4af0-94c1-2458ce96c0f5,2026-01-16T13:03:41.849Z
2019-10-13T06:26:39.000Z,purchase,1307135,2053013558920217191,computers.notebook,hp,262.3,529774733,c764bbfb-b514-4c89-91c0-499ef37bbccf,2026-01-16T13:03:41.849Z
2019-10-13T06:26:49.000Z,view,2601793,2053013563970159485,,artel,138.69,558158002,482e12e5-30e3-4298-90f0-ebd975f5f2df,2026-01-16T13:03:41.849Z
2019-10-13T06:27:44.000Z,view,3701244,2053013565983425517,appliances.environment.vacuum,elenberg,33.43,512606317,55c21396-4e8e-4527-9ada-8eea22c1a5e4,2026-01-16T13:03:41.849Z
2019-10-13T06:28:32.000Z,view,3400018,2053013555178897795,,panasonic,177.84,516242874,92652680-cefe-41da-ab3b-2136032d74ad,2026-01-16T13:03:41.849Z
2019-10-13T06:26:11.000Z,view,5100575,2053013553341792533,electronics.clocks,apple,435.78,513383052,fe42f7d7-8050-4471-ba06-ed732b0895fa,2026-01-16T13:03:41.849Z
