In [None]:
# ==================================================================
# BRONZE TO SILVER PROCESSING
# ==================================================================
# Initialize Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the notebook's Spark session (reusing an existing one if present),
# named for this job and with partitionOverwriteMode set to "dynamic".
spark = (
    SparkSession.builder
    .appName("Ecom_Bronze_to_Silver")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

# Mount ADLS containers (run once per cluster)
def mount_adls(container_name, mount_point, storage_account="adlsretail",
               secret_scope="key-vault", secret_key="storage-key"):
    """Mount an Azure storage container under ``/mnt/<mount_point>``.

    The call is idempotent: if something is already mounted at the target
    path the function returns without touching it, so the notebook can be
    re-run on a cluster where the mounts already exist.

    Args:
        container_name: Name of the blob container to mount.
        mount_point: Directory name under ``/mnt`` to mount the container at.
        storage_account: Storage account that owns the container.
        secret_scope: Secret scope holding the storage account key.
        secret_key: Name of the secret inside ``secret_scope``.

    Returns:
        None.
    """
    target = f"/mnt/{mount_point}"

    # dbutils.fs.mount raises when the mount point is already in use, which
    # would abort every re-run of this notebook. Skip existing mounts.
    # NOTE(review): an existing mount pointing at a different source is left
    # as-is; unmount it manually if the source must change.
    if any(existing.mountPoint == target for existing in dbutils.fs.mounts()):
        return

    host = f"{storage_account}.blob.core.windows.net"
    dbutils.fs.mount(
        source=f"wasbs://{container_name}@{host}",
        mount_point=target,
        # The account key is read from the secret store, never hard-coded.
        extra_configs={
            f"fs.azure.account.key.{host}": dbutils.secrets.get(secret_scope, secret_key)
        },
    )

# Mount all containers
# Each medallion-layer container is mounted at a mount point of the same name.
for _container in ("bronze", "silver", "gold"):
    mount_adls(_container, _container)

# Read raw data from Bronze layer
# Events: first row is the header; column types are inferred from the data.
events_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/mnt/bronze/events/*.csv")
)

# Products: header row only; no schema inference is requested here.
products_df = (
    spark.read
    .format("csv")
    .options(header="true")
    .load("/mnt/bronze/products/*.csv")
)

# Users: header row only; no schema inference is requested here.
users_df = (
    spark.read
    .format("csv")
    .options(header="true")
    .load("/mnt/bronze/users/*.csv")
)

# ================ EVENTS TRANSFORMATION ================
# Clean and structure events data
# Map the raw event_type codes onto the Silver vocabulary; anything that is
# not view / cart / purchase becomes "other" and is dropped by the filter below.
_event_type_label = (
    when(col("event_type") == "view", "product_view")
    .when(col("event_type") == "cart", "cart_add")
    .when(col("event_type") == "purchase", "purchase")
    .otherwise("other")
)

silver_events = (
    events_df
    # Parse the timestamp first; event_date is derived from the parsed value.
    .withColumn("event_time", to_timestamp(col("event_time")))
    .withColumn("event_date", to_date(col("event_time")))
    .withColumn("event_type", _event_type_label)
    # Keep only the three recognised event types.
    .where(col("event_type").isin(["product_view", "cart_add", "purchase"]))
    # One row per (user, product, timestamp).
    .drop_duplicates(["user_id", "product_id", "event_time"])
    .withColumn("price", col("price").cast(DecimalType(10, 2)))
    .select(
        "event_time",
        "event_date",
        "event_type",
        "user_id",
        "product_id",
        "category_id",
        "price",
    )
)

# ================ PRODUCTS TRANSFORMATION ================
# Clean product catalog
# Build the cleaned product dimension from the raw catalog rows.
silver_products = (
    products_df
    # The source `category_id` column is exposed downstream as `category_code`.
    .withColumnRenamed("category_id", "category_code")
    # category_name is the text before the first "." of the category code.
    # The pattern is a raw string: "\." in a normal string literal is an
    # invalid escape sequence that Python warns about.
    .withColumn("category_name", split(col("category_code"), r"\.").getItem(0))
    .withColumn("brand", initcap(col("brand")))
    .withColumn("price", col("price").cast(DecimalType(10, 2)))
    # Keeps strictly positive prices; rows whose price is null (including
    # values that could not be cast) do not satisfy the predicate and are dropped.
    .filter(col("price") > 0)
    # One row per product_id; which duplicate survives is not specified here.
    .dropDuplicates(["product_id"])
    .select(
        "product_id", "brand", "price",
        "category_code", "category_name"
    )
)

# ================ USERS TRANSFORMATION ================
# Clean user data
# Build the cleaned user dimension from the raw user rows.
silver_users = (
    users_df
    .withColumn("signup_date", to_date(col("signup_date")))
    # Missing locations become "unknown". Spark's CSV reader yields null for
    # empty fields by default, so null and blank are both treated as missing.
    .withColumn(
        "location",
        when(col("location").isNull() | (trim(col("location")) == ""), "unknown")
        .otherwise(col("location")),
    )
    # Country is the last comma-separated part of the location.
    # element_at accepts negative indices (-1 = last element); getItem(-1)
    # does not, and would leave this column empty. trim removes the space
    # that follows the comma in values such as "City, Country".
    .withColumn("country", trim(element_at(split(col("location"), ","), -1)))
    # One row per user_id; which duplicate survives is not specified here.
    .dropDuplicates(["user_id"])
    .select("user_id", "signup_date", "country", "location")
)

# Write to Silver layer (Delta format)
# Fact table: Delta output partitioned by event_date, written in overwrite mode.
silver_events.write.save(
    "/mnt/silver/fact_events",
    format="delta",
    mode="overwrite",
    partitionBy="event_date",
)

# Dimension tables: unpartitioned Delta output, written in overwrite mode.
silver_products.write.save(
    "/mnt/silver/dim_products",
    format="delta",
    mode="overwrite",
)

silver_users.write.save(
    "/mnt/silver/dim_users",
    format="delta",
    mode="overwrite",
)

# Create Delta tables
# The tables below live in the `silver` database; create it first so the
# CREATE TABLE statements do not fail on a workspace where it is missing.
spark.sql("CREATE DATABASE IF NOT EXISTS silver")

# Register each Silver Delta directory as a table of the same name.
# IF NOT EXISTS keeps re-runs from failing once a table is registered.
for _table in ("fact_events", "dim_products", "dim_users"):
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS silver.{_table}
        USING DELTA
        LOCATION '/mnt/silver/{_table}'
    """)