Bronze to Silver: Data Cleansing and Transformation

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType, DateType, TimestampType
import pyspark.sql.functions as F

# Catalog used as the first part of the three-level table names
# (catalog.schema.table) that this notebook reads from and writes to.
catalog_name = "ecommerce"

In [0]:
# Read the bronze order-items table and preview a handful of rows.
bronze_table = f"{catalog_name}.bronze.brz_order_items"
df = spark.table(bronze_table)
display(df.limit(5))

In [0]:
# Inspect the column names and types of the bronze DataFrame before cleansing.
df.printSchema()

### Transformations

In [0]:
# Drop duplicate order lines, treating (order_id, item_seq) as the row key.
# NOTE(review): which of the duplicate rows is kept is not deterministic.
df_silver = df.dropDuplicates(["order_id", "item_seq"])

# quantity: map the spelled-out value "Two" (any case, surrounding whitespace
# ignored) to "2", then cast to integer. The replacement is a string literal so
# both branches of the CASE expression share one type and the result does not
# depend on implicit int/string coercion rules.
df_silver = df_silver.withColumn(
    "quantity",
    F.when(F.lower(F.trim(F.col("quantity"))) == "two", F.lit("2"))
     .otherwise(F.col("quantity"))
     .cast("int"),
)

# unit_price: strip '$' and any other non-numeric characters (currency codes,
# thousands separators, spaces), keeping only digits, '.' and '-', then cast to
# double. This is the same character whitelist used for tax_amount.
# NOTE(review): exponent notation such as "1e3" would lose its 'e'; not
# expected for prices - confirm against the bronze data.
df_silver = df_silver.withColumn(
    "unit_price",
    F.regexp_replace("unit_price", r"[^0-9.\-]", "").cast("double"),
)

# discount_pct: remove the '%' sign and keep the numeric value as double.
df_silver = df_silver.withColumn(
    "discount_pct",
    F.regexp_replace("discount_pct", "%", "").cast("double"),
)

# coupon_code: normalise to trimmed lower case.
df_silver = df_silver.withColumn(
    "coupon_code",
    F.lower(F.trim(F.col("coupon_code"))),
)

# channel: expand the short codes to display names; any other value is kept
# unchanged.
df_silver = df_silver.withColumn(
    "channel",
    F.when(F.col("channel") == "web", "Website")
     .when(F.col("channel") == "app", "Mobile")
     .otherwise(F.col("channel")),
)

### Transformation: datatype conversions

In [0]:
# Cast the cleansed columns to their target types and stamp the processing
# time, as one chained expression (each step sees the previous step's result).
df_silver = (
    df_silver
    # dt: string -> date
    .withColumn("dt", F.to_date(F.col("dt"), "yyyy-MM-dd"))
    # order_ts: string -> timestamp; the first pattern that parses wins
    .withColumn(
        "order_ts",
        F.coalesce(
            F.to_timestamp(F.col("order_ts"), "yyyy-MM-dd HH:mm:ss"),  # e.g. 2025-08-02 22:35:13
            F.to_timestamp(F.col("order_ts"), "dd-MM-yyyy HH:mm"),     # e.g. 02-08-2025 22:35
        ),
    )
    # item_seq: string -> integer
    .withColumn("item_seq", F.col("item_seq").cast("int"))
    # tax_amount: keep only digits, '.' and '-', then string -> double
    .withColumn(
        "tax_amount",
        F.regexp_replace("tax_amount", r"[^0-9.\-]", "").cast("double"),
    )
    # Processing timestamp.
    # NOTE(review): the column name is spelled "proccessed_time"; kept as-is
    # because it becomes part of the written table's schema.
    .withColumn("proccessed_time", F.current_timestamp())
)

In [0]:
# Preview up to 10 rows of the transformed DataFrame.
display(df_silver.limit(10))

In [0]:
# Check the column types after the cleansing and casting steps.
df_silver.printSchema()

In [0]:
# Save the transformed DataFrame as the silver Delta table, overwriting any
# existing data; the mergeSchema option is passed through to the Delta writer.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_order_items")
)