In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Load the raw (bronze) stock table; every later cell derives from `bronze_df`.
bronze_df = spark.read.table("stock_market.bronze_stock_data")

# Quick sanity check: row count first, then the column layout.
print(f"Bronze records: {bronze_df.count()}")
bronze_df.printSchema()

In [0]:
# Silver transformations
#
# Derives the silver-layer columns from `bronze_df`:
#   date                 - `date` passed through to_date()
#   daily_change         - close - open
#   daily_change_pct     - (close - open) / open * 100, rounded to 2 decimals
#   processing_timestamp - current_timestamp() for this run
#
# NOTE: `round` below is pyspark.sql.functions.round (pulled in by the star
# import at the top of the notebook), not the Python builtin.
price_delta = col("close") - col("open")

silver_df = (
    bronze_df
    .withColumn("date", to_date(col("date")))
    .withColumn("daily_change", price_delta)
    .withColumn(
        "daily_change_pct",
        # Guard the division: with ANSI SQL mode enabled a zero `open` raises a
        # divide-by-zero error. `when` without `otherwise` yields NULL for rows
        # where `open` is 0 or NULL, which matches the non-ANSI result.
        when(col("open") != 0, round(price_delta / col("open") * 100, 2)),
    )
    .withColumn("processing_timestamp", current_timestamp())
)

# Show the transformed data (five most recent dates)
silver_df.orderBy(col("date").desc()).show(5)

In [0]:
# Data quality gate: keep a row only when it has a date and both `close` and
# `volume` are strictly positive. Rows where either comparison evaluates to
# NULL are dropped as well, because a filter keeps only rows that are true.
clean_df = silver_df.where(
    col("date").isNotNull()
    & (col("close") > 0)
    & (col("volume") > 0)
)

# Collapse repeated (symbol, date) pairs so that ingesting the same data twice
# does not produce duplicate rows.
dedup_df = clean_df.dropDuplicates(["symbol", "date"])

# Report how many rows survived compared with the bronze input.
print(f"Records after quality checks: {dedup_df.count()} (started with {bronze_df.count()})")

dedup_df.show(3)

In [0]:
# Persist the cleaned, de-duplicated frame as the silver Delta table.
# mode("overwrite") replaces the table contents on each run.
(
    dedup_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("stock_market.silver_stock_data")
)

print(f"Successfully wrote {dedup_df.count()} records to silver layer")

# Read the table back and preview the newest rows to confirm the write.
silver_table = spark.table("stock_market.silver_stock_data")
print(f"\nSilver table now has {silver_table.count()} records")
(
    silver_table
    .orderBy(col("date").desc())
    .select("date", "symbol", "close", "daily_change", "daily_change_pct")
    .show(5)
)