In [0]:
# ===================================================================
# NOTEBOOK: 03_Bronze_to_Silver
# PURPOSE: Transform raw Bronze data into clean, validated Silver layer
# AUTHOR: Jose Veliz - Space Cowboy 
# DATE: 2025-10-19
# ===================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Obtain the Spark session explicitly instead of relying on a pre-existing
# `spark` global. getOrCreate() returns the already-active session when one
# exists (as on Databricks) and creates one otherwise, so this cell also works
# when the notebook is run outside Databricks.
spark = SparkSession.builder.getOrCreate()

print("Libraries imported")
print("Space Cowboy's Silver Layer Transformation")
print("=" * 60)

In [0]:
# ===================================================================
# READ FROM BRONZE LAYER
# ===================================================================
# Loads the `bronze_stock_prices` table into `bronze_df` and prints its row
# count, column count, schema and the five rows with the latest `date`.

print("Reading data from Bronze layer...")

# Source table for every later cell in this notebook
bronze_df = spark.table("bronze_stock_prices")

print("\n BRONZE DATA LOADED:")
bronze_row_total = bronze_df.count()
print(f"   Total rows: {bronze_row_total:,}")
bronze_column_total = len(bronze_df.columns)
print(f"   Columns: {bronze_column_total}")

print("\n BRONZE SCHEMA:")
bronze_df.printSchema()

print("\n SAMPLE BRONZE DATA:")
# Newest rows first; show full cell contents rather than truncating them
newest_first = bronze_df.orderBy(col("date").desc())
newest_first.show(5, truncate=False)

print("\n" + "=" * 60)
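# -------------------------------------------------------------------
# Run this cell. You should see output similar to the following
# (the row and column counts depend on the contents of the Bronze table):
#
# Reading data from Bronze layer...
#
#  BRONZE DATA LOADED:
#    Total rows: 500
#    Columns: 9
#
# [Schema and sample data displayed]
# -------------------------------------------------------------------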

In [0]:
# ===================================================================
# DATA QUALITY CHECKS (Before Transformation)
# ===================================================================
# Read-only diagnostics on `bronze_df`; nothing is modified here. The checks
# report nulls, duplicate (symbol, date) keys, non-numeric prices, the date
# range and negative volumes.
#
# NOTE: `count`, `min`, `max` etc. below are the Spark SQL functions brought in
# by `from pyspark.sql.functions import *`, not the Python built-ins.

print(" Running data quality checks on Bronze data...")
print("=" * 60)

# Check 1: Null counts in critical columns
# count() skips NULLs, so counting a value that is only produced when the
# column IS NULL yields the number of null rows per column.
print("\n NULL VALUE CHECK:")
null_counts = bronze_df.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in ["symbol", "date", "close", "volume"]
])
null_counts.show()

# Check 2: Duplicate records
# A (symbol, date) pair that appears more than once counts as one duplicate key.
print(" DUPLICATE CHECK:")
duplicate_count = bronze_df.groupBy("symbol", "date").count().filter("count > 1").count()
print(f"   Duplicate records (symbol + date): {duplicate_count}")

# Check 3: Data type issues (non-numeric prices)
# With ANSI mode off, cast() does not raise on unparseable input; it silently
# returns NULL. Simply running the cast therefore proves nothing, so we count
# values that are present in Bronze but become NULL once cast to double.
print("\n DATA TYPE CHECK:")
print("   Checking if price columns are numeric...")
price_columns = ["open", "high", "low", "close"]
try:
    invalid_counts = bronze_df.select([
        count(when(col(c).isNotNull() & col(c).cast("double").isNull(), c)).alias(c)
        for c in price_columns
    ]).collect()[0].asDict()
    non_numeric = {c: n for c, n in invalid_counts.items() if n > 0}
    if non_numeric:
        print(f"   Data type issue found: non-numeric values per column {non_numeric}")
    else:
        print("   All price columns are numeric")
except Exception as e:
    # With ANSI mode on, an invalid cast raises instead of returning NULL.
    print(f"   Data type issue found: {e}")

# Check 4: Date range
print("\n DATE RANGE CHECK:")
date_stats = bronze_df.agg(
    min("date").alias("earliest_date"),
    max("date").alias("latest_date"),
    countDistinct("date").alias("unique_dates")
).collect()[0]
print(f"   Earliest date: {date_stats['earliest_date']}")
print(f"   Latest date: {date_stats['latest_date']}")
print(f"   Unique dates: {date_stats['unique_dates']}")

# Check 5: Volume validation (should be positive)
# Only strictly negative volumes are counted; zero volumes pass this check.
print("\n VOLUME VALIDATION:")
negative_volume = bronze_df.filter(col("volume").cast("long") < 0).count()
print(f"   Records with negative volume: {negative_volume}")

print("\n DATA QUALITY CHECKS COMPLETE!")
print("=" * 60)

In [0]:
# ===================================================================
# SILVER LAYER TRANSFORMATIONS
# ===================================================================
# Builds `silver_df` from `bronze_df`: de-duplicates on (symbol, date), drops
# rows whose critical values are missing or unparseable, converts column types,
# then adds derived metrics, date components and metadata columns.
#
# NOTE: `round`, `year`, `month` etc. below are the Spark SQL functions brought
# in by `from pyspark.sql.functions import *`, not the Python built-ins.

print(" Applying Silver layer transformations...")
print("=" * 60)

# Transformation 1: Remove duplicates
print("\n Removing duplicates based on symbol + date...")
bronze_count = bronze_df.count()
silver_df = bronze_df.dropDuplicates(["symbol", "date"])
# Each count() is a full Spark job, so compute it once and reuse the value.
initial_count = silver_df.count()
rows_removed = bronze_count - initial_count
print(f"   Rows removed: {rows_removed}")
print(f"   Rows remaining: {initial_count:,}")

# Transformation 2: Filter out nulls in critical columns
# The test is applied to the *converted* values: with ANSI mode off, a value
# that cannot be parsed becomes NULL when cast in Transformation 3, and would
# otherwise slip through this filter and still be flagged as VALID.
print("\n Filtering out records with null critical values...")
silver_df = silver_df.filter(
    col("symbol").isNotNull() &
    to_date(col("date")).isNotNull() &
    col("close").cast("double").isNotNull() &
    col("volume").cast("long").isNotNull()
)
remaining_count = silver_df.count()
rows_filtered = initial_count - remaining_count
print(f"   Rows filtered: {rows_filtered}")
print(f"   Rows remaining: {remaining_count:,}")

# Transformation 3: Convert data types
print("\n Converting data types to proper formats...")
silver_df = silver_df \
    .withColumn("date", to_date(col("date"))) \
    .withColumn("open", col("open").cast("double")) \
    .withColumn("high", col("high").cast("double")) \
    .withColumn("low", col("low").cast("double")) \
    .withColumn("close", col("close").cast("double")) \
    .withColumn("volume", col("volume").cast("long"))
print("   Data types converted")

# Transformation 4: Add derived columns (Business Logic!)
# Percentages are relative to the opening price. The when() guard leaves the
# percentage NULL when `open` is zero or NULL, which matches the non-ANSI
# result of the plain division and avoids a divide-by-zero error under ANSI mode.
print("\n Adding derived/calculated columns...")
open_is_usable = col("open") != 0
silver_df = silver_df \
    .withColumn("daily_change", col("close") - col("open")) \
    .withColumn("daily_change_pct", 
                when(open_is_usable,
                     round(((col("close") - col("open")) / col("open")) * 100, 2))) \
    .withColumn("price_range", col("high") - col("low")) \
    .withColumn("price_range_pct", 
                when(open_is_usable,
                     round(((col("high") - col("low")) / col("open")) * 100, 2)))

print("   Added: daily_change (close - open)")
print("   Added: daily_change_pct (% change from open to close)")
print("   Added: price_range (high - low)")
print("   Added: price_range_pct (% range based on open)")

# Transformation 5: Add date components (for partitioning/analysis)
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
print("\n Adding date components...")
silver_df = silver_df \
    .withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("quarter", quarter(col("date"))) \
    .withColumn("day_of_week", dayofweek(col("date"))) \
    .withColumn("week_of_year", weekofyear(col("date")))

print("   Added: year, month, quarter, day_of_week, week_of_year")

# Transformation 6: Add processing timestamp
# Every row that survived the filters above is labelled VALID.
print("\n Adding metadata columns...")
silver_df = silver_df \
    .withColumn("silver_processing_timestamp", current_timestamp()) \
    .withColumn("data_quality_flag", lit("VALID"))

print("   Added: silver_processing_timestamp")
print("   Added: data_quality_flag")

print("\n ALL TRANSFORMATIONS COMPLETE!")
print("=" * 60)

# Expected output (step headers only; the per-step row counts and the
# "Added: ..." detail lines are omitted here and vary with the data):
#
#  Applying Silver layer transformations...
# ============================================================
#  Removing duplicates based on symbol + date...
#  Filtering out records with null critical values...
#  Converting data types to proper formats...
#  Adding derived/calculated columns...
#  Adding date components...
#  Adding metadata columns...
#  ALL TRANSFORMATIONS COMPLETE!

In [0]:
# ===================================================================
# REVIEW SILVER LAYER DATA
# ===================================================================
# Read-only inspection of `silver_df`: schema, headline counts, the ten rows
# with the latest `date`, and per-symbol daily-change statistics.
# `min`, `max`, `round`, `avg` and `count` are the Spark SQL functions.

print(" Reviewing transformed Silver data...")
print("=" * 60)

print("\n SILVER SCHEMA (After Transformations):")
silver_df.printSchema()

print("\n SILVER DATA SUMMARY:")
silver_row_total = silver_df.count()
print(f"   Total rows: {silver_row_total:,}")
silver_column_total = len(silver_df.columns)
print(f"   Total columns: {silver_column_total}")
symbol_total = silver_df.select("symbol").distinct().count()
print(f"   Symbols: {symbol_total}")
# Printed as the raw Row holding min(date) and max(date)
date_bounds = silver_df.agg(min("date"), max("date")).collect()[0]
print(f"   Date range: {date_bounds}")

print("\n SAMPLE SILVER DATA (Most Recent):")
preview_columns = [
    "symbol", "date", "open", "close",
    "daily_change", "daily_change_pct",
    "price_range", "volume",
]
recent_preview = silver_df.select(*preview_columns).orderBy(col("date").desc())
recent_preview.show(10)

print("\n STATISTICS (Daily Change % by Symbol):")
per_symbol_stats = silver_df.groupBy("symbol").agg(
    round(avg("daily_change_pct"), 2).alias("avg_daily_change"),
    round(min("daily_change_pct"), 2).alias("biggest_loss"),
    round(max("daily_change_pct"), 2).alias("biggest_gain"),
    count("*").alias("trading_days"),
)
per_symbol_stats.orderBy("symbol").show()

print("\n SILVER DATA LOOKS GOOD!")
print("=" * 60)

In [0]:
# ===================================================================
# SAVE TO SILVER LAYER (MANAGED TABLE)
# ===================================================================
# Projects `silver_df` onto the final column order, exposes it as a temporary
# view, and (re)creates the Delta table `silver_stock_prices` from that view,
# partitioned by year and month.

print(" Saving data to Silver layer...")
print("=" * 60)

# Final column order, grouped by role
identifier_columns = [
    "symbol", "date", "year", "month",
    "quarter", "week_of_year", "day_of_week",
]
ohlcv_columns = ["open", "high", "low", "close", "volume"]
derived_columns = [
    "daily_change", "daily_change_pct",
    "price_range", "price_range_pct",
]
metadata_columns = [
    "ingestion_timestamp", "silver_processing_timestamp",
    "data_quality_flag", "source",
]

silver_final = silver_df.select(
    *identifier_columns,
    *ohlcv_columns,
    *derived_columns,
    *metadata_columns,
)

# The CREATE TABLE ... AS SELECT below reads from this temporary view
silver_final.createOrReplaceTempView("silver_stock_prices_temp")

# CREATE OR REPLACE overwrites any existing table of the same name
create_silver_table_sql = """
    CREATE OR REPLACE TABLE silver_stock_prices
    USING DELTA
    PARTITIONED BY (year, month)
    AS SELECT * FROM silver_stock_prices_temp
"""
spark.sql(create_silver_table_sql)

print("\n DATA WRITTEN TO SILVER LAYER!")
print("   Table name: silver_stock_prices")
print("   Format: Delta Lake (managed)")
print("   Partitions: year, month")
written_row_total = silver_final.count()
print(f"   Total rows: {written_row_total:,}")
written_column_total = len(silver_final.columns)
print(f"   Total columns: {written_column_total}")

print("\n" + "=" * 60)

# Expected output (the row count depends on the contents of the Bronze table):
#
#  Saving data to Silver layer...
# ============================================================
#
#  DATA WRITTEN TO SILVER LAYER!
#    Table name: silver_stock_prices
#    Format: Delta Lake (managed)
#    Partitions: year, month
#    Total rows: 500
#    Total columns: 20

In [0]:
# ===================================================================
# VERIFY SILVER LAYER
# ===================================================================
# Reads the `silver_stock_prices` table back and prints summary counts, the
# five largest single-day gains and losses, and per-symbol averages.
# `min`, `max`, `round`, `avg` and `count` are the Spark SQL functions brought
# in by `from pyspark.sql.functions import *`, not the Python built-ins.

print(" Verifying Silver layer...")
print("=" * 60)

# Read back from Silver table
silver_verify = spark.table("silver_stock_prices")

print(f"\n SILVER LAYER VERIFICATION:")
print(f"   Row count: {silver_verify.count():,}")
print(f"   Unique symbols: {silver_verify.select('symbol').distinct().count()}")
print(f"   Unique dates: {silver_verify.select('date').distinct().count()}")
print(f"   Date range: {silver_verify.agg(min('date'), max('date')).collect()[0]}")

# Descending order already places NULL percentages last, so they cannot
# appear among the top gainers.
print(f"\n TOP GAINERS (Single Day):")
silver_verify.select(
    "symbol", "date", "open", "close", "daily_change_pct"
).orderBy(desc("daily_change_pct")).limit(5).show()

# Ascending order places NULLs FIRST by default, which would list rows with an
# undefined percentage (NULL or zero `open`) as the biggest losers. Push NULLs
# to the end so only real losses are reported.
print(f"\n BIGGEST LOSERS (Single Day):")
silver_verify.select(
    "symbol", "date", "open", "close", "daily_change_pct"
).orderBy(col("daily_change_pct").asc_nulls_last()).limit(5).show()

print(f"\n AVERAGE DAILY CHANGE BY SYMBOL:")
silver_verify.groupBy("symbol").agg(
    round(avg("daily_change_pct"), 2).alias("avg_daily_change_pct"),
    round(avg("price_range_pct"), 2).alias("avg_volatility_pct"),
    count("*").alias("trading_days")
).orderBy("symbol").show()

print("\n SILVER LAYER COMPLETE!")
print("=" * 60)
print("\n Ready for Gold layer! ")
