# 02 - Silver Data Processing

**Purpose:** Transform and clean Bronze layer data into a standardized format

**Author:** Jonah A.  
**Created:** 2025-07-30

**Architecture Layer:** Silver (Clean Data)

**Input:** Bronze layer temporary view `bronze_market_data`  
**Output:** `silver_daily_prices` view with validated, cleaned data, plus `silver_daily_prices_enhanced` view with daily return, price change and 3-day volume average

**Business Value:** Quality-assured market data ready for risk analytics

In [0]:
%pip install yfinance

import yfinance as yf
from datetime import datetime
from pyspark.sql import Row, functions as F
from pyspark.sql.types import *

# Download AAPL test data
import math

SYMBOL = "AAPL"
test_data = yf.download(SYMBOL, period="5d")

# yfinance is installed unpinned: depending on the release, a single-ticker
# download has either flat columns ("Open", ...) or (field, ticker) MultiIndex
# columns. Drop the ticker level when present so both layouts work below.
if test_data.columns.nlevels > 1:
    test_data = test_data.droplevel(1, axis=1)


def _clean_number(value, cast):
    """Return ``cast(value)``, or None when *value* is NaN.

    Storing None makes Spark record a NULL, so the Silver null checks can see
    the gap; it also avoids ``int(nan)`` raising ValueError for volume.
    """
    number = float(value)
    return None if math.isnan(number) else cast(number)


# Convert to Bronze format (one Row per trading day, same ingestion timestamp)
ingestion_time = datetime.now()

bronze_rows = [
    Row(
        ingestion_timestamp=ingestion_time,
        symbol=SYMBOL,
        date=str(date_idx.date()),
        open=_clean_number(row["Open"], float),
        high=_clean_number(row["High"], float),
        low=_clean_number(row["Low"], float),
        close=_clean_number(row["Close"], float),
        volume=_clean_number(row["Volume"], int),
        data_source="yahoo_finance",
    )
    for date_idx, row in test_data.iterrows()
]

# Create Bronze DataFrame and temp view
# All fields are nullable so missing market values can be stored as NULL.
bronze_schema = StructType([
    StructField("ingestion_timestamp", TimestampType(), True),
    StructField("symbol", StringType(), True),
    StructField("date", StringType(), True),
    StructField("open", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("close", DoubleType(), True),
    StructField("volume", LongType(), True),
    StructField("data_source", StringType(), True)
])

bronze_df = spark.createDataFrame(bronze_rows, bronze_schema)
bronze_df.createOrReplaceTempView("bronze_market_data")

print(f"✅ Bronze data ready for Silver processing: {bronze_df.count()} rows")

In [0]:
# Silver layer data quality checks
print("🔍 Silver Layer Data Quality Assessment:")

# Check for missing values.
# NULL and NaN are different in Spark: isNull() does not match NaN, so for
# floating-point columns NaN is counted as missing as well.
float_columns = {name for name, dtype in bronze_df.dtypes if dtype in ("double", "float")}


def _is_missing(column_name):
    """Return a boolean Column that is true where *column_name* is NULL (or NaN for floats)."""
    missing = F.col(column_name).isNull()
    if column_name in float_columns:
        missing = missing | F.isnan(F.col(column_name))
    return missing


bronze_df.select([F.count(F.when(_is_missing(c), c)).alias(c) for c in bronze_df.columns]).show()

# Check data types and ranges
print("\n📊 Price validation:")
price_stats = bronze_df.select(
    F.min("close").alias("min_close"),
    F.max("close").alias("max_close"),
    F.avg("close").alias("avg_close")
).collect()[0]

# The aggregates are None when there are no non-null close prices (e.g. an
# empty download); formatting None with :.2f would raise TypeError.
if price_stats["min_close"] is None:
    print("Close price range: n/a (no close prices available)")
else:
    print(f"Close price range: ${price_stats['min_close']:.2f} - ${price_stats['max_close']:.2f}")
    print(f"Average close: ${price_stats['avg_close']:.2f}")

# Check for duplicate dates (more than one record per symbol and date)
duplicate_check = bronze_df.groupBy("symbol", "date").count().filter("count > 1")
duplicate_count = duplicate_check.count()
print(f"\n🔄 Duplicate records: {duplicate_count}")

In [0]:
# Project Bronze rows into the Silver schema
# Each OHLC field is rounded to 2 decimals and renamed to "<field>_price".
rounded_prices = [
    F.round(F.col(field), 2).alias(f"{field}_price")
    for field in ("open", "high", "low", "close")
]

silver_columns = [
    F.col("symbol"),
    F.to_date(F.col("date")).alias("trading_date"),   # string -> DateType
    *rounded_prices,
    F.col("volume").alias("trading_volume"),
    F.col("data_source"),
    F.col("ingestion_timestamp").alias("processed_timestamp"),
]

silver_df = bronze_df.select(*silver_columns).sort("trading_date")

# Register the result as the Silver temporary view
silver_df.createOrReplaceTempView("silver_daily_prices")

silver_row_count = silver_df.count()
print(f"✅ Silver layer created: {silver_row_count} rows")
print("\n📊 Silver data sample:")
silver_df.show()

In [0]:
# Derive per-symbol business metrics on top of the Silver data
from pyspark.sql.window import Window

# Rows of one symbol in chronological order
window_spec = Window.partitionBy("symbol").orderBy("trading_date")
# Same partitioning/ordering, framed to the current row and the two before it
trailing_3_rows = window_spec.rowsBetween(-2, 0)

# Previous row's close within the symbol (NULL on the first row)
prev_close = F.lag("close_price").over(window_spec)
close_delta = F.col("close_price") - prev_close

# Percentage change versus the previous close, 4 decimals
daily_return_pct = F.round(close_delta / prev_close * 100, 4)
# Absolute change versus the previous close, 2 decimals
price_change_dollar = F.round(close_delta, 2)
# Average volume over up to 3 rows ending at the current one, rounded to whole units
volume_ma_3d = F.round(F.avg("trading_volume").over(trailing_3_rows), 0)

silver_enhanced_df = (
    silver_df
    .withColumn("daily_return_pct", daily_return_pct)
    .withColumn("price_change_dollar", price_change_dollar)
    .withColumn("volume_ma_3d", volume_ma_3d)
)

# Register the enhanced Silver view
silver_enhanced_df.createOrReplaceTempView("silver_daily_prices_enhanced")

print("✅ Silver layer enhanced with business calculations")
print("\n📈 Enhanced Silver data:")
display_columns = [
    "symbol",
    "trading_date",
    "close_price",
    "daily_return_pct",
    "price_change_dollar",
    "volume_ma_3d",
]
silver_enhanced_df.select(*display_columns).show()