# 01 - Bronze Data Ingestion

**Purpose:** Ingest raw market data from Yahoo Finance and Alpha Vantage APIs

**Author:** Jonah A.
**Created:** 2025-07-29

**Architecture Layer:** Bronze (Raw Data)

**Input:** Market APIs (Yahoo Finance, Alpha Vantage)  
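**Output:** `bronze_market_data_raw` Delta table (this notebook currently registers the temporary view `bronze_market_data` instead, as a Free Edition workaround)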

**Business Value:** Foundation for AI portfolio risk analytics

In [0]:
# Configuration setup for Bronze layer
import os
from datetime import datetime, timedelta

# Portfolio configuration: tickers grouped by tier, plus the benchmark.
portfolio_config = {
    "tier_1": ["NVDA", "MSFT", "GOOGL", "AMZN", "META", "AAPL"],
    "tier_2": ["AMD", "CRM", "ORCL"],
    "tier_3": ["PLTR", "AI", "SNOW", "MDB", "SMCI"],
    "benchmark": ["BOTZ"],
}

# Flatten every tier into a single list, keeping the tier order above.
all_symbols = [
    symbol
    for tier_symbols in portfolio_config.values()
    for symbol in tier_symbols
]

# Date configuration: a trailing 365-day window ending today.
end_date = datetime.now().date()
start_date = end_date - timedelta(days=365)

# API configuration
# SECURITY: the key is read from the ALPHA_VANTAGE_API_KEY environment
# variable instead of being stored in the notebook. Any key that was
# previously committed in plain text should be treated as leaked and rotated.
ALPHA_VANTAGE_API_KEY = os.environ.get("ALPHA_VANTAGE_API_KEY", "")
ALPHA_VANTAGE_BASE_URL = "https://www.alphavantage.co/query"

print("📥 Configuration Loaded:")
print(f"Total symbols: {len(all_symbols)}")
print(f"Date range: {start_date} to {end_date}")
# Report the real state of the key rather than an unconditional check mark.
if ALPHA_VANTAGE_API_KEY:
    print("API Key configured: ✅")
else:
    print("API Key configured: ❌ (set the ALPHA_VANTAGE_API_KEY environment variable)")

In [0]:
%pip install yfinance requests # Install and import required libraries

# Import libraries
import yfinance as yf
import requests
from pyspark.sql import functions as F
from pyspark.sql.types import *
import json

# Summarize the imported libraries and what each one is used for.
# A single print call with a newline separator emits the same four lines.
print(
    "📚 Libraries imported:",
    "✅ yfinance - for Yahoo Finance historical data",
    "✅ requests - for Alpha Vantage API calls",
    "✅ pyspark.sql - for Delta Lake operations",
    sep="\n",
)

In [0]:
# Test Yahoo Finance connection with one stock
test_symbol = "AAPL"
print(f"🧪 Testing Yahoo Finance with {test_symbol}...")

# The broad except is deliberate: this cell is a smoke test and should report
# any failure (network, parsing, missing data) instead of stopping the notebook.
try:
    # Download 5 days of data as test
    test_data = yf.download(test_symbol, period="5d")
    if test_data.empty:
        # An empty frame means no usable data came back; report that directly
        # instead of announcing success and then failing on the price lookup.
        raise ValueError(f"no data returned for {test_symbol}")
    print("✅ Yahoo Finance connection successful!")
    print(f"Downloaded {len(test_data)} days of data for {test_symbol}")

    # 'Close' is a DataFrame (one column per ticker) when the columns are
    # multi-level and a Series when they are flat; handle both layouts and
    # look the ticker up via test_symbol rather than a hard-coded name.
    close_prices = test_data['Close']
    if getattr(close_prices, "ndim", 1) > 1:
        close_prices = close_prices[test_symbol]
    latest_close = float(close_prices.iloc[-1])
    print(f"Latest close price: ${latest_close:.2f}")
except Exception as e:
    print(f"❌ Yahoo Finance test failed: {e}")

In [0]:
# Show the structure of the downloaded test data and its latest close price
print(f"📊 {test_symbol} Test Data Details:")
print(f"Number of trading days: {len(test_data)}")
print(f"Data columns: {list(test_data.columns)}")

# With multi-level columns, test_data['Close'] is a DataFrame and .iloc[-1]
# yields a one-element Series; calling float() on that is deprecated in
# recent pandas. Select the ticker column first so .iloc[-1] is a scalar.
close_prices = test_data['Close']
if getattr(close_prices, "ndim", 1) > 1:
    close_prices = close_prices[test_symbol]
latest_close = float(close_prices.iloc[-1])
print(f"Latest close price: ${latest_close:.2f}")

# Show first few rows of data
print(f"\n📈 Sample data:")
print(test_data.head(2))

In [0]:
# Define Bronze layer Delta table schema
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType

# Bronze layer schema: raw storage shared by the Yahoo Finance and
# Alpha Vantage sources. Every column is declared nullable.
bronze_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ingestion_timestamp", TimestampType()),  # when the row was ingested
        ("symbol", StringType()),                  # stock ticker (AAPL, NVDA, ...)
        ("date", StringType()),                    # trading date, kept as a string
        ("open", DoubleType()),                    # opening price
        ("high", DoubleType()),                    # daily high price
        ("low", DoubleType()),                     # daily low price
        ("close", DoubleType()),                   # closing price
        ("volume", LongType()),                    # trading volume
        ("data_source", StringType()),             # yahoo_finance | alpha_vantage
    )
])

# Temporary: dump the schema to eyeball its structure (to be removed later)
print("🏗️ Bronze schema structure:")
for field in bronze_schema:
    print(f"  {field.name}: {field.dataType}")

In [0]:
# Convert Yahoo Finance pandas DataFrame to Bronze format
from datetime import datetime
from pyspark.sql import Row

def convert_yahoo_data_to_bronze(pandas_df, symbol, data_source="yahoo_finance"):
    """Convert a Yahoo Finance pandas DataFrame into Bronze-layer rows.

    Args:
        pandas_df: Price history indexed by trading timestamp, with
            Open/High/Low/Close/Volume columns. Both multi-level
            ``(field, ticker)`` columns and flat ``field`` columns are
            accepted.
        symbol: Ticker the frame belongs to; also used as the second level
            of the column key when the columns are multi-level.
        data_source: Label stored in the ``data_source`` field of each row.

    Returns:
        A list with one ``Row`` per input row, in index order, all sharing a
        single ingestion timestamp. An empty frame yields an empty list.
        Missing (NaN) prices or volumes are stored as ``None`` instead of
        raising or propagating NaN.
    """
    import math  # local import: only needed for the NaN check below

    # One timestamp for the whole batch so every row of a load matches.
    ingestion_time = datetime.now()

    # Multi-level columns are keyed by (field, ticker); flat columns by field.
    has_ticker_level = pandas_df.columns.nlevels > 1

    def _value(row, field, cast):
        """Return the cell for *field* cast with *cast*, or None when NaN."""
        raw = row[(field, symbol)] if has_ticker_level else row[field]
        # int(NaN) raises ValueError, so missing cells must be caught first.
        return None if math.isnan(raw) else cast(raw)

    return [
        Row(
            ingestion_timestamp=ingestion_time,
            symbol=symbol,
            date=str(date_idx.date()),  # Convert timestamp to date string
            open=_value(row, 'Open', float),
            high=_value(row, 'High', float),
            low=_value(row, 'Low', float),
            close=_value(row, 'Close', float),
            volume=_value(row, 'Volume', int),
            data_source=data_source,
        )
        for date_idx, row in pandas_df.iterrows()
    ]



In [0]:
# Re-download test data and test Bronze conversion
test_symbol = "AAPL"
test_data = yf.download(test_symbol, period="5d")

# Convert to Bronze format, using the same symbol that was downloaded
bronze_rows = convert_yahoo_data_to_bronze(test_data, test_symbol)

print(f"✅ Converted {len(bronze_rows)} rows to Bronze format")
if bronze_rows:
    sample_row = bronze_rows[0]
    print("\n📊 Sample Bronze row:")
    print(f"Symbol: {sample_row.symbol}")
    print(f"Date: {sample_row.date}")
    print(f"Close: ${sample_row.close:.2f}")
    print(f"Data Source: {sample_row.data_source}")
else:
    # An empty download produces no rows; indexing [0] would raise IndexError.
    print(f"⚠️ No rows returned for {test_symbol}; nothing to sample")

In [0]:
# Build a PySpark DataFrame from the converted rows using the Bronze schema.
# Nothing is written to storage in this cell.
bronze_df = spark.createDataFrame(data=bronze_rows, schema=bronze_schema)

# Inspect the resulting column types, then preview the first three rows
bronze_df.printSchema()
bronze_df.show(n=3)

In [0]:
# Register the Bronze DataFrame as a temporary view so it can be queried with
# SQL (used in place of a persistent Delta table as a Free Edition workaround)
bronze_df.createOrReplaceTempView("bronze_market_data")

# Sanity check: count the rows through the view to confirm it is queryable
result = spark.sql("SELECT COUNT(*) as row_count FROM bronze_market_data")
row_count = result.first()["row_count"]

print(f"✅ Temporary view created with {row_count} rows")

# Preview a few columns from the view, ordered by trading date
preview_df = spark.sql("SELECT symbol, date, close, data_source FROM bronze_market_data ORDER BY date")
preview_df.show()