# 01 - Bronze Data Ingestion

**Purpose:** Ingest raw market data from Yahoo Finance and Alpha Vantage APIs

**Author:** Jonah A.
**Created:** 2025-07-29

**Architecture Layer:** Bronze (Raw Data)

**Input:** Market APIs (Yahoo Finance, Alpha Vantage)  
**Output:** `bronze_market_data_raw` Delta table

**Business Value:** Foundation for AI portfolio risk analytics

In [0]:
# =============================================================================
# CONFIGURATION
# PURPOSE: Configure API credentials and security settings
# DEPENDENCIES: None (foundational setup)
# =============================================================================

import os
from datetime import datetime, timedelta

# Load the Alpha Vantage API key without embedding any credential in source.
# NOTE(review): a literal key used to be committed on this line as the
# development fallback - treat that key as compromised and rotate it.
try:
    # For Databricks: Use secrets scope (preferred for production)
    ALPHA_VANTAGE_API_KEY = dbutils.secrets.get(scope="market-risk", key="alpha_vantage_api_key")
    print("✅ Using Databricks secrets")
except Exception as secrets_error:
    # `dbutils` is undefined outside Databricks (NameError) and the lookup can
    # fail when the scope/key is missing. Catch Exception rather than using a
    # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    ALPHA_VANTAGE_API_KEY = os.environ.get("ALPHA_VANTAGE_API_KEY", "")
    if ALPHA_VANTAGE_API_KEY:
        print("⚠️ Using ALPHA_VANTAGE_API_KEY from environment - set up Databricks secrets for production")
    else:
        # Stay best-effort: the key is left empty instead of aborting the cell.
        print(
            "⚠️ No Alpha Vantage API key available "
            f"({type(secrets_error).__name__}) - configure the 'market-risk' secrets scope "
            "or set the ALPHA_VANTAGE_API_KEY environment variable"
        )

# Endpoint shared by every Alpha Vantage query
ALPHA_VANTAGE_BASE_URL = "https://www.alphavantage.co/query"

# Portfolio universe, grouped by tier; the benchmark ETF is kept in its own group
portfolio_config = dict(
    tier_1=["NVDA", "MSFT", "GOOGL", "AMZN", "META", "AAPL"],
    tier_2=["AMD", "CRM", "ORCL"],
    tier_3=["PLTR", "AI", "SNOW", "MDB", "SMCI"],
    benchmark=["BOTZ"],
)

# One flat list of every ticker, in tier order, for downstream processing
all_symbols = [
    ticker
    for group in portfolio_config.values()
    for ticker in group
]

print(f"📊 Portfolio configured: {len(all_symbols)} symbols")

In [0]:
# =============================================================================
# LIBRARY IMPORTS
# PURPOSE: Install required packages and import necessary libraries
# DEPENDENCIES: None (foundational setup)
# =============================================================================

%pip install yfinance requests

# Import libraries
import yfinance as yf
import requests
from pyspark.sql import functions as F, Row
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType
import json

# Report which libraries this cell made available (one message per line)
print(
    "📚 Libraries imported:",
    "✅ yfinance - for Yahoo Finance historical data",
    "✅ requests - for Alpha Vantage API calls",
    "✅ pyspark.sql - for Delta Lake operations",
    sep="\n",
)

In [0]:
# =============================================================================
# DATE CONFIGURATION
# PURPOSE: Define historical data date range for 1+ year analysis
# DEPENDENCIES: datetime libraries from Cell 1
# =============================================================================

# Look-back window for risk analysis: the 365 calendar days ending today
end_date = datetime.now().date()
start_date = end_date - timedelta(days=365)

# ISO "YYYY-MM-DD" strings for APIs that take dates as text (e.g. yfinance)
start_date_str = start_date.isoformat()
end_date_str = end_date.isoformat()

print(
    "📅 Historical Data Range:",
    f"Start Date: {start_date}",
    f"End Date: {end_date}",
    f"Total Period: {(end_date - start_date).days} days",
    sep="\n",
)

print(f"API Format: {start_date_str} to {end_date_str}")

In [0]:
# =============================================================================
# BULK HISTORICAL DOWNLOAD
# PURPOSE: Download 1+ year of historical data for multiple stocks efficiently
# DEPENDENCIES: yfinance from Cell 2, date configuration from Cell 3, portfolio from Cell 1
# =============================================================================

# Historical data bulk download function
def download_bulk_historical_data(symbols, start_date, end_date):
    """Download historical data for multiple symbols with one yfinance call.

    Progress and failures are reported with ``print``; nothing is raised to
    the caller.

    Args:
        symbols: Ticker symbols to request (e.g. ``["AAPL", "NVDA"]``).
        start_date: Start of the range, as a ``date``/``datetime`` or an ISO
            ``"YYYY-MM-DD"`` string.
        end_date: End of the range, in the same formats as ``start_date``.

    Returns:
        The object returned by ``yf.download`` for the request, or ``None``
        if the download raised an exception.
    """
    from datetime import date  # local import keeps the function self-contained

    def _parse_if_text(value):
        """Turn an ISO date string into a ``date``; pass other values through."""
        return date.fromisoformat(value) if isinstance(value, str) else value

    print(f"📥 Starting bulk download:")
    print(f"Symbols: {len(symbols)} stocks")
    print(f"Period: {start_date} to {end_date}")

    # The estimate is informational only, so an unusual date format must never
    # abort the download (subtracting two strings used to raise TypeError here).
    try:
        span_days = (_parse_if_text(end_date) - _parse_if_text(start_date)).days
        print(f"Expected trading days: ~{span_days * 5/7:.0f}")
    except (TypeError, ValueError):
        print("Expected trading days: unknown (unrecognized date format)")

    # Use yfinance bulk download (more efficient than individual calls)
    try:
        historical_data = yf.download(
            symbols,
            start=start_date,
            end=end_date,
            group_by='ticker',
            progress=True
        )

        # Do not claim success for a result that contains no data; the frame
        # is still returned unchanged so existing callers keep working.
        if historical_data.empty:
            print("⚠️ Bulk download returned no rows - check symbols and date range")
        else:
            print(f"✅ Bulk download completed successfully")
        print(f"Data shape: {historical_data.shape}")
        return historical_data

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller test for None.
        print(f"❌ Bulk download failed: {e}")
        return None

# Smoke-test the downloader on a three-ticker subset before scaling to the full portfolio
test_symbols = ["AAPL", "NVDA", "MSFT"]
print(f"🧪 Testing bulk download with {len(test_symbols)} stocks for 1 year...")

# Fetch the historical frame; the downloader hands back None when it fails
historical_data = download_bulk_historical_data(
    symbols=test_symbols,
    start_date=start_date,
    end_date=end_date,
)

# Only inspect the result when the download produced something
if historical_data is not None:
    print(
        "📊 Sample data structure:",
        f"Columns: {historical_data.columns.names}",
        "Sample data for AAPL:",
        sep="\n",
    )
    print(historical_data['AAPL'].head(3))

In [0]:
# =============================================================================
# BRONZE SCHEMA DEFINITION
# PURPOSE: Define Delta Lake schema for Bronze layer immutable storage
# DEPENDENCIES: PySpark data types from Cell 2
# =============================================================================

# Bronze layer schema: immutable raw data storage.
# Shared by both data sources (Yahoo Finance and Alpha Vantage); every column is nullable.
_bronze_columns = [
    ("ingestion_timestamp", TimestampType()),  # When data was ingested
    ("symbol", StringType()),                  # Stock ticker (AAPL, NVDA, etc.)
    ("date", StringType()),                    # Trading date
    ("open", DoubleType()),                    # Opening price
    ("high", DoubleType()),                    # Daily high price
    ("low", DoubleType()),                     # Daily low price
    ("close", DoubleType()),                   # Closing price
    ("volume", LongType()),                    # Trading volume
    ("data_source", StringType()),             # yahoo_finance | alpha_vantage
]

bronze_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in _bronze_columns
    ]
)

# TODO: Remove before production - development feedback only
print("🏗️ Bronze schema defined with 9 fields")

In [0]:
# =============================================================================
# BRONZE DATA PROCESSING & TEMPORARY VIEW CREATION
# PURPOSE: Transform historical market data into Bronze layer format and create queryable view
# DEPENDENCIES: historical_data (Cell 4), bronze_schema (Cell 5), Row (Cell 2)
# =============================================================================

# Convert the downloaded historical frame into Bronze rows and expose them as
# a temporary view.
import math  # cell-local: only needed to detect NaN placeholders

# The download helper returns None on failure; stop with a clear message
# instead of failing later with an AttributeError on `None.shape`.
if historical_data is None:
    raise RuntimeError(
        "historical_data is None - the bulk download failed; re-run the download cell first"
    )

print("🔄 Processing full historical dataset...")
print(f"Historical data shape: {historical_data.shape}")


def _native_or_none(value, cast):
    """Return ``cast(value)``, or ``None`` when the value is missing (None/NaN).

    ``int(float('nan'))`` raises ValueError, so a single missing Volume used
    to abort the whole cell; missing values become NULLs in the nullable
    Bronze columns instead.
    """
    if value is None or math.isnan(value):
        return None
    return cast(value)


all_bronze_rows = []
ingestion_time = datetime.now()  # one timestamp shared by every row of this run

# Process exactly the symbols that were requested in the download cell
for symbol in test_symbols:
    print(f"Processing {symbol}...")

    # A multi-symbol download can contain rows that are entirely NaN for one
    # ticker (dates that only exist for other tickers); those carry no data.
    stock_data = historical_data[symbol].dropna(how="all")

    # Column-wise iteration avoids building a Series per row as iterrows does
    for date_idx, open_price, high_price, low_price, close_price, volume in zip(
        stock_data.index,
        stock_data["Open"],
        stock_data["High"],
        stock_data["Low"],
        stock_data["Close"],
        stock_data["Volume"],
    ):
        all_bronze_rows.append(
            Row(
                ingestion_timestamp=ingestion_time,
                symbol=symbol,
                date=str(date_idx.date()),
                open=_native_or_none(open_price, float),
                high=_native_or_none(high_price, float),
                low=_native_or_none(low_price, float),
                close=_native_or_none(close_price, float),
                volume=_native_or_none(volume, int),
                data_source="yahoo_finance",
            )
        )

    print(f"  ✅ {len(stock_data)} rows processed for {symbol}")

print(f"\n📊 Total Bronze rows: {len(all_bronze_rows)}")

# Create PySpark DataFrame and temporary view
bronze_df = spark.createDataFrame(all_bronze_rows, bronze_schema)
bronze_df.createOrReplaceTempView("bronze_market_data")

# Validation
row_count = bronze_df.count()
print(f"✅ Bronze temporary view created with {row_count} rows")

# Sample data verification
print("\n📈 Sample Bronze data:")
bronze_df.select("symbol", "date", "close", "data_source").orderBy("symbol", "date").show(6)