In [0]:
# ===================================================================
# NOTEBOOK: 02_Ingest_to_Bronze
# PURPOSE: Fetch stock data from Alpha Vantage API and save to Bronze layer
# AUTHOR: Jose Veliz
# DATE: 2025-10-17
# ===================================================================

import json
import os
import time
from datetime import datetime

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session. getOrCreate() hands back the session that is
# already running (as in a Databricks notebook) and only builds a new one when
# none exists, so this cell no longer raises NameError in environments where
# `spark` is not pre-defined.
spark = SparkSession.builder.getOrCreate()

print("✅ Libraries imported successfully")

In [0]:
# ===================================================================
# CONFIGURATION
# ===================================================================

# Alpha Vantage API key.
# Read from the ALPHAVANTAGE_API_KEY environment variable so the secret never
# lives in the notebook source or its revision history.
# NOTE(review): on Databricks this can presumably be supplied as a cluster
# environment variable backed by a secret scope — confirm for your workspace.
# NOTE(review): a key was previously hardcoded on this line; treat it as
# exposed and rotate it.
API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY", "")

# API endpoint
BASE_URL = "https://www.alphavantage.co/query"

# Stocks to track (start with 5)
SYMBOLS = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

# Bronze layer path (where we'll save raw data)
BRONZE_PATH = "/FileStore/bronze_stock_prices"

print(f"✅ Configuration set")
print(f"   Tracking {len(SYMBOLS)} stocks: {', '.join(SYMBOLS)}")
print(f"   Bronze path: {BRONZE_PATH}")
if not API_KEY:
    # Surface the misconfiguration here rather than as five separate API
    # errors later in the notebook.
    print("⚠️  ALPHAVANTAGE_API_KEY is not set - API requests will fail until it is configured")

In [0]:
# ===================================================================
# FUNCTION: Fetch stock data from API
# ===================================================================

def fetch_stock_data(symbol, timeout=30):
    """
    Fetch daily stock prices for a given symbol from the Alpha Vantage API.

    Args:
        symbol (str): Stock symbol (e.g., 'AAPL')
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        list | None: One dict per entry of the response's "Time Series (Daily)"
        object, with keys symbol, date, open, high, low, close, volume,
        ingestion_timestamp and source. Price and volume values are passed
        through exactly as the API returned them (no type conversion).
        Returns None, after printing the reason, when the request fails or the
        response carries no "Time Series (Daily)" payload.
    """
    params = {
        "function": "TIME_SERIES_DAILY",
        "symbol": symbol,
        "apikey": API_KEY,
        "outputsize": "compact"  # Last 100 days
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(BASE_URL, params=params, timeout=timeout)
        # Turn HTTP 4xx/5xx into an exception instead of parsing an error page.
        response.raise_for_status()
        data = response.json()

        # The API reports problems inside the JSON body rather than via the
        # HTTP status, so a missing time series means the call was rejected.
        if "Time Series (Daily)" not in data:
            # NOTE(review): "Information" is believed to be the key used for
            # some rate-limit / plan messages — confirm against the API docs.
            error_msg = (
                data.get("Note")
                or data.get("Information")
                or data.get("Error Message")
                or "Unknown error"
            )
            print(f"❌ Error fetching {symbol}: {error_msg}")
            return None

        # One timestamp per call, so every record of a batch shares the same
        # ingestion marker.
        ingested_at = datetime.now().isoformat()

        # Flatten {date: {field: value}} into one record per date.
        records = [
            {
                "symbol": symbol,
                "date": date_str,
                "open": values["1. open"],
                "high": values["2. high"],
                "low": values["3. low"],
                "close": values["4. close"],
                "volume": values["5. volume"],
                "ingestion_timestamp": ingested_at,
                "source": "alphavantage"
            }
            for date_str, values in data["Time Series (Daily)"].items()
        ]

        print(f"✅ Fetched {len(records)} records for {symbol}")
        return records

    except Exception as e:
        # Deliberately broad: one failing symbol must not abort the whole run;
        # the caller treats None as "skip this symbol".
        message = str(e)
        # requests error messages can embed the request URL, which contains the
        # API key as a query parameter — mask it before it reaches the output.
        if API_KEY:
            message = message.replace(API_KEY, "***")
        print(f"❌ Exception fetching {symbol}: {message}")
        return None

print("✅ Function defined: fetch_stock_data()")

In [0]:
# ===================================================================
# FETCH DATA FOR ALL SYMBOLS
# ===================================================================

# Pause between consecutive API calls, in seconds.
# Be nice to the API (5 calls/minute limit).
RATE_LIMIT_PAUSE_SECONDS = 15

all_records = []
failed_symbols = []  # symbols for which fetch_stock_data() returned nothing

print(f"\n🔄 Starting data fetch for {len(SYMBOLS)} symbols...")
print("=" * 60)

for i, symbol in enumerate(SYMBOLS, 1):
    print(f"\n[{i}/{len(SYMBOLS)}] Fetching {symbol}...")

    records = fetch_stock_data(symbol)

    if records:
        all_records.extend(records)
        print(f"    ✅ Success! Total records so far: {len(all_records)}")
    else:
        # fetch_stock_data() already printed the reason; remember the symbol
        # so the summary below does not claim it was ingested.
        failed_symbols.append(symbol)

    if i < len(SYMBOLS):  # Don't wait after last symbol
        print(f"    ⏳ Waiting {RATE_LIMIT_PAUSE_SECONDS} seconds (API rate limit)...")
        time.sleep(RATE_LIMIT_PAUSE_SECONDS)

print("=" * 60)
print(f"\n✅ DATA FETCH COMPLETE!")
print(f"   Total records fetched: {len(all_records)}")
# Report the number of symbols that actually returned data, not the number requested.
print(f"   From {len(SYMBOLS) - len(failed_symbols)} stocks")
if failed_symbols:
    print(f"   ⚠️  No data for: {', '.join(failed_symbols)}")
# Run this cell. You should see (abridged):
#
#   [1/5] Fetching AAPL...
#       ✅ Success! Total records so far: 100
#       ⏳ Waiting 15 seconds (API rate limit)...
#
#   [2/5] Fetching MSFT...
#       ✅ Success! Total records so far: 200
#       ⏳ Waiting 15 seconds (API rate limit)...
#
#   ... (continues for all 5 stocks)
#
#   ✅ DATA FETCH COMPLETE!
#      Total records fetched: 500
#      From 5 stocks

In [0]:
# ===================================================================
# CONVERT TO SPARK DATAFRAME
# ===================================================================

# Stop here with an actionable message when every fetch failed; otherwise
# createDataFrame() is handed an empty list and has nothing to infer a schema from.
if not all_records:
    raise ValueError(
        "No records were fetched - check the API key and rate limit, "
        "then re-run the fetch cell"
    )

# Create Spark DataFrame from records (column types are inferred from the dicts)
df = spark.createDataFrame(all_records)

print("✅ Spark DataFrame created")
print(f"\n📊 DATAFRAME SUMMARY:")
print(f"   Rows: {df.count()}")
print(f"   Columns: {len(df.columns)}")

print(f"\n📋 SCHEMA:")
df.printSchema()

print(f"\n👀 SAMPLE DATA (first 10 rows):")
df.show(10, truncate=False)

In [0]:
# ===================================================================
# SAVE TO BRONZE LAYER (AS MANAGED TABLE)
# ===================================================================

print("💾 Writing data to Bronze layer (as managed table)...")

# Register the DataFrame under a temporary name so the SQL statement below can
# select from it.
df.createOrReplaceTempView("bronze_stock_prices_temp")

# CREATE OR REPLACE means each run swaps the table contents for this batch
# rather than appending to them.
# NOTE(review): BRONZE_PATH from the configuration cell is not used here; no
# location is given in the statement.
# NOTE(review): partitioning by date yields one partition per distinct date —
# confirm this is intended for a dataset of this size.
create_bronze_table_sql = """
    CREATE OR REPLACE TABLE bronze_stock_prices
    USING DELTA
    PARTITIONED BY (date)
    AS SELECT * FROM bronze_stock_prices_temp
"""
spark.sql(create_bronze_table_sql)

# Row count is taken from the in-memory DataFrame, not read back from the table.
written_rows = df.count()

print("\n✅ DATA WRITTEN TO BRONZE LAYER!")
print(f"   Total rows: {written_rows}")
print(f"   Format: Delta Table (managed)")
print(f"   Table name: bronze_stock_prices")

In [0]:
# ===================================================================
# VERIFY BRONZE LAYER (READ FROM TABLE)
# ===================================================================

print("🔍 Verifying Bronze layer...")

# Load the table that the write step produced.
bronze_df = spark.table("bronze_stock_prices")

# Gather the verification metrics up front.
# `min` and `max` here are the Spark column aggregates brought in by the
# `from pyspark.sql.functions import *` line, not Python's builtins.
total_rows = bronze_df.count()
symbol_count = bronze_df.select('symbol').distinct().count()
date_bounds = bronze_df.agg(min('date'), max('date')).collect()[0]

print(f"\n✅ BRONZE LAYER VERIFICATION:")
print(f"   Row count: {total_rows}")
print(f"   Unique symbols: {symbol_count}")
print(f"   Date range: {date_bounds}")

print(f"\n👀 SAMPLE OF BRONZE DATA:")
# Most recent dates first.
bronze_df.orderBy("date", ascending=False).show(10)

print("\n🎉 BRONZE LAYER COMPLETE!")
# -------------------------------------------------------------------
# 🎯 WHAT TO DO NOW
# -------------------------------------------------------------------
#
# Step 1: Update Cell 6 (2 minutes)
#
# 1. Scroll to Cell 6 in your notebook
# 2. Delete everything in that cell
# 3. Copy-paste the new Cell 6 code (from above - the one that creates a TABLE)
# 4. Click Run
#
# You should see:
#
#   💾 Writing data to Bronze layer (as managed table)...
#
#   ✅ DATA WRITTEN TO BRONZE LAYER!
#      Total rows: 500
#      Format: Delta Table (managed)
#      Table name: bronze_stock_prices

# -------------------------------------------------------------------
# Step 2: Update Cell 7 (2 minutes)
#
# 1. Scroll to Cell 7
# 2. Delete everything in that cell
# 3. Copy-paste the new Cell 7 code (from above - the one that reads from TABLE)
# 4. Click Run
#
# You should see:
#
#   🔍 Verifying Bronze layer...
#
#   ✅ BRONZE LAYER VERIFICATION:
#      Row count: 500
#      Unique symbols: 5
#      Date range: Row(min(date)='2025-07-10', max(date)='2025-10-17')
#
#   👀 SAMPLE OF BRONZE DATA:
#   [Table showing your stock data]
#
#   🎉 BRONZE LAYER COMPLETE!