<a href="https://colab.research.google.com/github/hck717/side-project-quant-pipeline/blob/main/colab_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- PARAMETERS ---
KAFKA_BROKER = None  # Papermill injects this

# --- CONFIG ---
crypto_tickers = ["BTC-USD", "ETH-USD", "SOL-USD", "ADA-USD", "XRP-USD"]
equity_tickers = ["AAPL", "MSFT", "AMZN", "TSLA", "NVDA", "JPM", "XOM", "META", "GOOGL", "NFLX"]
treasury_tickers = ["^TNX", "^IRX", "^FVX", "^TYX"]

crypto_topic = "crypto_ticks"
equity_topic = "equities_ticks"
bonds_topic = "bonds_data"

# --- INSTALL DEPENDENCIES ---
!pip install --quiet yfinance confluent-kafka pandas

import yfinance as yf
import json
from confluent_kafka import Producer
from datetime import datetime, timezone
import pandas as pd

# --- BROKER SETUP ---
if not KAFKA_BROKER:
    KAFKA_BROKER = "localhost:9092"

producer = Producer({'bootstrap.servers': KAFKA_BROKER})

def produce_latest(symbols, topic, interval="1m"):
    print(f"\n=== Fetching {topic} ===")
    df = yf.download(symbols, period="1d", interval=interval, group_by='ticker', threads=True)
    for sym in symbols:
        try:
            sym_df = df[sym].reset_index()
        except (KeyError, AttributeError):
            sym_df = df.reset_index()
        # Drop rows with missing datetime or OHLCV
        sym_df = sym_df.dropna(subset=["Datetime", "Open", "High", "Low", "Close", "Volume"])
        if sym_df.empty:
            print(f"No valid data for {sym}")
            continue
        latest_ts = sym_df["Datetime"].max()
        latest_rows = sym_df[sym_df["Datetime"] == latest_ts]
        for _, row in latest_rows.iterrows():
            msg = {
                "symbol": sym,
                "timestamp": row["Datetime"].isoformat(),
                "open": float(row["Open"]),
                "high": float(row["High"]),
                "low": float(row["Low"]),
                "close": float(row["Close"]),
                "volume": float(row["Volume"]),
                "ingested_at": datetime.now(timezone.utc).isoformat()
            }
            print(f"[SCRAPE DEBUG] {topic} {msg}")
            producer.produce(topic, json.dumps(msg).encode('utf-8'))
    producer.flush()

# --- RUN SCRAPES ---
produce_latest(crypto_tickers, crypto_topic, interval="1m")
produce_latest(equity_tickers, equity_topic, interval="1m")
produce_latest(treasury_tickers, bonds_topic, interval="1d")

print("\n✅ All data produced to Kafka topics.")




[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/3.9 MB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/3.9 MB[0m [31m19.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m2.6/3.9 MB[0m [31m25.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.9/3.9 MB[0m [31m28.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h
=== Fetching crypto_ticks ===


  df = yf.download(symbols, period="1d", interval=interval, group_by='ticker', threads=True)
[*********************100%***********************]  5 of 5 completed

[SCRAPE DEBUG] crypto_ticks {'symbol': 'BTC-USD', 'timestamp': '2025-09-18T12:53:00+00:00', 'open': 117142.6171875, 'high': 117142.6171875, 'low': 117142.6171875, 'close': 117142.6171875, 'volume': 4546560.0, 'ingested_at': '2025-09-18T12:55:12.332999'}



  "ingested_at": datetime.utcnow().isoformat()


TypeError: float() argument must be a string or a real number, not 'NaTType'