<a href="https://colab.research.google.com/github/hck717/side-project-quant-pipeline/blob/main/colab_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- PARAMETERS CELL ---
# This cell needs the "parameters" tag in the Jupyter/Colab cell metadata.
KAFKA_BROKER = ""  # Papermill will override this


# --- CONFIG ---
# Symbols to scrape, grouped by asset class.
crypto_tickers = [
    "BTC-USD",
    "ETH-USD",
    "SOL-USD",
    "ADA-USD",
    "XRP-USD",
]
equity_tickers = [
    "AAPL",
    "MSFT",
    "AMZN",
    "TSLA",
    "NVDA",
    "JPM",
    "XOM",
    "META",
    "GOOGL",
    "NFLX",
]
treasury_tickers = [
    "^TNX",
    "^IRX",
    "^FVX",
    "^TYX",
]

# Destination Kafka topic for each asset class.
crypto_topic = "crypto_ticks"
equity_topic = "equities_ticks"
bonds_topic = "bonds_data"




In [None]:
# --- INSTALL DEPENDENCIES ---
# Notebook shell escape (leading "!"): installs the third-party packages that
# the imports cell below relies on. --quiet trims pip's output.
!pip install --quiet yfinance confluent-kafka pandas


In [None]:
# --- IMPORTS ---
import yfinance as yf
import json
from confluent_kafka import Producer
from datetime import datetime, timezone
import pandas as pd


In [None]:
# --- BROKER SETUP ---
# An empty KAFKA_BROKER (no override supplied) falls back to a local broker.
KAFKA_BROKER = KAFKA_BROKER or "localhost:9092"

print(f"📡 Connecting to Kafka broker: {KAFKA_BROKER}")
producer_config = {'bootstrap.servers': KAFKA_BROKER}
producer = Producer(producer_config)


In [None]:
# --- PRODUCE FUNCTION ---
def produce_latest(symbols, topic, interval="1m"):
    """Publish the most recent OHLCV bar of each symbol to a Kafka topic.

    Downloads one day of bars for all ``symbols`` in a single yfinance call,
    keeps only the row(s) carrying the latest timestamp per symbol, and sends
    each one as a UTF-8 JSON message through the module-level ``producer``.

    Args:
        symbols: List of ticker symbols to download.
        topic: Name of the Kafka topic the messages are produced to.
        interval: Bar interval passed through to ``yf.download``.

    Returns:
        None. Symbols without usable data are reported and skipped; delivery
        failures are printed rather than raised.
    """
    print(f"\n=== Fetching {topic} ===")
    df = yf.download(symbols, period="1d", interval=interval, group_by='ticker', threads=True)

    if df is None or df.empty:
        print(f"⚠️ No data returned for {topic}")
        return

    def _on_delivery(err, record):
        # Invoked from poll()/flush(); surfaces failures that happen after
        # produce() has already returned.
        if err is not None:
            print(f"❌ Delivery failed for {record.topic()}: {err}")

    # With group_by='ticker' a multi-symbol download has (ticker, field)
    # columns; a flat frame presumably holds a single symbol's bars.
    per_ticker_columns = df.columns.nlevels > 1
    price_cols = ["Open", "High", "Low", "Close", "Volume"]

    for sym in symbols:
        if per_ticker_columns:
            if sym not in df.columns.get_level_values(0):
                print(f"⚠️ No valid data for {sym}")
                continue
            sym_df = df[sym].reset_index()
        else:
            sym_df = df.reset_index()

        # reset_index() puts the former index first. Its name depends on the
        # interval (e.g. "Datetime" for intraday bars, "Date" for daily bars),
        # so look it up instead of hard-coding it.
        ts_col = sym_df.columns[0]
        required = [ts_col] + price_cols

        if any(col not in sym_df.columns for col in required):
            print(f"⚠️ No valid data for {sym}")
            continue

        sym_df = sym_df.dropna(subset=required)
        if sym_df.empty:
            print(f"⚠️ No valid data for {sym}")
            continue

        latest_ts = sym_df[ts_col].max()
        latest_rows = sym_df[sym_df[ts_col] == latest_ts]

        for _, row in latest_rows.iterrows():
            msg = {
                "symbol": sym,
                "timestamp": row[ts_col].isoformat(),
                "open": float(row["Open"]),
                "high": float(row["High"]),
                "low": float(row["Low"]),
                "close": float(row["Close"]),
                "volume": float(row["Volume"]),
                "ingested_at": datetime.now(timezone.utc).isoformat()
            }
            print(f"[SCRAPE DEBUG] {topic} {msg}")
            payload = json.dumps(msg).encode('utf-8')
            try:
                producer.produce(topic, payload, on_delivery=_on_delivery)
            except BufferError:
                # Local queue is full: serve delivery reports to free space,
                # then retry once before giving up on this message.
                producer.poll(1)
                try:
                    producer.produce(topic, payload, on_delivery=_on_delivery)
                except BufferError as e:
                    print(f"❌ Failed to produce message for {sym}: {e}")
            # Serve pending delivery callbacks without blocking.
            producer.poll(0)

    # Block until every queued message is delivered or has failed.
    producer.flush()


In [None]:
# --- RUN SCRAPES ---
# One (tickers, topic, bar interval) entry per asset class, run in order.
scrape_jobs = (
    (crypto_tickers, crypto_topic, "1m"),
    (equity_tickers, equity_topic, "1m"),
    (treasury_tickers, bonds_topic, "1d"),
)
for job_tickers, job_topic, job_interval in scrape_jobs:
    produce_latest(job_tickers, job_topic, interval=job_interval)

print("\n✅ All data produced to Kafka topics.")

