<a href="https://colab.research.google.com/github/hck717/side-project-quant-pipeline/blob/main/yfinance_ingest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yfinance confluent_kafka minio pyarrow --quiet

import yfinance as yf
import json
from confluent_kafka import Producer
from minio import Minio
import pandas as pd
import sys

import sys

# Connection settings: taken from the first five command-line arguments
# (Airflow / GitHub Actions), otherwise the built-in defaults used for
# Colab testing.
if len(sys.argv) >= 6:
    (
        KAFKA_BROKER,
        MINIO_ENDPOINT,
        MINIO_ACCESS_KEY,
        MINIO_SECRET_KEY,
        BUCKET,
    ) = sys.argv[1:6]
else:
    print("⚠️ Not enough command line arguments, using defaults for Colab testing.")
    (
        KAFKA_BROKER,
        MINIO_ENDPOINT,
        MINIO_ACCESS_KEY,
        MINIO_SECRET_KEY,
        BUCKET,
    ) = (
        "localhost:9092",
        "play.min.io:9000",
        "minioadmin",
        "minioadmin",
        "demo-bucket",
    )

# Ticker lists handed to yfinance, one list per asset group.
# Crypto pairs: downloaded at 1-minute bars and published to Kafka.
CRYPTO_SYMBOLS = [
    "BTC-USD",
    "ETH-USD",
    "SOL-USD",
    "ADA-USD",
    "XRP-USD",
]
# Equities: downloaded both intraday and end-of-day.
EQUITY_SYMBOLS = [
    "AAPL",
    "MSFT",
    "GOOG",
]
# Bond-group tickers: downloaded end-of-day only.
BOND_SYMBOLS = [
    "^TNX",
    "^TYX",
]

def fetch(symbols, period, interval):
    """Download price history for ``symbols`` through ``yf.download``.

    Args:
        symbols: Ticker symbols forwarded as the first argument to yfinance.
        period: Look-back window string understood by yfinance (e.g. "5d").
        interval: Bar size string understood by yfinance (e.g. "1m").

    Returns:
        Whatever ``yf.download`` returns for a ticker-grouped, threaded
        request (the rest of this script indexes it by ticker first).
    """
    download_options = {
        "period": period,
        "interval": interval,
        "group_by": "ticker",
        "threads": True,
    }
    return yf.download(symbols, **download_options)

# Pull every dataset up front: 1-minute bars over the last five days for
# crypto and equities, daily bars over the last month for equities and bonds.
crypto_df = fetch(CRYPTO_SYMBOLS, period="5d", interval="1m")
# NOTE(review): equities_intraday_df is not consumed in the visible part of
# this script - confirm whether a later stage needs it.
equities_intraday_df = fetch(EQUITY_SYMBOLS, period="5d", interval="1m")
equities_eod_df = fetch(EQUITY_SYMBOLS, period="1mo", interval="1d")
bonds_eod_df = fetch(BOND_SYMBOLS, period="1mo", interval="1d")

# Kafka producer
p = Producer({
    'bootstrap.servers': KAFKA_BROKER,
    'security.protocol': 'ssl',
    'ssl.endpoint.identification.algorithm': 'https'
})


def _float_or_none(value):
    """Return ``value`` as a float, or None when pandas treats it as missing."""
    return float(value) if pd.notna(value) else None


def _report_delivery(err, msg):
    """Delivery callback: report messages that could not be delivered."""
    if err is not None:
        print(f"⚠️ Kafka delivery to {msg.topic()} failed: {err}")


# Publish the most recent 1-minute bar of every crypto symbol to the
# "crypto.ticks" topic as a UTF-8 encoded JSON document.
for sym in CRYPTO_SYMBOLS:
    try:
        sym_df = crypto_df[sym].reset_index().dropna(subset=["Datetime"])
        latest_ts = sym_df["Datetime"].max()
        latest_rows = sym_df[sym_df["Datetime"] == latest_ts]
        for _, row in latest_rows.iterrows():
            msg = {
                "symbol": sym,
                "timestamp": row["Datetime"].isoformat() if pd.notna(row["Datetime"]) else None,
                "open": _float_or_none(row["Open"]),
                "high": _float_or_none(row["High"]),
                "low": _float_or_none(row["Low"]),
                "close": _float_or_none(row["Close"]),
                "volume": _float_or_none(row["Volume"]),
            }
            p.produce(
                "crypto.ticks",
                json.dumps(msg).encode('utf-8'),
                on_delivery=_report_delivery,
            )
            # Serve delivery callbacks without blocking the loop.
            p.poll(0)
    except KeyError as exc:
        # Still best-effort per symbol, but say which symbol/column was absent
        # instead of dropping it silently.
        print(f"⚠️ Skipping {sym}: missing key {exc} in downloaded crypto data")

# produce() only enqueues; without a flush the process can exit before the
# queued messages are actually sent. Bound the wait so an unreachable broker
# cannot hang the run.
undelivered = p.flush(30)
if undelivered:
    print(f"⚠️ {undelivered} Kafka message(s) still undelivered after flush timeout")

# MinIO client
client = Minio(MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=True)


def _upload_latest_eod(dataset, ticker, ticker_df):
    """Upload the rows of ``ticker_df`` that fall on its latest date.

    The rows are written as ``<ticker>.parquet`` and stored in ``BUCKET``
    under ``eod/<dataset>/date=<latest date>/``.

    Args:
        dataset: Dataset folder name used in the object path.
        ticker: Ticker symbol; used for the parquet file name.
        ticker_df: Per-ticker frame whose index becomes a "Date" column
            after ``reset_index()``.

    Raises:
        KeyError: If the frame has no "Date" column after ``reset_index()``.
    """
    import os
    import tempfile

    sym_df = ticker_df.reset_index().dropna(subset=["Date"])
    if sym_df.empty:
        # max() of an empty column is NaT, which has no usable date.
        print(f"⚠️ Skipping {dataset}/{ticker}: no dated rows to upload")
        return

    latest_date = sym_df["Date"].max().date()
    latest_rows = sym_df[sym_df["Date"].dt.date == latest_date]
    file_name = f"{ticker}.parquet"
    s3_path = f"eod/{dataset}/date={latest_date}/{file_name}"

    # Stage the parquet file in a throwaway directory so nothing is left
    # behind in the working directory once the upload finishes or fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, file_name)
        latest_rows.to_parquet(local_path, index=False)
        client.fput_object(BUCKET, s3_path, local_path)


for sym, df in [("equities_eod", equities_eod_df), ("bonds_eod", bonds_eod_df)]:
    if not isinstance(df.columns, pd.MultiIndex):
        # Only a MultiIndex exposes .levels; anything else (e.g. an empty
        # download) has no per-ticker column groups to iterate.
        print(f"⚠️ Skipping {sym}: downloaded data has no per-ticker columns")
        continue
    for ticker in df.columns.levels[0]:
        try:
            _upload_latest_eod(sym, ticker, df[ticker])
        except KeyError as exc:
            # Still best-effort per ticker, but report what was missing.
            print(f"⚠️ Skipping {sym}/{ticker}: missing key {exc}")

print("✅ Colab ingestion complete")

