In [ ]:
# Phase A: Data Preparation — EODHD News Download + FinBERT Embedding
# This notebook handles large-scale news data acquisition and embedding.
# Run on Colab with GPU for FinBERT encoding.

import os, sys, time
import numpy as np
import pandas as pd

# --- Environment Setup ---
# On Colab: mount Google Drive and work inside DRIVE_FOLDER (created if missing).
# Elsewhere (google.colab not importable): work inside a local project directory.
# The local directory can be overridden with the GNN_PROJECT_DIR environment
# variable so the notebook is not tied to one user's machine; the default is
# unchanged.
try:
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_FOLDER = "/content/drive/MyDrive/GNN测试"
    os.makedirs(DRIVE_FOLDER, exist_ok=True)
    os.chdir(DRIVE_FOLDER)
    print(f"Colab: working directory set to {DRIVE_FOLDER}")
except ImportError:
    from pathlib import Path
    LOCAL_FOLDER = Path(os.environ.get("GNN_PROJECT_DIR", "/Users/heruixi/Desktop/GNN-Testing"))
    if not LOCAL_FOLDER.is_dir():
        # Same exception type os.chdir would raise, but with an actionable message.
        raise FileNotFoundError(
            f"Project directory not found: {LOCAL_FOLDER}. "
            "Set the GNN_PROJECT_DIR environment variable to your local checkout."
        )
    os.chdir(LOCAL_FOLDER)
    print(f"Local: working directory set to {os.getcwd()}")

# All later cells use paths relative to the working directory chosen above.
os.makedirs("data", exist_ok=True)
print(f"Working dir: {os.getcwd()}")
print(f"Data files: {[f for f in os.listdir('data') if not f.startswith('.')]}")

---
## Step 1: Download News from EODHD API

**API Info:**
- Endpoint: `https://eodhd.com/api/news?s={TICKER}.US`
- Max 1000 articles per request, paginated via `offset`
- Each request costs 5 API calls (paid plan: 100k calls/day → 20k requests/day)
- Response includes: title, content, date, symbols, sentiment (polarity/neg/neu/pos), tags
- Date range: 2021-01-29 to 2026-01-28 (aligned with price data)

In [ ]:
# === EODHD News Download ===
# Supports resume: if interrupted, re-run and it picks up where it left off.

import requests
import json
import time

# ---------- CONFIG ----------
# The token is read from the EODHD_API_TOKEN environment variable when it is set,
# so the secret does not have to be pasted into (and saved with) the notebook.
API_TOKEN = os.environ.get("EODHD_API_TOKEN", "YOUR_API_TOKEN_HERE")  # <-- or replace the placeholder
NEWS_URL  = "https://eodhd.com/api/news"
DATE_FROM = "2021-01-29"
DATE_TO   = "2026-01-28"
LIMIT     = 1000                    # max per request
SLEEP     = 0.3                     # seconds between requests (stay well under 1000/min)
OUTPUT    = "data/sp500_news_eodhd.parquet"
PROGRESS  = "data/_download_progress.json"  # tracks completed tickers for resume
PARTIAL   = "data/_news_partial.parquet"    # articles belonging to the tickers in PROGRESS
# ----------------------------

# Load S&P 500 ticker list
sectors = pd.read_csv("data/sp500_sectors.csv", index_col=0)
all_tickers = sectors.index.tolist()
print(f"Total tickers to download: {len(all_tickers)}")

# Resume support: load progress
if os.path.exists(PROGRESS):
    with open(PROGRESS) as f:
        progress = json.load(f)
    done_tickers = set(progress.get("done", []))
    print(f"Resuming: {len(done_tickers)} tickers already downloaded")
else:
    progress = {"done": []}
    done_tickers = set()

# Collect all articles
all_rows = []

# Load existing partial data if available
if os.path.exists(PARTIAL):
    existing = pd.read_parquet(PARTIAL)
    # Keep only rows of tickers recorded as done: anything else will be
    # downloaded again below and would otherwise be duplicated.
    if "query_ticker" in existing.columns:
        existing = existing[existing["query_ticker"].isin(done_tickers)]
    all_rows = existing.to_dict('records')
    print(f"Loaded {len(all_rows)} existing articles from partial save")

remaining = [t for t in all_tickers if t not in done_tickers]
print(f"Remaining tickers: {len(remaining)}")

errors = []          # (ticker, offset, message) for every failed request
failed_tickers = []  # tickers that must be retried on the next run
for i, ticker in enumerate(remaining):
    ticker_rows = []  # buffered per ticker so a mid-ticker failure leaves no partial pages behind
    offset = 0
    failed = False

    while True:
        # Let requests build and escape the query string.
        params = {
            "s": f"{ticker}.US",
            "from": DATE_FROM,
            "to": DATE_TO,
            "offset": offset,
            "limit": LIMIT,
            "api_token": API_TOKEN,
            "fmt": "json",
        }
        try:
            resp = requests.get(NEWS_URL, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            # requests error messages embed the full URL; keep the token out of the output.
            msg = str(e).replace(API_TOKEN, "***") if API_TOKEN else str(e)
            errors.append((ticker, offset, msg))
            print(f"  ERROR {ticker} offset={offset}: {msg}")
            failed = True
            break

        # An empty or non-list payload is treated as "no more articles".
        if not data or not isinstance(data, list):
            break

        for article in data:
            sentiment = article.get("sentiment", {}) or {}
            ticker_rows.append({
                "date": article.get("date", ""),
                "title": article.get("title", ""),
                "content": article.get("content", ""),
                "link": article.get("link", ""),
                # `or []` guards against an explicit null in the JSON payload.
                "symbols": ";".join(map(str, article.get("symbols") or [])),
                "tags": ";".join(map(str, article.get("tags") or [])),
                "polarity": sentiment.get("polarity", None),
                "neg": sentiment.get("neg", None),
                "neu": sentiment.get("neu", None),
                "pos": sentiment.get("pos", None),
                "query_ticker": ticker,
            })

        # A short page means this was the last one.
        if len(data) < LIMIT:
            break
        offset += LIMIT
        time.sleep(SLEEP)

    ticker_articles = len(ticker_rows)
    if failed:
        # Do NOT mark the ticker as done: it stays in `remaining` for the next run.
        failed_tickers.append(ticker)
    else:
        all_rows.extend(ticker_rows)
        # Mark ticker as done
        done_tickers.add(ticker)
        progress["done"] = sorted(done_tickers)

    # Print progress
    total_done = len(done_tickers)
    if (not failed and total_done % 10 == 0) or i == len(remaining) - 1:
        print(f"  [{total_done}/{len(all_tickers)}] {ticker}: {ticker_articles} articles | Total: {len(all_rows)}")

    # Save progress every 50 tickers
    if not failed and total_done % 50 == 0:
        # Rows first, progress second: a crash in between can only cause a re-download,
        # never a ticker recorded as done whose rows were not saved.
        pd.DataFrame(all_rows).to_parquet(PARTIAL, index=False)
        with open(PROGRESS, 'w') as f:
            json.dump(progress, f)
        print(f"  >> Checkpoint saved ({len(all_rows)} articles)")

    time.sleep(SLEEP)

# Final save
df_news = pd.DataFrame(all_rows)
df_news.to_parquet(OUTPUT, index=False)

print(f"\n=== Download Complete ===")
print(f"Total articles: {len(df_news)}")
print(f"Tickers completed: {len(done_tickers)}")
if errors:
    print(f"Errors ({len(errors)}): {errors[:5]}...")

if failed_tickers:
    # Keep the resume state so that re-running this cell retries only the failed tickers.
    df_news.to_parquet(PARTIAL, index=False)
    with open(PROGRESS, 'w') as f:
        json.dump(progress, f)
    print(f"Failed tickers ({len(failed_tickers)}): {failed_tickers[:20]}")
    print("Resume files kept; re-run this cell to retry the failed tickers.")
else:
    # Cleanup partial files
    for f in [PARTIAL, PROGRESS]:
        if os.path.exists(f):
            os.remove(f)
            print(f"Cleaned up {f}")

---
## Step 2: Data Validation & Cleaning

In [ ]:
# === Load & Validate Downloaded News ===
# Reads the raw download back from disk and prints coverage / quality diagnostics.
# No rows are removed here; the only changes to df_news are the parsed `date`
# column and the added `date_only` column.

df_news = pd.read_parquet("data/sp500_news_eodhd.parquet")
print(f"Raw dataset: {len(df_news)} rows, {df_news.columns.tolist()}")
print(f"\nMissing values:\n{df_news.isna().sum()}")

# Parse dates
# utc=True yields timezone-aware UTC timestamps; errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
df_news["date"] = pd.to_datetime(df_news["date"], utc=True, errors="coerce")
df_news["date_only"] = df_news["date"].dt.date  # calendar date of the UTC timestamp

print(f"\nDate range: {df_news['date'].min()} to {df_news['date'].max()}")
print(f"Unique query_tickers: {df_news['query_ticker'].nunique()}")

# Articles per ticker distribution
# Counted by query_ticker, i.e. the ticker the article was requested for.
per_ticker = df_news.groupby("query_ticker").size()
print(f"\nArticles per ticker:")
print(f"  Mean:   {per_ticker.mean():.0f}")
print(f"  Median: {per_ticker.median():.0f}")
print(f"  Min:    {per_ticker.min()} ({per_ticker.idxmin()})")
print(f"  Max:    {per_ticker.max()} ({per_ticker.idxmax()})")

# Tickers with very few articles
# Only tickers that have at least one row appear in per_ticker, so tickers with
# zero downloaded articles are not listed here.
low_coverage = per_ticker[per_ticker < 10]
if len(low_coverage) > 0:
    print(f"\n⚠ {len(low_coverage)} tickers with <10 articles:")
    print(low_coverage.sort_values().head(20))

# Sentiment coverage
# NOTE(review): the percentage below divides by len(df_news) and raises
# ZeroDivisionError when the dataset is empty.
has_sentiment = df_news["polarity"].notna().sum()
print(f"\nSentiment coverage: {has_sentiment}/{len(df_news)} ({has_sentiment/len(df_news)*100:.1f}%)")
print(f"Polarity stats: mean={df_news['polarity'].mean():.3f}, std={df_news['polarity'].std():.3f}")

In [ ]:
# === Deduplication & Cleaning ===

before = len(df_news)

# 1. Drop articles with no title or date
#    A title that is missing, empty or whitespace-only counts as "no title":
#    the downloader stores a missing title as "" rather than null, which
#    dropna alone would not catch.
df_news = df_news.dropna(subset=["title", "date"])
df_news = df_news[df_news["title"].astype(str).str.strip().ne("")]
print(f"After dropping null title/date: {len(df_news)} (removed {before - len(df_news)})")

# 2. Deduplicate by (title, date) — same article may appear for multiple query_tickers
#    Keep the article but preserve all ticker associations via 'symbols' field
before2 = len(df_news)
df_news = df_news.drop_duplicates(subset=["title", "date"], keep="first")
print(f"After dedup by (title, date): {len(df_news)} (removed {before2 - len(df_news)} duplicates)")

# 3. Filter to our date range
#    `date` holds full timestamps, and a bare date string compares as midnight.
#    Using `<= "2026-01-28"` would therefore drop every article published during
#    the last day; the upper bound is instead exclusive on the following day so
#    that 2026-01-28 is included in full.
df_news = df_news[
    (df_news["date"] >= "2021-01-29") & (df_news["date"] < "2026-01-29")
]
print(f"After date filter: {len(df_news)}")

# 4. Parse symbols into clean ticker list (remove exchange suffixes)
def clean_symbols(sym_str):
    """Normalise a ';'-separated symbol string to unique, sorted base tickers.

    Each entry is stripped of surrounding whitespace and cut at its first '.',
    which removes exchange suffixes such as '.US'. Empty entries are dropped.
    Example: 'AAPL;AAPL.US;MSFT' -> 'AAPL;MSFT'.

    Returns '' when the input is not a non-empty string.
    """
    if not (isinstance(sym_str, str) and sym_str):
        return ""
    # partition(".")[0] is the text before the first dot, or the whole entry
    # when there is no dot.
    bases = (entry.strip().partition(".")[0] for entry in sym_str.split(";"))
    unique_bases = {base for base in bases if base}
    return ";".join(sorted(unique_bases))

# Add the normalised ticker string produced by clean_symbols for every article.
df_news["tickers_clean"] = df_news["symbols"].apply(clean_symbols)

# Save cleaned version
df_news.to_parquet("data/sp500_news_clean.parquet", index=False)
print(f"\nSaved cleaned data: data/sp500_news_clean.parquet ({len(df_news)} articles)")

In [ ]:
# === Build Event-Level Dataset ===
# Explode articles to one row per (article, ticker) pair, align with next-day return
# This cell uses the in-memory df_news left by the cleaning step; it does not
# reload data/sp500_news_clean.parquet.

# Load price data for return computation
# presumably one column per ticker indexed by trading date — verify against the CSV
prices = pd.read_csv("data/sp500_5y_prices.csv", index_col=0, parse_dates=True)
returns = prices.pct_change()  # row-over-row percentage change per column
next_ret = returns.shift(-1)  # t+1 return for label
valid_tickers = set(prices.columns)
print(f"Price data: {prices.shape}, valid tickers: {len(valid_tickers)}")

# Explode: one row per (article, ticker)
# Note: this adds a `ticker_list` column to df_news itself.
df_news["ticker_list"] = df_news["tickers_clean"].str.split(";")
df_exploded = df_news.explode("ticker_list").rename(columns={"ticker_list": "ticker"})
# Keep only tickers that have a price column.
df_exploded = df_exploded[df_exploded["ticker"].isin(valid_tickers)]
print(f"After explode & filter to S&P 500: {len(df_exploded)} rows")

# Align with next trading day return
def get_next_return(dt, ticker, ret_table=None):
    """Look up the label return for an article published at `dt` for `ticker`.

    The article timestamp is truncated to its calendar day; the first row of the
    return table whose index is strictly after that day is selected and its
    value in column `ticker` is returned.

    Timezone handling: article timestamps are timezone-aware (UTC) while a price
    index parsed from date-only CSV values is timezone-naive. Comparing the two
    raises TypeError, so `dt` is aligned to the index first (timezone dropped
    when the index is naive, localised when only the index is aware).

    NOTE(review): the default table is the module-level `next_ret`, which is
    already `returns.shift(-1)`. The value returned for the selected row T is
    therefore the return from T to the following row, not the return realised
    on T itself — confirm this double forward shift is intended.

    Args:
        dt: article timestamp (anything pd.Timestamp accepts; NaT gives None).
        ticker: column name to read from the return table.
        ret_table: optional DataFrame of returns indexed by date; defaults to
            the module-level `next_ret`.

    Returns:
        The return value, or None when there is no later row, the value is NaN,
        the timestamp is NaT, or the ticker column does not exist.
    """
    table = next_ret if ret_table is None else ret_table
    try:
        ts = pd.Timestamp(dt)
        if pd.isna(ts):
            return None
        index_tz = getattr(table.index, "tz", None)
        if ts.tzinfo is not None and index_tz is None:
            ts = ts.tz_localize(None)  # drop the timezone, keep the wall-clock time
        elif ts.tzinfo is None and index_tz is not None:
            ts = ts.tz_localize(index_tz)
        date_only = ts.normalize()  # strip time
        future = table.loc[table.index > date_only, ticker]
        if future.empty:
            return None
        val = future.iloc[0]
        return val if pd.notna(val) else None
    except (KeyError, IndexError):
        return None

# Row-by-row lookup: get_next_return is called once per (article, ticker) pair.
print("Computing next-day returns (this may take a few minutes)...")
df_exploded["return_next"] = [
    get_next_return(dt, tk) 
    for dt, tk in zip(df_exploded["date"], df_exploded["ticker"])
]

# Drop rows without return data
before = len(df_exploded)
df_events = df_exploded.dropna(subset=["return_next"]).copy()
# Binary label: 1 when the aligned return is strictly positive, else 0
# (a return of exactly 0 is labelled 0).
df_events["label"] = (df_events["return_next"] > 0).astype(int)
print(f"After return alignment: {len(df_events)} events (dropped {before - len(df_events)})")

# Select output columns
out_cols = ["date", "ticker", "title", "content", "polarity", "neg", "neu", "pos",
            "tags", "return_next", "label"]
# reset_index gives a 0..n-1 index; explode leaves repeated index values behind.
df_events = df_events[out_cols].reset_index(drop=True)

# Summary
print(f"\n=== Event Dataset Summary ===")
print(f"Total events: {len(df_events)}")
print(f"Unique tickers: {df_events['ticker'].nunique()}")
print(f"Label distribution: {df_events['label'].value_counts(normalize=True).to_dict()}")
print(f"Date range: {df_events['date'].min()} to {df_events['date'].max()}")
print(f"\nTop 10 tickers by count:")
print(df_events["ticker"].value_counts().head(10))

# Save
df_events.to_parquet("data/sp500_news_events.parquet", index=False)
print(f"\nSaved: data/sp500_news_events.parquet")

---
## Step 3: FinBERT Embedding

Encode news titles with ProsusAI/finbert (768-dim financial text embeddings).
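- FinBERT (ProsusAI/finbert): BERT-base further pre-trained on financial news text (Reuters TRC2) and fine-tuned for sentiment classification on Financial PhraseBank
- 768-dim output vs MiniLM's 384-dim → twice the embedding dimensionality (a larger representation, though not necessarily twice the information)
- Also extract the sentiment head's output (3-dim: positive/negative/neutral, stored as softmax probabilities) as auxiliary features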

In [ ]:
# === Install & Load FinBERT ===
!pip install --quiet transformers accelerate

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use the GPU when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Load the tokenizer and the sequence-classification model (encoder plus
# classification head) and move the model to the selected device.
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
finbert = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
finbert.eval()  # inference mode for the module (e.g. dropout disabled)

print(f"FinBERT loaded: {sum(p.numel() for p in finbert.parameters())/1e6:.1f}M parameters")
print(f"Hidden size: {finbert.config.hidden_size}")

In [ ]:
# === Batch Encode Titles with FinBERT ===
# Extracts, per title:
#   1. [CLS] embedding from the last hidden layer, L2-normalised
#   2. Softmax probabilities of the classification head (the raw logits are
#      not stored)
# Row i of both output arrays corresponds to row i of df_events.

df_events = pd.read_parquet("data/sp500_news_events.parquet")
titles = df_events["title"].fillna("").tolist()
print(f"Encoding {len(titles)} titles...")

BATCH_SIZE = 128
all_embeddings = []
all_sentiments = []  # per-batch softmax probabilities, 3 values per title

for i in range(0, len(titles), BATCH_SIZE):
    batch = titles[i:i+BATCH_SIZE]
    # Pad to the longest title in the batch; truncate anything over 128 tokens.
    inputs = tokenizer(
        batch, padding=True, truncation=True,
        max_length=128, return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        outputs = finbert(**inputs, output_hidden_states=True)
        # [CLS] token embedding from last hidden state
        cls_emb = outputs.hidden_states[-1][:, 0, :]  # (batch, 768)
        cls_emb = F.normalize(cls_emb, dim=1)  # L2 normalize
        # Sentiment logits
        # class order assumed positive, negative, neutral — TODO confirm against finbert.config.id2label
        sent_logits = outputs.logits  # (batch, 3): positive, negative, neutral
        sent_probs = F.softmax(sent_logits, dim=1)
    
    all_embeddings.append(cls_emb.cpu().numpy())
    all_sentiments.append(sent_probs.cpu().numpy())
    
    # Progress line every 50 batches.
    if (i // BATCH_SIZE + 1) % 50 == 0:
        print(f"  Encoded {i + len(batch)}/{len(titles)}")

# Stack the per-batch arrays and store as float16 to halve the size on disk.
# NOTE(review): np.vstack raises on an empty list, i.e. when there are no titles.
embeddings = np.vstack(all_embeddings).astype(np.float16)
sentiments = np.vstack(all_sentiments).astype(np.float16)

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Sentiments shape: {sentiments.shape}")
print(f"NaN check - embeddings: {np.isnan(embeddings).any()}, sentiments: {np.isnan(sentiments).any()}")

In [ ]:
# === Save Embeddings & Metadata ===
# Writes three row-aligned artefacts: embeddings, sentiment probabilities and a
# metadata table; row i of each refers to the same event.

# Save embeddings (768-dim FinBERT)
np.save("data/sp500_news_emb_finbert.npy", embeddings)
print(f"Saved: data/sp500_news_emb_finbert.npy {embeddings.shape}")

# Save sentiment probabilities (3-dim: positive, negative, neutral)
np.save("data/sp500_news_sentiment_finbert.npy", sentiments)
print(f"Saved: data/sp500_news_sentiment_finbert.npy {sentiments.shape}")

# Save metadata
meta = df_events[["date", "ticker", "label", "return_next", "polarity"]].copy()
meta["idx"] = np.arange(len(meta))  # explicit row position into the two .npy arrays
meta.to_parquet("data/sp500_news_emb_meta.parquet", index=False)
print(f"Saved: data/sp500_news_emb_meta.parquet ({len(meta)} rows)")

print(f"\n=== Phase A Complete ===")
print(f"Events:     {len(meta)}")
print(f"Embedding:  {embeddings.shape[1]}-dim FinBERT")
print(f"Sentiment:  3-dim (pos/neg/neu) + EODHD polarity")
print(f"Label dist: {meta['label'].value_counts(normalize=True).to_dict()}")

---
## Step 4: Embedding Validation

In [ ]:
# === Embedding Validation ===
# Reloads the saved artefacts from disk and checks that they are mutually consistent.

emb = np.load("data/sp500_news_emb_finbert.npy")
sent = np.load("data/sp500_news_sentiment_finbert.npy")
meta = pd.read_parquet("data/sp500_news_emb_meta.parquet")

print("=== Integrity Checks ===")
# The three artefacts must have the same number of rows.
print(f"Row match: emb={emb.shape[0]}, sent={sent.shape[0]}, meta={len(meta)} → {'OK' if emb.shape[0]==sent.shape[0]==len(meta) else 'MISMATCH'}")
print(f"NaN: emb={np.isnan(emb).any()}, sent={np.isnan(sent).any()}, meta={meta.isna().any().any()}")
# Norms are computed in float32; the stored float16 values are only approximately unit length.
print(f"Embedding L2 norms (should be ~1.0): mean={np.linalg.norm(emb.astype(np.float32), axis=1).mean():.4f}")

print(f"\n=== Dataset Stats ===")
print(f"Total events: {len(meta)}")
print(f"Unique tickers: {meta['ticker'].nunique()}")
print(f"Date range: {meta['date'].min()} to {meta['date'].max()}")
print(f"Label distribution:\n{meta['label'].value_counts()}")

# Top/bottom tickers by count
tc = meta["ticker"].value_counts()
print(f"\nTop 10 tickers: {tc.head(10).to_dict()}")
print(f"Bottom 10 tickers: {tc.tail(10).to_dict()}")

# FinBERT sentiment distribution
# Share of events whose highest-probability class is each label.
print(f"\n=== FinBERT Sentiment ===")
# class order assumed to match the model's output columns — TODO confirm against the model's id2label
labels_sent = ["positive", "negative", "neutral"]
dominant = np.argmax(sent, axis=1)
for i, lab in enumerate(labels_sent):
    pct = (dominant == i).mean() * 100
    print(f"  {lab}: {pct:.1f}%")

In [ ]:
# === t-SNE Visualization of FinBERT Embeddings ===
# Sample 5000 points for speed, color by GICS sector

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load sector mapping
# ticker -> sector, taken from the first data column of the CSV. Selecting the
# column explicitly (instead of .squeeze()) also works for a one-row file and
# for a file with additional columns.
sectors = pd.read_csv("data/sp500_sectors.csv", index_col=0).iloc[:, 0].to_dict()

# Sample for t-SNE
# Fixed seed so the same events are drawn on every run.
N_SAMPLE = min(5000, len(meta))
idx = np.random.RandomState(42).choice(len(meta), N_SAMPLE, replace=False)
emb_sample = emb[idx].astype(np.float32)
meta_sample = meta.iloc[idx]

print(f"Running t-SNE on {N_SAMPLE} samples...")
# t-SNE requires perplexity < number of samples; 30 is used whenever the sample allows it.
tsne = TSNE(n_components=2, perplexity=min(30, max(N_SAMPLE - 1, 1)),
            random_state=42, init='pca', learning_rate='auto')
emb_2d = tsne.fit_transform(emb_sample)

# Color by sector
meta_sample = meta_sample.copy()
meta_sample["sector"] = meta_sample["ticker"].map(sectors).fillna("Unknown")
unique_sectors = sorted(meta_sample["sector"].unique())
# plt.cm.get_cmap was removed in matplotlib 3.9; plt.get_cmap takes the same
# (name, lut) arguments and is still available.
cmap = plt.get_cmap("tab20", len(unique_sectors))
color_map = {s: cmap(i) for i, s in enumerate(unique_sectors)}

plt.figure(figsize=(14, 10))
for sector in unique_sectors:
    # Positional boolean mask: emb_2d rows are in the same order as meta_sample rows.
    mask = (meta_sample["sector"] == sector).to_numpy()
    plt.scatter(emb_2d[mask, 0], emb_2d[mask, 1],
                c=[color_map[sector]], label=sector, alpha=0.5, s=10)
plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
plt.title("FinBERT Embeddings t-SNE (colored by GICS sector)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.tight_layout()

# Save before plt.show(): showing the figure may leave nothing to save afterwards.
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/finbert_tsne_by_sector.png", dpi=150, bbox_inches="tight")
print("Saved: plots/finbert_tsne_by_sector.png")
plt.show()