# Sentiment Index Notebook

## Imports

In [None]:
# Install the Finnhub SDK into the running kernel's environment.
# NOTE(review): the version is not pinned, and no visible cell imports `finnhub`
# directly (the notebook uses the project-local `data.finnhub_api` wrapper) —
# confirm this install is still required and pin it if so.
%pip install finnhub-python


In [None]:
import os
import json
import time
import hashlib
from dataclasses import dataclass
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import requests
import yfinance as yf
import matplotlib.pyplot as plt


## Config

In [None]:
# Ticker and date window used by the price download.
SYMBOL = "AAPL"
START = "2024-01-01"
END   = "2024-06-30"

# Read the Finnhub key from the environment so that no credential is stored in
# the notebook (or in version control). Any key that was ever committed here
# should be treated as compromised and rotated.
FINNHUB_API_KEY = os.getenv("FINNHUB_API_KEY", "").strip()
assert FINNHUB_API_KEY, "Set FINNHUB_API_KEY as an environment variable first."

# On-disk cache location; created eagerly so later cells can write into it.
CACHE_DIR = "cache/finnhub"
os.makedirs(CACHE_DIR, exist_ok=True)

MIN_INTERVAL_S = 1.2   # safe pacing for free tier
TIMEOUT_S = 20


## Sentiment index

In [None]:
POS_WORDS = {
    "beat", "beats", "growth", "surge", "record", "profit", "profits", "upgrade",
    "strong", "bull", "bullish", "outperform", "rally", "upside", "win", "wins"
}
NEG_WORDS = {
    "miss", "misses", "drop", "falls", "plunge", "loss", "losses", "downgrade",
    "weak", "bear", "bearish", "underperform", "crash", "downside", "lawsuit"
}

def _tokenize(text: str) -> List[str]:
    if not text:
        return []
    t = "".join(ch.lower() if ch.isalnum() else " " for ch in text)
    return [w for w in t.split() if len(w) >= 2]

def headline_polarity(headline: str) -> float:
    toks = _tokenize(headline)
    if not toks:
        return 0.0
    pos = sum(1 for w in toks if w in POS_WORDS)
    neg = sum(1 for w in toks if w in NEG_WORDS)
    return float((pos - neg) / (pos + neg + 1e-9))


## Daily prices (yFinance)

In [None]:
# Download daily bars for the configured symbol/window, drop incomplete rows,
# and work on an independent copy of the result.
df = (
    yf.download(SYMBOL, start=START, end=END, interval="1d", auto_adjust=True, progress=False)
    .dropna()
    .copy()
)

# Strip any timezone from the index so dates stay naive.
df.index = pd.to_datetime(df.index).tz_localize(None)

# String date key (YYYY-MM-DD) derived from the index.
df["date"] = df.index.strftime("%Y-%m-%d")
df.head()


## Raw Sentiment index

In [None]:

# FinnhubClient comes from the project-local module `data.finnhub_api`.
from data.finnhub_api import FinnhubClient

client = FinnhubClient(api_key=FINNHUB_API_KEY)

# Fetch news for the configured symbol and window so the articles line up with
# the price frame `df` instead of a separately hard-coded ticker/date range.
news = client.company_news_range(
    symbol=SYMBOL,
    date_from=START,
    date_to=END,
)

# Score every headline and bucket the scores by calendar day.
# NOTE(review): assumes `news` is a list of dicts shaped like Finnhub's
# company-news payload, i.e. "headline" (str) and "datetime" (UNIX seconds) —
# confirm against FinnhubClient.company_news_range.
polarity_by_day: Dict[str, List[float]] = {}
for article in news:
    published = article.get("datetime")
    if published is None:
        continue  # an undated article cannot be assigned to a day
    day = pd.to_datetime(published, unit="s").strftime("%Y-%m-%d")
    polarity_by_day.setdefault(day, []).append(headline_polarity(article.get("headline", "")))

# Align onto the trading days in `df`. The scoring cells below read both
# columns: `news_count` is the number of articles that day and `sent_raw` is
# their mean headline polarity (0.0 on days without news). Articles dated on
# days absent from `df` (e.g. weekends) are not carried over.
df["news_count"] = df["date"].map(lambda day: len(polarity_by_day.get(day, []))).astype(int)
df["sent_raw"] = df["date"].map(
    lambda day: float(np.mean(polarity_by_day[day])) if day in polarity_by_day else 0.0
).astype(float)

len(news), news[:2]



## Score

In [None]:
def ewma(x: np.ndarray, alpha: float = 0.2) -> np.ndarray:
    """Exponentially weighted moving average of ``x``.

    The running average starts at 0.0 and is updated as
    ``alpha * x[i] + (1 - alpha) * previous``.

    Args:
        x: One-dimensional sequence of numeric values.
        alpha: Weight given to the newest observation.

    Returns:
        A float32 array with the same shape as ``x`` holding the running average.
    """
    smoothed = np.zeros_like(x, dtype=np.float32)
    keep = 1.0 - alpha
    state = 0.0
    for idx, value in enumerate(x):
        state = alpha * float(value) + keep * state
        smoothed[idx] = state
    return smoothed

def rolling_z(x: np.ndarray, window: int = 60, clip: float = 3.0) -> np.ndarray:
    """Trailing-window z-score of each element of ``x``.

    For position ``i`` the mean and (population) standard deviation are taken
    over the last ``window`` values up to and including ``i``; fewer values
    are used near the start. ``1e-6`` is added to the standard deviation so a
    constant window does not divide by zero.

    Args:
        x: One-dimensional sequence of numeric values.
        window: Maximum number of trailing observations per window.
        clip: Symmetric bound applied to each z-score; pass ``None`` to disable.

    Returns:
        A float32 array with the same shape as ``x`` holding the z-scores.
    """
    scores = np.zeros_like(x, dtype=np.float32)
    for end in range(1, len(x) + 1):
        start = max(0, end - window)
        segment = x[start:end]
        center = float(np.mean(segment))
        spread = float(np.std(segment)) + 1e-6
        score = (float(x[end - 1]) - center) / spread
        if clip is not None:
            score = float(np.clip(score, -clip, clip))
        scores[end - 1] = score
    return scores

# Smooth the raw daily sentiment, then standardise the smoothed series over a
# trailing 60-row window (z-scores clipped to +/-3).
raw_sentiment = df["sent_raw"].values
df["sent_ewma"] = ewma(raw_sentiment, alpha=0.2)

smoothed_sentiment = df["sent_ewma"].values
df["sent_z"] = rolling_z(smoothed_sentiment, window=60, clip=3.0)

summary_columns = ["news_count", "sent_raw", "sent_ewma", "sent_z"]
df[summary_columns].describe()


## Debug

In [None]:
# Sanity summary: row count, news coverage, and the range of each sentiment series.
print("Days:", len(df))

days_with_news = int(df["news_count"].gt(0).sum())
print("Days with news:", days_with_news)

for label, column in (
    ("Raw sentiment min/mean/max:", "sent_raw"),
    ("Z sentiment min/mean/max:", "sent_z"),
):
    series = df[column]
    print(label, float(series.min()), float(series.mean()), float(series.max()))

# If 'Days with news' is too low, widen date range or switch API later.


## Testing Alpha Vantage

In [1]:
# AlphaVantageNewsClient comes from the project-local module `data.finnhub_api`.
from data.finnhub_api import AlphaVantageNewsClient

# Read the Alpha Vantage key from the environment so that no credential is
# stored in the notebook. Any key that was ever committed here should be
# treated as compromised and rotated. (`os` is imported in the imports cell.)
ALPHAVANTAGE_API_KEY = os.getenv("ALPHAVANTAGE_API_KEY", "").strip()
assert ALPHAVANTAGE_API_KEY, "Set ALPHAVANTAGE_API_KEY as an environment variable first."

client = AlphaVantageNewsClient(
    api_key=ALPHAVANTAGE_API_KEY,
    cache_dir="cache/alphavantage_news",
)

tickers = ["AAPL", "AMZN", "MSFT", "GOOGL", "META"]
start_date = "2018-01-01"
end_date = "2024-01-01"

# One fetch per ticker, keyed by ticker symbol.
# NOTE: the recorded output of this cell shows the request parameters,
# including `apikey`, being printed — clear the cell output before sharing.
all_news = {}
for t in tickers:
    all_news[t] = client.fetch_news(t, start_date, end_date, limit=1000, use_cache=True)


Fetching data for:  {'function': 'NEWS_SENTIMENT', 'tickers': 'AAPL', 'time_from': '20180101T0000', 'time_to': '20240101T0000', 'limit': '1000', 'sort': 'LATEST', 'apikey': '<REDACTED>'}
[FETCH] AAPL (1000)
Fetching data for:  {'function': 'NEWS_SENTIMENT', 'tickers': 'AMZN', 'time_from': '20180101T0000', 'time_to': '20240101T0000', 'limit': '1000', 'sort': 'LATEST', 'apikey': '<REDACTED>'}
[FETCH] AMZN (1000)
Fetching data for:  {'function': 'NEWS_SENTIMENT', 'tickers': 'MSFT', 'time_from': '20180101T0000', 'time_to': '20240101T0000', 'limit': '1000', 'sort': 'LATEST', 'apikey': '<REDACTED>'}
[FETCH] MSFT (1000)
Fetching data for:  {'function': 'NEWS_SENTIMENT', 'tickers': 'GOOGL', 'time_from': '20180101T0000', 'time_to': '20240101T0000', 'limit': '1000', 'sort': 'LATEST', 'apikey': '<REDACTED>'}
[FETCH] GOOGL (1000)
Fetching data for:  {'function': 'NEWS_SENTIMENT', 'tickers': 'META', 'time_from': '20180101T0000', 'time_to': '20240101T0000', 'limit': '1000', '