In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Indian Equity News Agent — zero-args, CSV-driven companies
----------------------------------------------------------
- Reads companies from the CSV named in CONFIG["COMPANY_CSV"] (default: nifty500_symbols.csv) with columns [symbol,name]
- Builds fast alias matcher (FlashText if available; else compiled regex)
- Skips generic/unmappable articles (keeps only ticker-linked items)
- Sentiment (FinBERT) is OFF by default; toggle in CONFIG
- Saves CSV + JSONL and prints a neat, ticker-only console digest

Install (first run):
    python -m venv .venv && source .venv/bin/activate
    pip install -U pip
    pip install feedparser httpx readability-lxml beautifulsoup4 \
                transformers torch rapidfuzz pandas python-dateutil rich \
                tldextract flashtext

Run:
    python news_agent.py
"""

from __future__ import annotations
import os, re, json, time, sqlite3, hashlib
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Optional, Tuple

# ============================ CONFIG ==========================================
CONFIG = {
    # RSS sources (your trusted 4); fetch_rss_items walks them in this order
    "SOURCES": [
        "https://www.livemint.com/rss/companies",
        "https://economictimes.indiatimes.com/markets/stocks/rssfeeds/2146842.cms",
        "https://www.cnbctv18.com/commonfeeds/v1/cne/rss/market.xml",
        "https://www.thehindubusinessline.com/companies/feeder/default.rss",
    ],

    # Companies CSV (required): columns -> symbol,name
    # ensure_dirs raises FileNotFoundError when this path does not exist.
    "COMPANY_CSV": "nifty500_symbols.csv",

    # Feature toggles (NO CLI — edit here)
    "ENABLE_SENTIMENT": False,     # <— set True to enable FinBERT
    "USE_FULLTEXT": False,         # when True (and sentiment enabled), fetch article pages

    # Text caps (character counts) applied in run_once when building the
    # text that is handed to the sentiment model
    "TITLE_SUMMARY_MAX_CHARS": 600,
    "FULLTEXT_MAX_CHARS": 2000,

    # FinBERT batching: number of texts per forward pass in finbert_scores_batch
    "BATCH_SIZE": 16,

    # Limits and HTTP behavior
    # MAX_ARTICLES caps the items collected per run across all feeds;
    # REQUEST_TIMEOUT_S / MAX_RETRIES / USER_AGENT are read by http_get_text;
    # BACKOFF_BASE_S is the base value _backoff_sleep uses for the retry delay;
    # POLITE_DELAY_S is a pause in seconds passed to time.sleep (e.g. between feeds).
    "MAX_ARTICLES": 300,
    "REQUEST_TIMEOUT_S": 12.0,
    "MAX_RETRIES": 3,
    "BACKOFF_BASE_S": 0.8,
    "POLITE_DELAY_S": 0.4,
    "USER_AGENT": "Mozilla/5.0 (compatible; NewsSentimentBot/1.5; +https://example.local)",

    # Sentiment thresholds (FinBERT score = P(pos) - P(neg))
    # score >= BULL_THRESHOLD -> "bullish"; score <= BEAR_THRESHOLD -> "bearish";
    # anything in between -> "neutral".
    "BULL_THRESHOLD": 0.15,
    "BEAR_THRESHOLD": -0.15,
    # NOTE(review): HALF_LIFE_HOURS is not read by any code visible in this
    # file — confirm whether it is still needed before relying on it.
    "HALF_LIFE_HOURS": 8.0,

    # Fuzzy fallback (only used if no alias/exact match hit)
    # FUZZY_TOP_K is the `limit` given to rapidfuzz; FUZZY_SCORE_CUT is the
    # minimum score a candidate needs to be accepted.
    "FUZZY_TOP_K": 2,
    "FUZZY_SCORE_CUT": 84,

    # Alias generation knobs (from CSV names)
    "GEN_ADD_ACRONYM": True,               # also add an initials-based acronym alias per company
    "ACRONYM_MIN_LEN": 3,                  # keep >=3 to avoid noise (LT would be dropped)
    "GEN_STRIP_TOKENS": [                  # tokens to drop when generating simplified aliases
        "limited", "ltd", "ltd.", "private", "pvt", "pvt.", "company", "co.", "inc", "inc.",
        "india", "industries", "corporation", "corp", "corp.", "plc"
    ],
    "ALIAS_MIN_CHARS": 3,                 # ignore too-short aliases

    # Paths
    # OUT_DIR receives the per-run CSV/JSONL files; DB_PATH is the SQLite
    # file backing the "seen" URL table.
    "OUT_DIR": "outputs/news_sentiment",
    "DB_PATH": "outputs/news_sentiment/seen.sqlite3",

    # Logging: True switches the root log level from INFO to DEBUG
    "VERBOSE": False,
}
# ==============================================================================

import logging
from rich.console import Console
from rich.table import Table
from rich.text import Text
from rich.traceback import install as rich_traceback
from rich.logging import RichHandler

import feedparser
import httpx
from bs4 import BeautifulSoup
from readability import Document
from dateutil import parser as dtp
import pandas as pd
from rapidfuzz import process, fuzz
import tldextract

try:
    from flashtext import KeywordProcessor  # super fast keyword search
except Exception:
    KeywordProcessor = None

# ---- setup logging -----------------------------------------------------------
# Install rich's traceback renderer (local variables are not shown).
rich_traceback(show_locals=False)
# Single shared console: used for the log handler and for table output.
console = Console()
# Fixed UTC+05:30 offset; used when normalising feed dates and taking "now".
IST = timezone(timedelta(hours=5, minutes=30))
LOG = logging.getLogger("news_agent")
handler = RichHandler(console=console, show_time=True, show_path=False, rich_tracebacks=True)
# DEBUG when CONFIG["VERBOSE"] is truthy, INFO otherwise; the RichHandler
# above is the only handler passed to basicConfig.
logging.basicConfig(level=logging.DEBUG if CONFIG["VERBOSE"] else logging.INFO,
                    format="%(message)s", handlers=[handler])

# ---- helpers ----------------------------------------------------------------
def ensure_dirs() -> None:
    """Create the output directories and verify the companies CSV exists.

    Creates CONFIG["OUT_DIR"] and the parent directory of CONFIG["DB_PATH"]
    (both are no-ops when they already exist).

    Raises:
        FileNotFoundError: if CONFIG["COMPANY_CSV"] does not exist.
    """
    os.makedirs(CONFIG["OUT_DIR"], exist_ok=True)
    # A bare filename (e.g. "seen.sqlite3") has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError — only create a real directory.
    db_dir = os.path.dirname(CONFIG["DB_PATH"])
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
    # Ensure CSV exists
    if not os.path.exists(CONFIG["COMPANY_CSV"]):
        raise FileNotFoundError(f"Company CSV not found at: {CONFIG['COMPANY_CSV']}")

def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def canonical_url(url: str) -> str:
    """Return *url* without its fragment and without ``utm_*`` query parameters.

    Parameter names are compared case-insensitively; blank values are kept.
    The remaining query is re-encoded with ``urlencode``. If anything goes
    wrong while parsing or rebuilding, the input is returned unchanged.
    """
    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
    try:
        parts = urlparse(url)
        kept = []
        for key, value in parse_qsl(parts.query, keep_blank_values=True):
            if key.lower().startswith("utm_"):
                continue  # tracking parameter: drop it
            kept.append((key, value))
        rebuilt_query = urlencode(kept, doseq=True)
        return urlunparse(parts._replace(fragment="", query=rebuilt_query))
    except Exception:
        return url

def seen_init(db_path: str) -> None:
    """Create the ``seen`` table in the SQLite file at *db_path* if missing.

    The table has columns ``hash`` (primary key), ``url`` and ``published``.
    Calling this repeatedly is harmless.

    Args:
        db_path: path of the SQLite database file (created by sqlite3 if absent).
    """
    con = sqlite3.connect(db_path)
    try:
        con.execute("CREATE TABLE IF NOT EXISTS seen (hash TEXT PRIMARY KEY, url TEXT, published TEXT);")
        con.commit()
    finally:
        # Close even when the statement fails so the file handle is not leaked.
        con.close()

def seen_contains(db_path: str, url: str) -> bool:
    """Return True when *url* has already been recorded in the ``seen`` table.

    Rows are keyed by the SHA-256 hex digest of the URL, so the lookup is by
    hash rather than by the raw URL text.

    Args:
        db_path: path of the SQLite database file.
        url: URL to look up.
    """
    h = sha256(url)
    con = sqlite3.connect(db_path)
    try:
        row = con.execute("SELECT 1 FROM seen WHERE hash=?", (h,)).fetchone()
    finally:
        # Close even when the query fails so the connection is not leaked.
        con.close()
    return row is not None

def seen_add(db_path: str, url: str, published_iso: str) -> None:
    """Record *url* in the ``seen`` table; an already-present URL is ignored.

    Args:
        db_path: path of the SQLite database file.
        url: URL to record (stored alongside its SHA-256 hex digest).
        published_iso: publication timestamp string stored in ``published``.
    """
    h = sha256(url)
    con = sqlite3.connect(db_path)
    try:
        con.execute("INSERT OR IGNORE INTO seen(hash,url,published) VALUES (?,?,?)", (h, url, published_iso))
        con.commit()
    finally:
        # Close even when the insert fails so the connection is not leaked.
        con.close()

# ---- network ----------------------------------------------------------------
def _backoff_sleep(attempt: int) -> None:
    """Sleep before the next retry, waiting longer after every failed attempt.

    The delay is ``BACKOFF_BASE_S * 2**(attempt - 1) + 0.1 * attempt`` seconds.
    Raising the base itself to the power of ``attempt`` (the previous formula)
    makes the delay *shrink* whenever the base is below 1.0, which is the
    opposite of a backoff.

    Args:
        attempt: 1-based number of the attempt that just failed.
    """
    doublings = max(0, attempt - 1)
    delay_s = CONFIG["BACKOFF_BASE_S"] * (2 ** doublings) + 0.1 * attempt
    time.sleep(delay_s)

def http_get_text(url: str) -> Optional[str]:
    """GET *url* and return the response body as text, or None on failure.

    Up to CONFIG["MAX_RETRIES"] attempts are made. A 200 or 304 response
    returns its text; a 403 or 404 gives up immediately; any other status or
    any exception is logged at debug level and retried after a backoff.

    Args:
        url: address to fetch.

    Returns:
        The decoded body, or None when every attempt failed or the server
        answered 403/404.
    """
    headers = {"User-Agent": CONFIG["USER_AGENT"]}
    max_tries = CONFIG["MAX_RETRIES"]
    # One client for all attempts so retries can reuse the connection pool.
    with httpx.Client(follow_redirects=True, headers=headers, timeout=CONFIG["REQUEST_TIMEOUT_S"]) as client:
        for attempt in range(1, max_tries + 1):
            try:
                r = client.get(url)
                if r.status_code in (200, 304):
                    return r.text
                if r.status_code in (403, 404):
                    LOG.debug(f"{r.status_code} {url}")
                    return None
                LOG.debug(f"GET retry {attempt}: status {r.status_code} {url}")
            except Exception as e:
                # Deliberate best-effort: any transport/decoding problem is retried.
                LOG.debug(f"GET fail {attempt}: {e}")
            # No point in sleeping when there is no further attempt to wait for.
            if attempt < max_tries:
                _backoff_sleep(attempt)
    return None

# ---- RSS ingest --------------------------------------------------------------
def fetch_rss_items(feed_urls: List[str]) -> List[Dict[str, Any]]:
    """Download the given feeds and return their not-yet-seen entries.

    Feeds are processed in order until CONFIG["MAX_ARTICLES"] items have been
    collected. An entry is skipped when it has no link/id, when its
    canonical URL is already in the seen-DB, or when the same canonical URL
    was already queued earlier in this call (the seen-DB is only written
    later, so without this check a link repeated within or across feeds
    would be returned twice).

    Args:
        feed_urls: RSS/Atom feed addresses.

    Returns:
        A list of dicts with keys ``provider``, ``url``, ``title``,
        ``summary`` and ``published`` (ISO-8601 string converted to IST).
    """
    cap = CONFIG["MAX_ARTICLES"]
    items: List[Dict[str, Any]] = []
    queued_links = set()  # canonical URLs already collected in this call

    for url in feed_urls:
        if len(items) >= cap:
            break
        body = http_get_text(url)
        if not body:
            LOG.warning(f"Skipping feed: {url}")
            continue

        parsed = feedparser.parse(body)
        # Fall back to the feed's registered domain when it carries no title.
        src_title = (parsed.feed.get("title") if parsed.feed else None) or tldextract.extract(url).registered_domain
        count_before = len(items)

        for e in parsed.entries or []:
            if len(items) >= cap:
                break
            link = e.get("link") or e.get("id")
            if not link:
                continue
            link = canonical_url(link)
            if link in queued_links or seen_contains(CONFIG["DB_PATH"], link):
                continue
            queued_links.add(link)

            title = (e.get("title") or "").strip()
            # Summaries are HTML fragments; keep only their text.
            summary = BeautifulSoup(e.get("summary", "") or e.get("subtitle", ""), "lxml").get_text(" ", strip=True)
            try:
                pdt = dtp.parse(e.get("published") or e.get("updated") or datetime.now(IST).isoformat()).astimezone(IST)
            except Exception:
                # Unparseable date: fall back to the current time.
                pdt = datetime.now(IST)
            items.append({
                "provider": src_title, "url": link,
                "title": title, "summary": summary,
                "published": pdt.isoformat(),
            })

        new_count = len(items) - count_before
        LOG.info(f"[RSS] {src_title}: {new_count} new (cap left: {max(0, cap - len(items))})")
        time.sleep(CONFIG["POLITE_DELAY_S"])
    return items

# ---- article extraction ------------------------------------------------------
def extract_text_from_html(html: str) -> str:
    """Return the readable text of an HTML page, one fragment per line.

    The page is first reduced with readability's ``Document.summary()``;
    ``script``, ``style`` and ``noscript`` elements are then removed before
    the text is extracted.
    """
    main_html = Document(html).summary()
    soup = BeautifulSoup(main_html, "lxml")
    for unwanted in soup.find_all(["script", "style", "noscript"]):
        unwanted.decompose()
    return soup.get_text("\n", strip=True)

def fetch_article_text(url: str) -> str:
    """Fetch *url* and return its extracted article text ("" if the fetch fails)."""
    page = http_get_text(url)
    if not page:
        return ""
    return extract_text_from_html(page)

# ---- CSV -> alias table ------------------------------------------------------
def _acronym_from_name(name: str, *, min_len: Optional[int] = None) -> Optional[str]:
    """Build an upper-case acronym from the initials of a company name.

    Connector words ("of", "and", "the", "for") are skipped, so
    "State Bank of India" -> "SBI" and "Oil and Natural Gas Corporation" ->
    "ONGC". When skipping them leaves fewer than *min_len* letters, the
    acronym over all words is used instead ("Bank of Baroda" -> "BOB").

    Args:
        name: company name as written in the CSV.
        min_len: minimum acronym length to accept; defaults to
            CONFIG["ACRONYM_MIN_LEN"] when not given.

    Returns:
        The acronym, or None when no candidate reaches *min_len* letters.
    """
    if min_len is None:
        min_len = CONFIG["ACRONYM_MIN_LEN"]
    connectors = {"of", "and", "the", "for"}
    # Words may contain "&" (e.g. "M&M"); a lone "&" is not alphanumeric and is dropped.
    words = [w for w in re.findall(r"[A-Za-z0-9&]+", name) if w[0].isalnum()]

    preferred = "".join(w[0].upper() for w in words if w.lower() not in connectors)
    if len(preferred) >= min_len:
        return preferred
    # Too short without connectors: fall back to the initials of every word.
    fallback = "".join(w[0].upper() for w in words)
    if len(fallback) >= min_len:
        return fallback
    return None

def _simplify_name(name: str, *, strip_tokens: Optional[List[str]] = None,
                   min_chars: Optional[int] = None) -> Optional[str]:
    """Return a lower-cased company name with boilerplate tokens removed.

    The name is split into runs of letters, digits and "&"; tokens listed in
    *strip_tokens* are dropped and the rest re-joined with single spaces.
    Because the tokenizer never yields a ".", strip entries are compared
    without surrounding dots — otherwise entries such as "co." could never
    match the token "co".

    Args:
        name: company name as written in the CSV.
        strip_tokens: tokens to remove; defaults to CONFIG["GEN_STRIP_TOKENS"].
        min_chars: minimum length of the result; defaults to
            CONFIG["ALIAS_MIN_CHARS"].

    Returns:
        The simplified name, or None when it is shorter than *min_chars*.
    """
    if strip_tokens is None:
        strip_tokens = CONFIG["GEN_STRIP_TOKENS"]
    if min_chars is None:
        min_chars = CONFIG["ALIAS_MIN_CHARS"]
    # Built once per call (not once per token) and normalised to token form.
    drop = {t.lower().strip(".") for t in strip_tokens}
    tokens = re.findall(r"[A-Za-z0-9&]+", name.lower())
    simp = " ".join(t for t in tokens if t not in drop)
    if len(simp) >= min_chars:
        return simp
    return None

def load_company_aliases(csv_path: str) -> Dict[str, List[str]]:
    """
    Reads CSV (symbol,name) and returns dict: symbol -> [aliases...]
    Aliases include:
      - canonical name
      - simplified name (tokens stripped)
      - acronym (optional, controlled by CONFIG["GEN_ADD_ACRONYM"])

    Column headers are matched case-insensitively (surrounding whitespace
    ignored). Rows with a blank symbol or name are skipped. Aliases shorter
    than CONFIG["ALIAS_MIN_CHARS"] are dropped. Each alias list is sorted
    longest-first, ties broken alphabetically.

    Raises:
        ValueError: when the CSV lacks a ``symbol`` or ``name`` column.
    """
    # dtype=str keeps symbols exactly as written (no int/float coercion);
    # keep_default_na=False makes blank cells arrive as "" rather than NaN,
    # which str() would otherwise turn into the bogus alias "nan".
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    # normalize columns
    cols = {str(c).strip().lower(): c for c in df.columns}
    sym_col = cols.get("symbol")
    name_col = cols.get("name")
    if not sym_col or not name_col:
        raise ValueError("companies.csv must have columns: symbol,name")

    min_chars = CONFIG["ALIAS_MIN_CHARS"]
    add_acronym = CONFIG["GEN_ADD_ACRONYM"]

    by_symbol: Dict[str, set] = {}
    for raw_sym, raw_name in zip(df[sym_col], df[name_col]):
        sym = str(raw_sym).strip()
        name = str(raw_name).strip()
        if not sym or not name:
            continue
        candidates = {name, _simplify_name(name)}
        if add_acronym:
            candidates.add(_acronym_from_name(name))
        # Drop missing (None) and too-short aliases
        aliases = {a for a in candidates if a and len(a) >= min_chars}
        # The same symbol may appear on several rows; merge their aliases.
        by_symbol.setdefault(sym, set()).update(aliases)

    # length-sort aliases (longer first helps regex boundary edge cases)
    return {sym: sorted(als, key=lambda s: (-len(s), s)) for sym, als in by_symbol.items()}

# ---- build matcher -----------------------------------------------------------
def build_ticker_matcher(symbol_aliases: Dict[str, List[str]]):
    """
    Returns (mode, matcher):
      - ('flashtext', KeywordProcessor)
      - ('regex', dict[symbol]->compiled regex pattern)

    FlashText is used when it could be imported; otherwise one
    case-insensitive regex per symbol is compiled. Aliases shorter than
    CONFIG["ALIAS_MIN_CHARS"] are ignored in both modes, and symbols left
    without any usable alias get no regex.
    """
    min_chars = CONFIG["ALIAS_MIN_CHARS"]

    if KeywordProcessor is not None:
        kp = KeywordProcessor(case_sensitive=False)
        for sym, aliases in symbol_aliases.items():
            for alias in aliases:
                if len(alias) < min_chars:
                    continue
                # Each alias resolves to its symbol when found in text.
                kp.add_keyword(alias, sym)
        return ("flashtext", kp)

    patterns = {}
    for sym, aliases in symbol_aliases.items():
        safe_aliases = [re.escape(a) for a in aliases if len(a) >= min_chars]
        if not safe_aliases:
            continue
        # Lookarounds instead of \b: "\b" only matches next to a word
        # character, so an alias ending in punctuation (e.g. "Ltd.") would
        # never match when followed by a space. "(?<!\w)" / "(?!\w)" behave
        # like \b for ordinary aliases and also work for those.
        pat = r"(?<!\w)(?:%s)(?!\w)" % "|".join(safe_aliases)
        patterns[sym] = re.compile(pat, flags=re.I)
    return ("regex", patterns)

# ---- map text -> tickers (scaled) -------------------------------------------
def map_tickers_scaled(text: str,
                       symbol_aliases: Dict[str, List[str]],
                       matcher_tuple,
                       fuzzy_names: List[str],
                       fuzzy_map: Dict[str, str]) -> List[str]:
    """
    1) Exact/alias matching via FlashText or regex.
    2) If nothing matched, fallback to RapidFuzz on company names.
    Returns unique symbols (order preserved by first appearance).

    Args:
        text: text to scan; empty text yields [].
        symbol_aliases: symbol -> aliases table (kept for interface
            compatibility; matching goes through *matcher_tuple*).
        matcher_tuple: ``(mode, matcher)`` as returned by build_ticker_matcher.
        fuzzy_names: candidate names for the fuzzy fallback.
        fuzzy_map: candidate name -> symbol; names missing from it are ignored.
    """
    mode, matcher = matcher_tuple
    if not text:
        return []

    if mode == "flashtext":
        # extract_keywords yields symbols in text order; dict.fromkeys
        # de-duplicates while keeping the first occurrence.
        hits = list(dict.fromkeys(matcher.extract_keywords(text)))
    else:
        # Record where each symbol first matches so the result follows text
        # order rather than the iteration order of the pattern dict.
        found = []
        for sym, pat in matcher.items():
            m = pat.search(text)
            if m:
                found.append((m.start(), sym))
        found.sort(key=lambda pair: pair[0])  # stable: ties keep dict order
        hits = [sym for _, sym in found]

    if hits:
        return hits
    if not fuzzy_names:
        return []

    # Fuzzy fallback across canonical names only (not all aliases)
    best = process.extract(text, fuzzy_names, scorer=fuzz.token_set_ratio, limit=CONFIG["FUZZY_TOP_K"])
    cut = CONFIG["FUZZY_SCORE_CUT"]
    chosen = [fuzzy_map[name] for (name, score, _) in best if score >= cut and name in fuzzy_map]
    return list(dict.fromkeys(chosen))

# ---- sentiment (lazy) --------------------------------------------------------
# Module-level cache so the model and tokenizer are loaded at most once.
_FINBERT_MODEL = None
_FINBERT_TOK = None


def finbert_load():
    """Load the FinBERT tokenizer and model into the module cache on first use.

    ``transformers`` is imported lazily so that it is only needed when
    sentiment scoring is actually requested. The model is put in eval mode.
    """
    global _FINBERT_MODEL, _FINBERT_TOK
    if _FINBERT_MODEL is not None:
        return  # already loaded
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    _FINBERT_TOK = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    _FINBERT_MODEL = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    _FINBERT_MODEL.eval()

def finbert_scores_batch(texts: List[str], batch_size: int):
    """Score *texts* with FinBERT and label each one bullish/bearish/neutral.

    The score is ``P(positive) - P(negative)``. The class positions are read
    from the model's own ``config.id2label`` instead of being assumed: the
    ProsusAI/finbert checkpoint does not order its classes as
    (negative, neutral, positive), so hard-coded indices mislabel the
    probabilities.

    Args:
        texts: input strings (truncated by the tokenizer to 512 tokens).
        batch_size: number of texts per forward pass; values below 1 are
            treated as 1.

    Returns:
        One ``(score, label, {"neg": .., "neu": .., "pos": ..})`` tuple per
        input text, in input order.

    Raises:
        ValueError: if the model config does not name negative/neutral/positive.
    """
    finbert_load()
    import torch

    id2label = getattr(_FINBERT_MODEL.config, "id2label", None) or {}
    index_of = {str(label).lower(): int(idx) for idx, label in id2label.items()}
    try:
        neg_i = index_of["negative"]
        neu_i = index_of["neutral"]
        pos_i = index_of["positive"]
    except KeyError as err:
        raise ValueError(f"FinBERT config lacks an expected label: {err}") from err

    step = max(1, int(batch_size))  # range() rejects a zero step
    outs = []
    for i in range(0, len(texts), step):
        chunk = texts[i:i + step]
        enc = _FINBERT_TOK(chunk, truncation=True, padding=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = _FINBERT_MODEL(**enc).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
        for p in probs:
            neg, neu, pos = float(p[neg_i]), float(p[neu_i]), float(p[pos_i])
            score = pos - neg
            if score >= CONFIG["BULL_THRESHOLD"]:
                lab = "bullish"
            elif score <= CONFIG["BEAR_THRESHOLD"]:
                lab = "bearish"
            else:
                lab = "neutral"
            outs.append((score, lab, {"neg": neg, "neu": neu, "pos": pos}))
    return outs

# ---- console printing --------------------------------------------------------
def print_news(df: pd.DataFrame, show_sent: bool):
    """Render the articles in *df* as a rich table on the shared console.

    Args:
        df: frame with columns ``time_ist``, ``provider``, ``title``, ``url``
            and ``tickers`` (plus ``sentiment_label`` / ``sentiment_score``
            when *show_sent* is True).
        show_sent: add a "Sentiment" column showing label and score.
    """
    if df.empty:
        console.print("[yellow]No ticker-linked articles.[/yellow]")
        return

    t = Table(title="Latest Indian Equity News (ticker-linked)")
    t.add_column("#", justify="right")
    t.add_column("Time (IST)", no_wrap=True)
    t.add_column("Source", no_wrap=True)
    t.add_column("Ticker(s)", no_wrap=True)
    t.add_column("Title / URL", overflow="fold")
    if show_sent:
        t.add_column("Sentiment", no_wrap=True)

    for i, r in enumerate(df.itertuples(index=False), 1):
        try:
            dt_show = pd.to_datetime(r.time_ist).tz_convert(IST).strftime("%Y-%m-%d %H:%M")
        except Exception:
            # Not convertible (e.g. no timezone info): show the raw value.
            dt_show = str(r.time_ist)
        tick = r.tickers.replace(",", " ")
        text = Text(r.title)
        text.append("\n", style="")
        text.append(r.url, style="blue underline")
        row = [str(i), dt_show, r.provider, tick, text]
        if show_sent:
            label = getattr(r, "sentiment_label", "") or ""
            score = getattr(r, "sentiment_score", None)
            # Test for "missing" explicitly: a plain truthiness check would
            # also hide a legitimate score of exactly 0.0.
            score_txt = "" if score is None or pd.isna(score) else str(score)
            row.append(f"{label} {score_txt}".strip())
        t.add_row(*row)
    console.print(t)

# ---- core --------------------------------------------------------------------
def run_once(enable_sentiment: bool):
    """Run one fetch -> match -> (optional) score -> save -> print cycle.

    Steps:
      1. Prepare directories and the seen-DB, load company aliases, build
         the matcher.
      2. Fetch new RSS items and keep only those that map to a ticker.
      3. When *enable_sentiment* is True, score the kept items with FinBERT
         (optionally appending fetched article text).
      4. Write a CSV and a JSONL file, mark the saved URLs as seen, and
         print the digest.

    Args:
        enable_sentiment: whether to compute and output sentiment columns.

    Returns:
        ``(articles_df, empty_df)`` — the second element is always an empty
        DataFrame, kept for interface compatibility.
    """
    ensure_dirs(); seen_init(CONFIG["DB_PATH"])

    # Load companies and build matchers
    symbol_aliases = load_company_aliases(CONFIG["COMPANY_CSV"])
    matcher = build_ticker_matcher(symbol_aliases)

    # Fuzzy fallback uses one canonical name per symbol: the first alias,
    # which is the longest because alias lists are sorted longest-first.
    # Names and map are derived from the same dict so they cannot disagree.
    canonical_map: Dict[str, str] = {
        aliases[0]: sym for sym, aliases in symbol_aliases.items() if aliases
    }
    canonical_names = sorted(canonical_map)

    items = fetch_rss_items(CONFIG["SOURCES"])
    if not items:
        LOG.info("No new items."); return pd.DataFrame(), pd.DataFrame()

    now = datetime.now(IST)

    # Map first, so that page fetches and model inference are only spent on
    # articles that will actually be output.
    matched = []
    for it in items:
        text_for_map = (it["title"] + " " + it.get("summary", ""))
        tickers = map_tickers_scaled(
            text_for_map,
            symbol_aliases=symbol_aliases,
            matcher_tuple=matcher,
            fuzzy_names=canonical_names,
            fuzzy_map=canonical_map
        )
        if tickers:
            matched.append((it, tickers))

    if not matched:
        console.print("[yellow]No ticker-matched news this run.[/yellow]")
        return pd.DataFrame(), pd.DataFrame()

    # Build text batch for sentiment only when enabled
    if enable_sentiment:
        texts = []
        for it, _ in matched:
            # Cap title+summary on its own; the article body has a separate
            # cap. Truncating the combined string to the title cap would
            # throw away nearly all of the fetched body.
            text = (it["title"] + ". " + it.get("summary", "")).strip()[:CONFIG["TITLE_SUMMARY_MAX_CHARS"]]
            if CONFIG["USE_FULLTEXT"]:
                try:
                    body = fetch_article_text(it["url"])[:CONFIG["FULLTEXT_MAX_CHARS"]]
                except Exception as e:
                    # Best-effort: one unparseable page must not abort the run.
                    LOG.debug(f"Full-text extraction failed for {it['url']}: {e}")
                    body = ""
                if body:
                    text += " " + body
                # Be polite between article page requests.
                time.sleep(CONFIG["POLITE_DELAY_S"])
            texts.append(text)
        scores = finbert_scores_batch(texts, CONFIG["BATCH_SIZE"])
    else:
        scores = [(None, None, None)] * len(matched)

    # Compose output rows
    rows = []
    for (it, tickers), (score, label, _probs) in zip(matched, scores):
        row = {
            "time_ist": it["published"],
            "provider": it["provider"],
            "title": it["title"],
            "url": it["url"],
            "tickers": ",".join(tickers),
        }
        if enable_sentiment:
            row.update({
                "sentiment_score": round(score, 4) if score is not None else None,
                "sentiment_label": label or "",
            })
        rows.append(row)

    df = pd.DataFrame(rows)

    # Save outputs
    stamp = now.strftime("%Y-%m-%d_%H%M%S")
    out_dir = CONFIG["OUT_DIR"]
    path_articles_csv = os.path.join(out_dir, f"articles_{stamp}.csv")
    path_articles_jsonl = os.path.join(out_dir, f"articles_{stamp}.jsonl")
    df.to_csv(path_articles_csv, index=False)
    with open(path_articles_jsonl, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

    LOG.info(f"Saved: {path_articles_csv}, {path_articles_jsonl}")

    # Mark URLs as seen only after they were written out, so a failed save
    # does not silently drop articles from future runs.
    # NOTE(review): articles that matched no ticker are never marked seen and
    # are re-evaluated on the next run — confirm this is intended.
    for it, _ in matched:
        seen_add(CONFIG["DB_PATH"], it["url"], it["published"])

    # Print
    cols = ["time_ist", "provider", "title", "url", "tickers"]
    if enable_sentiment:
        cols += ["sentiment_label", "sentiment_score"]
    print_news(df[cols], show_sent=enable_sentiment)

    return df, pd.DataFrame()

# ---- entry -------------------------------------------------------------------
def main():
    """Script entry point: run a single cycle with the sentiment toggle from CONFIG."""
    sentiment_enabled = CONFIG["ENABLE_SENTIMENT"]
    run_once(sentiment_enabled)


if __name__ == "__main__":
    main()
