# AI News Sentiment Agent (v3)
Fixes for runs that returned zero articles, plus broader and smarter news sources.

Key changes vs v2:
- Title **and** summary keyword match for generic feeds (was title-only)
- More general feeds: ET IT/ITES, Business Standard Companies, CNBC-TV18 Latest
- Optional NSE Corporate Announcements JSON (with cookie priming)
- HTTP retries for page fetches, plus a `FORCE_REFRESH` switch that bypasses the on-disk cache

**Outputs** remain the same: per-query CSVs + a daily `bias_ledger.csv` under `outputs/YYYY-MM-DD/`.


In [1]:
## =============== CONFIG — Edit only this cell ===============
# Tickers ("SYMBOL.NS") or free-text company names; each entry is processed independently.
QUERIES = [
    "TCS.NS",  # add more: "HDFCBANK.NS", "Reliance Industries", etc.
]
DAYS = 5  # look-back window in days (used in the Google News query and the age filter)
MAX_ARTICLES = 25  # maximum number of de-duplicated articles scored per query
ENABLE_TELEGRAM = False  # a summary is sent only when this is True AND both values below are set
TELEGRAM_BOT_TOKEN = ""
TELEGRAM_CHAT_ID = ""
OUTPUT_BASE_DIR = "outputs"  # a dated sub-folder (YYYY-MM-DD) is created underneath

# Decision thresholds
MIN_ARTICLES_FOR_STRONG_VIEW = 3  # below this count only LEAN LONG / LEAN SHORT / NEUTRAL are possible
LONG_THRESHOLD = +0.15  # aggregate score at or above this => "LONG"
SHORT_THRESHOLD = -0.15  # aggregate score at or below this => "SHORT"
TAU_DAYS = 3.0  # decay constant of the recency weight exp(-age_days / TAU_DAYS)
# Per-source multipliers; a key applies when it is a substring of the article's domain.
SOURCE_WEIGHTS = {
    "moneycontrol.com": 1.15,
    "economictimes.indiatimes.com": 1.10,
    "livemint.com": 1.05,
    "business-standard.com": 1.05,
    "reuters.com": 1.20,
    "bloomberg.com": 1.20,
    "cnbctv18.com": 1.05,
}
DEFAULT_SOURCE_WEIGHT = 1.0  # multiplier for domains not matched above

# Advanced
FORCE_REFRESH = True   # True makes every cache read miss (fresh results are still written); use it if a run returned 0 articles
MAX_RSS_PER_SOURCE = 60  # cap on entries read from each RSS feed (search feeds and generic feeds alike)
RETRY = 2  # default number of HTTP attempts per URL (total attempts, not extra retries)

# Broad finance/IT feeds (we keyword-filter by alias)
GENERAL_FEEDS = [
    "https://feeds.reuters.com/reuters/INtopNews",
    "https://feeds.reuters.com/reuters/businessNews",
    "https://www.moneycontrol.com/rss/MCtopnews.xml",
    "https://www.livemint.com/rss/companies",
    # Added:
    "https://economictimes.indiatimes.com/industry/it/ites/rssfeeds/13357270.cms",
    "https://www.business-standard.com/rss/companies-101.rss",
    "https://www.cnbctv18.com/rss/latest.xml",
]

# NSE Corporate Announcements fetch (best-effort)
ENABLE_NSE_CORP = True  # when False, fetch_nse_corp returns an empty list without any request


### Install (first run) — optional

In [2]:
# !pip install feedparser requests beautifulsoup4 lxml readability-lxml html5lib numpy pandas python-dateutil yfinance tqdm transformers torch --upgrade


### Imports & helpers

In [3]:
import os, re, math, hashlib, json, time
from datetime import datetime, timedelta, timezone
from dateutil import tz, parser as dateparser
from urllib.parse import quote_plus, urlparse, parse_qs

import pandas as pd
import numpy as np
import requests
import feedparser
from bs4 import BeautifulSoup
from readability import Document
from tqdm import tqdm
import yfinance as yf

# Browser-like User-Agent string sent with outgoing HTTP requests.
UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/122.0.0.0 Safari/537.36")
# Default request headers; used whenever a caller passes no headers of its own.
HDRS = {"User-Agent": UA, "Accept-Language": "en-IN,en;q=0.9"}

def ist_now():
    """Return the current time as a timezone-aware datetime in Asia/Kolkata."""
    kolkata = tz.gettz("Asia/Kolkata")
    return datetime.now(kolkata)

def to_ist(dt: datetime) -> datetime:
    """Convert *dt* to Asia/Kolkata time.

    A naive datetime (no tzinfo) is interpreted as UTC before conversion.
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    kolkata = tz.gettz("Asia/Kolkata")
    return aware.astimezone(kolkata)

def safe_filename(s: str, maxlen=140) -> str:
    """Return *s* reduced to filename-safe characters and cut to *maxlen*.

    Every run of characters outside ``a-z A-Z 0-9 . _ -`` collapses into a
    single underscore; truncation happens after the substitution.
    """
    unsafe_run = re.compile(r"[^a-zA-Z0-9._-]+")
    return unsafe_run.sub("_", s)[:maxlen]

def domain_of(url: str) -> str:
    """Return the lower-cased host of *url*, keeping at most its last three labels.

    Hosts with fewer than three dot-separated labels are returned whole.
    Any failure while parsing yields an empty string.
    """
    try:
        host = urlparse(url).netloc.lower()
        labels = host.split(".")
    except Exception:
        return ""
    if len(labels) < 3:
        return host
    return ".".join(labels[-3:])

# Directory (relative to the working directory) holding one JSON file per cache key.
CACHE_DIR = ".cache_news_agent_v3"
# Created eagerly so the cache helpers can assume it exists.
os.makedirs(CACHE_DIR, exist_ok=True)

def read_cache(key: str):
    """Return the JSON value cached under *key*, or None on a miss.

    A miss is reported when FORCE_REFRESH is set, when no cache file exists
    for the key, or when the file cannot be read or parsed.
    """
    cache_file = os.path.join(CACHE_DIR, f"{safe_filename(key)}.json")
    if FORCE_REFRESH or not os.path.exists(cache_file):
        return None
    try:
        with open(cache_file, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        # An unreadable or corrupt entry is treated as absent.
        return None

def write_cache(key: str, data):
    """Store *data* as JSON under *key* in the cache directory (best-effort).

    Failures (I/O errors, non-serialisable data) are swallowed on purpose:
    caching is an optimisation and must never break a run.
    """
    cache_file = os.path.join(CACHE_DIR, f"{safe_filename(key)}.json")
    try:
        with open(cache_file, "w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False)
    except Exception:
        return None

def get_with_retries(session, url, headers=None, timeout=12, tries=RETRY):
    """GET *url* through *session*, retrying until an HTTP 200 is received.

    Args:
        session: object exposing a requests-style ``get`` method.
        url: address to fetch.
        headers: request headers; HDRS is used when falsy.
        timeout: per-attempt timeout passed to ``session.get``.
        tries: total number of attempts; values below 1 are treated as 1.

    Returns:
        The first response whose status code is 200.

    Raises:
        requests.HTTPError: the last attempt produced a non-200 response.
        RuntimeError: the last attempt raised; the original exception is
            chained as the cause.
    """
    attempts = max(1, tries)
    last = None
    for attempt in range(attempts):
        try:
            r = session.get(url, headers=headers or HDRS, timeout=timeout, allow_redirects=True)
            if r.status_code == 200:
                return r
            last = r
        except Exception as e:
            last = e
        # Pause only between attempts; sleeping after the final failure
        # would just delay the error.
        if attempt < attempts - 1:
            time.sleep(0.4)
    if isinstance(last, requests.Response):
        raise requests.HTTPError(f"HTTP {last.status_code} for {url}")
    cause = last if isinstance(last, BaseException) else None
    raise RuntimeError(f"Failed to GET {url}: {last}") from cause


### Query building & alias expansion

In [4]:
def resolve_company_query(q: str) -> str:
    """Return a company name for *q* when it looks like a ticker, else *q* stripped.

    Ticker-like inputs are looked up through yfinance; ``longName`` is
    preferred over ``shortName``. When the lookup fails or yields no name,
    the stripped input is returned unchanged.
    """
    q = q.strip()
    looks_like_ticker = bool(
        re.search(r"[A-Za-z]{1,5}\.NS$", q) or re.search(r"^[A-Za-z.\-]{1,12}$", q)
    )
    if not looks_like_ticker:
        return q
    try:
        info = yf.Ticker(q).info
        resolved = info.get("longName") or info.get("shortName")
    except Exception:
        # Lookup is best-effort; fall back to the raw query.
        resolved = None
    return resolved if resolved else q

# Curated extra names per NSE symbol, used in addition to whatever the
# yfinance lookup returns (helps when that lookup fails). Keys are upper-case
# symbols without the ".NS" suffix.
_KNOWN_SYMBOL_ALIASES = {
    "TCS": ("Tata Consultancy Services",),
}


def expand_aliases(q: str):
    """Return the search aliases for a ticker or company name.

    For ``SYMBOL.NS`` tickers the aliases are: the ticker itself, the bare
    symbol, the resolved company name, that name without the word
    "Limited", and any curated names registered for the symbol.
    For free-text names of two to five words, the upper-cased initials are
    added as an alias.

    The curated TCS names are applied only to the TCS symbol; adding them to
    every ``.NS`` ticker would make other tickers pull in TCS news.

    Returns:
        A list of unique, non-empty aliases in the order they were derived
        (the original query first).
    """
    q = q.strip()
    ordered = {}  # dict keys give de-duplication with stable insertion order

    def _add(alias):
        if alias:
            ordered.setdefault(alias, None)

    _add(q)
    m = re.match(r"^([A-Za-z0-9]+)\.NS$", q)
    if m:
        symbol = m.group(1)
        _add(symbol)
        ln = resolve_company_query(q)
        _add(ln)
        _add(re.sub(r"\b[Ll]imited\b", "", ln).strip())
        for extra in _KNOWN_SYMBOL_ALIASES.get(symbol.upper(), ()):
            _add(extra)
    else:
        words = [w for w in re.split(r"\W+", q) if w]
        if 2 <= len(words) <= 5:
            initials = ''.join(w[0].upper() for w in words)
            if len(initials) >= 2:
                _add(initials)
    return list(ordered)


### RSS builders (Google, Bing, Site feeds)

In [5]:
def google_news_rss_urls(company: str, days: int):
    """Return the Google News RSS search URL(s) for *company* over the last *days* days.

    The result is a one-element list; the query is URL-encoded and limited
    with a ``when:<days>d`` term, using the India/English edition parameters.
    """
    encoded = quote_plus(company)
    feed_url = f"https://news.google.com/rss/search?q={encoded}+when:{days}d&hl=en-IN&gl=IN&ceid=IN:en"
    return [feed_url]

def bing_news_rss_urls(company: str):
    """Return the Bing News RSS search URL(s) for *company*.

    The result is a one-element list with the query URL-encoded and the
    sort-by-date filter applied.
    """
    encoded = quote_plus(company)
    feed_url = f"https://www.bing.com/news/search?q={encoded}&qft=sortbydate%3d%221%22&form=RSSMHL&format=RSS"
    return [feed_url]

# Generic feed URLs scanned by the gathering and debug code. Starts empty and
# is replaced with a copy of GENERAL_FEEDS from CONFIG in the run cell.
GENERAL_FEEDS_LOCAL = []  # wired from CONFIG later


### Fetch & clean helpers (improved)

In [6]:
def normalize_gnews_link(link: str) -> str:
    """Return the target article URL for a Google News redirect link.

    When *link* points at a ``news.google.`` host and carries a non-empty
    ``url`` query parameter, that parameter's first value is returned.
    Every other link is returned unchanged.
    """
    if 'news.google.' not in link:
        return link
    try:
        targets = parse_qs(urlparse(link).query).get('url')
        if targets:
            return targets[0]
    except Exception:
        pass
    return link

def fetch_rss_entries(url: str):
    """Fetch an RSS/Atom feed and return its entries as plain dicts.

    The feed body is downloaded with the shared browser-like headers, a
    timeout and retries, then handed to feedparser. Letting feedparser fetch
    the URL itself sends its own User-Agent and applies no timeout, so that
    path is kept only as a fallback when the explicit download fails.

    Args:
        url: feed address.

    Returns:
        A list of at most MAX_RSS_PER_SOURCE dicts with the keys ``title``,
        ``summary``, ``link`` and ``published``. A non-empty cached list is
        returned as-is; only non-empty results are written to the cache.
    """
    key = f"rss::{url}"
    cached = read_cache(key)
    if isinstance(cached, list) and len(cached) > 0:
        return cached
    try:
        with requests.Session() as sess:
            resp = get_with_retries(sess, url, headers=HDRS, timeout=12)
            feed = feedparser.parse(resp.content)
    except Exception:
        # Fallback: previous behaviour, feedparser performs the request.
        feed = feedparser.parse(url)
    entries = []
    for e in feed.entries[:MAX_RSS_PER_SOURCE]:
        link = e.get("link") or ""
        if not link:
            # No primary link: use the first HTML alternate that has an href.
            for alt in e.get("links") or []:
                if alt.get("type", "").startswith("text/html") and alt.get("href"):
                    link = alt["href"]
                    break
        entries.append({
            "title": e.get("title", ""),
            "summary": e.get("summary", ""),
            "link": normalize_gnews_link(link),
            "published": e.get("published") or e.get("updated") or "",
        })
    if entries:
        write_cache(key, entries)
    return entries

def parse_pubdate(published: str) -> datetime:
    """Parse a publication timestamp string into an Asia/Kolkata datetime.

    Falls back to the current IST time when the string cannot be parsed or
    parsing yields nothing.
    """
    try:
        parsed = dateparser.parse(published)
        if parsed:
            return to_ist(parsed)
    except Exception:
        pass
    return ist_now()

def fetch_article_text(url: str, timeout=12):
    """Download an article page and return ``(text, title)``.

    The main content is extracted with readability; when that yields fewer
    than 400 characters, the text of the whole page (minus script, style and
    noscript tags) is used if it is longer. Successful results are cached.

    Args:
        url: article address. An empty or missing URL returns ``("", "")``
            immediately instead of spending retries on a request that
            cannot succeed.
        timeout: per-attempt HTTP timeout.

    Returns:
        A ``(text, title)`` tuple; ``("", "")`` when the page cannot be
        fetched or parsed.
    """
    if not url:
        return "", ""
    key = f"page::{hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]}"
    cached = read_cache(key)
    if cached:
        return cached.get("text", ""), cached.get("title", "")
    try:
        # Context manager closes the session's pooled connections.
        with requests.Session() as sess:
            r = get_with_retries(sess, url, headers=HDRS, timeout=timeout)
            html = r.text
        doc = Document(html)
        cleaned_html = doc.summary()
        title = doc.short_title() or ""
        soup = BeautifulSoup(cleaned_html, "lxml")
        text = soup.get_text("\n", strip=True)
        if len(text) < 400:
            # Extraction came back thin: fall back to the full page text.
            soup_full = BeautifulSoup(html, "lxml")
            for tag in soup_full(["script", "style", "noscript"]):
                tag.decompose()
            text2 = soup_full.get_text("\n", strip=True)
            if len(text2) > len(text):
                text = text2
        write_cache(key, {"text": text, "title": title})
        return text, title
    except Exception:
        return "", ""


### Optional: NSE Corporate Announcements (JSON)

In [7]:
def fetch_nse_corp(symbol: str, days: int):
    """Fetch recent NSE corporate announcements for *symbol* (best-effort).

    Args:
        symbol: NSE symbol without the ".NS" suffix.
        days: keep announcements whose age in whole days is at most this.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``link`` and
        ``published`` (ISO-8601 string). Returns ``[]`` when
        ENABLE_NSE_CORP is off or when anything fails.
    """
    if not ENABLE_NSE_CORP:
        return []
    try:
        # Context manager closes the session's pooled connections.
        with requests.Session() as sess:
            # Visit the home page first so the session picks up its cookies
            # before the API call; the response itself is not needed.
            get_with_retries(sess, "https://www.nseindia.com/")
            hdrs = {**HDRS, "Referer": "https://www.nseindia.com/"}
            url = ("https://www.nseindia.com/api/corporate-announcements"
                   f"?index=equities&symbol={quote_plus(symbol)}")
            r = get_with_retries(sess, url, headers=hdrs, timeout=12)
            j = r.json()
        # Accept a bare JSON list as well as a {"data": [...]} wrapper; a list
        # payload would otherwise raise below and silently yield nothing.
        if isinstance(j, dict):
            data_rows = j.get('data', [])
        elif isinstance(j, list):
            data_rows = j
        else:
            data_rows = []
        out = []
        now = ist_now()
        for row in data_rows:
            if not isinstance(row, dict):
                continue
            dt_str = row.get('sm_dt') or row.get('dt') or row.get('dissemDT') or ''
            pub = parse_pubdate(dt_str)
            if (now - pub).days <= days:
                title = row.get('sm_desc') or row.get('desc') or row.get('subject') or 'NSE Corporate Announcement'
                link = row.get('attchmntFile') or row.get('pdf') or ''
                if link and not link.startswith('http'):
                    link = 'https://nsearchives.nseindia.com' + link
                out.append({"title": title, "summary": row.get('more',''), "link": link, "published": pub.isoformat()})
        return out
    except Exception:
        return []


### Sentiment, aggregation & decision

In [8]:
_pipe = None
def load_model():
    """Return the shared FinBERT text-classification pipeline, building it on first use.

    The pipeline is stored in the module-level ``_pipe`` so later calls reuse it.
    """
    global _pipe
    if _pipe is not None:
        return _pipe
    # Imported lazily so the notebook can be loaded without transformers.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    _pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
        truncation=True,
    )
    return _pipe

def score_sentiment_finbert(text: str):
    """Score *text* with FinBERT and return the class probabilities.

    Args:
        text: text to classify; only the first 4000 characters are sent to
            the model.

    Returns:
        A dict with the keys ``positive``, ``neutral`` and ``negative``.
        Empty text yields a fully neutral result without loading the model.
    """
    # Check for empty input first so it never triggers the model load.
    if not text:
        return {"positive": 0.0, "neutral": 1.0, "negative": 0.0}
    pipe = load_model()
    res = pipe(text[:4000])
    scores = res[0]  # the list of per-label scores for the single input
    out = {"positive": 0.0, "neutral": 0.0, "negative": 0.0}
    for s in scores:
        out[s["label"].lower()] = float(s["score"])
    return out

def article_base_score(sentiment: dict) -> float:
    """Return positive minus negative probability; a missing key counts as 0.0."""
    pos = sentiment.get("positive", 0.0)
    neg = sentiment.get("negative", 0.0)
    return float(pos - neg)

def recency_weight(pub_dt: datetime, now_ist: datetime, tau_days: float) -> float:
    """Return the exponential-decay weight ``exp(-age_days / tau_days)``.

    The age is the time from *pub_dt* to *now_ist* in days; publication
    times in the future are clamped to age zero (weight 1.0).
    """
    elapsed_days = (now_ist - pub_dt).total_seconds() / 86400.0
    age_days = max(elapsed_days, 0.0)
    return math.exp(-age_days / tau_days)

def source_weight(url: str) -> float:
    """Return the credibility multiplier for the source of *url*.

    The first SOURCE_WEIGHTS key that occurs as a substring of the URL's
    domain wins; DEFAULT_SOURCE_WEIGHT is used when none matches.
    """
    host = domain_of(url)
    matching = (weight for known, weight in SOURCE_WEIGHTS.items() if known in host)
    return next(matching, DEFAULT_SOURCE_WEIGHT)

def aggregate_scores(rows: pd.DataFrame):
    """Combine per-article scores into ``(aggregate_score, confidence)``.

    The aggregate is the mean of ``base_score`` weighted by
    ``recency_w * source_w``. Confidence grows with the article count
    (``log1p(n) / 3``), shrinks with the population standard deviation of
    the base scores, and is clamped to the range 0..1.
    An empty frame yields ``(0.0, 0.0)``.
    """
    if rows.empty:
        return 0.0, 0.0
    base = rows["base_score"]
    combined_w = rows["recency_w"] * rows["source_w"]
    # The small epsilon guards against a zero weight sum.
    weighted_mean = (base * combined_w).sum() / (combined_w.sum() + 1e-9)
    count = len(rows)
    spread = float(np.std(base)) if count > 1 else 0.0
    size_term = math.log1p(count) / 3.0
    agreement_term = 1.0 - min(1.0, spread)
    confidence = max(0.0, min(1.0, size_term * agreement_term))
    return float(weighted_mean), float(confidence)

def final_bias(agg_score: float, n_articles: int) -> str:
    """Translate an aggregate score and article count into a bias label.

    "LONG" and "SHORT" require at least MIN_ARTICLES_FOR_STRONG_VIEW
    articles and a score at or beyond the configured threshold. Otherwise a
    score above 0.05 is "LEAN LONG", below -0.05 is "LEAN SHORT", and
    anything else is "NEUTRAL".
    """
    enough_articles = n_articles >= MIN_ARTICLES_FOR_STRONG_VIEW
    if enough_articles and agg_score >= LONG_THRESHOLD:
        return "LONG"
    if enough_articles and agg_score <= SHORT_THRESHOLD:
        return "SHORT"
    if agg_score > 0.05:
        return "LEAN LONG"
    return "LEAN SHORT" if agg_score < -0.05 else "NEUTRAL"


### Telegram (optional)

In [9]:
def send_telegram(message: str, bot_token: str, chat_id: str):
    """Post *message* to a Telegram chat through the Bot API.

    Returns False without any request when the token or chat id is empty,
    and False when the request raises; otherwise the response's ``ok`` flag.
    The message is sent with HTML parse mode and link previews disabled.
    """
    if not bot_token or not chat_id:
        return False
    try:
        endpoint = f"https://api.telegram.org/bot{bot_token}/sendMessage"
        payload = {
            "chat_id": chat_id,
            "text": message,
            "parse_mode": "HTML",
            "disable_web_page_preview": True,
        }
        return requests.post(endpoint, data=payload, timeout=10).ok
    except Exception:
        return False


### Gathering pipeline with broader fallbacks

In [10]:
def _entries_from_rss(rss_url: str):
    """Return the entries of *rss_url* as a list, or ``[]`` on any failure.

    A non-list result from the fetch is also mapped to ``[]``.
    """
    try:
        fetched = fetch_rss_entries(rss_url)
    except Exception:
        return []
    if isinstance(fetched, list):
        return fetched
    return []

def _alias_match(title: str, summary: str, aliases):
    """Return True when any alias occurs as a whole word/phrase in title or summary.

    Matching is case-insensitive. Aliases must not be preceded or followed
    by a word character, so a short alias such as "RI" or "ITC" no longer
    matches inside unrelated words like "price" or "switch".

    Args:
        title: entry title; None is treated as empty.
        summary: entry summary; None is treated as empty.
        aliases: iterable of alias strings; empty or None items are skipped.
    """
    haystacks = [(title or '').lower(), (summary or '').lower()]
    for a in aliases:
        aa = (a or '').strip().lower()
        if not aa:
            continue
        # Look-arounds instead of \b so aliases that start or end with
        # punctuation (e.g. "M&M") still match.
        pattern = r"(?<!\w)" + re.escape(aa) + r"(?!\w)"
        if any(re.search(pattern, h) for h in haystacks):
            return True
    return False

def gather_news_any(company_or_ticker: str, days: int, max_articles: int = 25):
    """Collect, filter and de-duplicate recent news items for one query.

    Sources are consulted in this order: Google News for every alias; Bing
    News when fewer than 5 items were found; NSE corporate announcements for
    ``SYMBOL.NS`` tickers; and the generic feeds (alias-filtered on title or
    summary) when fewer than 6 items have been collected so far.

    Items older than *days* whole days are dropped, duplicates sharing the
    same normalised title and domain keep the first occurrence, and the
    newest *max_articles* items are returned.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``url`` and
        ``published_dt``, newest first.
    """
    aliases = expand_aliases(company_or_ticker)

    # Google News
    collected = [
        entry
        for alias in aliases
        for feed_url in google_news_rss_urls(alias, days)
        for entry in _entries_from_rss(feed_url)
    ]

    # Bing News
    if len(collected) < 5:
        collected += [
            entry
            for alias in aliases
            for feed_url in bing_news_rss_urls(alias)
            for entry in _entries_from_rss(feed_url)
        ]

    # NSE Corporate Announcements (if symbol like TCS.NS)
    ticker = re.match(r"^([A-Za-z0-9]+)\.NS$", company_or_ticker.strip())
    if ticker:
        collected.extend(fetch_nse_corp(ticker.group(1), days))

    # Generic feeds (keyword filter: title OR summary)
    if len(collected) < 6:
        for feed_url in GENERAL_FEEDS_LOCAL:
            collected.extend(
                entry
                for entry in _entries_from_rss(feed_url)
                if _alias_match(entry.get('title'), entry.get('summary'), aliases)
            )

    now = ist_now()
    unique = {}
    for raw in collected:
        published_dt = parse_pubdate(raw.get('published','')) or now
        if (now - published_dt).days > days:
            continue
        article = {
            "title": raw.get('title','').strip(),
            "summary": raw.get('summary',''),
            "url": normalize_gnews_link(raw.get('link','')),
            "published_dt": published_dt,
        }
        # De-dup key: (whitespace-normalised lower-case title, domain); first one wins.
        fingerprint = (
            re.sub(r"\s+", " ", article["title"].lower()).strip(),
            domain_of(article["url"]),
        )
        unique.setdefault(fingerprint, article)

    newest_first = sorted(unique.values(), key=lambda art: art['published_dt'], reverse=True)
    return newest_first[:max_articles]


### Run the agent & export CSVs + daily ledger

In [11]:
from IPython.display import display

def run_agent(query: str, days: int, max_articles: int):
    """Gather, score and aggregate news sentiment for one query.

    Each gathered article is downloaded and scored with FinBERT (on the
    first 4000 characters of its text, or on its title when no text was
    extracted), then weighted by recency and by source.

    Returns:
        ``(result, df)`` where *result* is a summary dict (query, resolved
        name, window, article count, aggregate score, confidence, bias) and
        *df* holds one row per scored article.
    """
    columns = [
        "published_ist", "domain", "title", "url",
        "sent_pos", "sent_neu", "sent_neg", "base_score", "recency_w", "source_w", "weighted",
    ]
    articles = gather_news_any(query, days, max_articles)
    now = ist_now()
    records = []
    for art in tqdm(articles, desc=f"Scoring news for {query}"):
        body, page_title = fetch_article_text(art["url"])
        headline = page_title or art["title"] or "(no title)"
        to_score = body[:4000] if body else headline
        sentiment = score_sentiment_finbert(to_score)
        base = article_base_score(sentiment)
        recency = recency_weight(art["published_dt"], now, TAU_DAYS)
        credibility = source_weight(art["url"])
        records.append({
            "published_ist": art["published_dt"].strftime("%Y-%m-%d %H:%M"),
            "domain": domain_of(art["url"]),
            "title": headline[:200],
            "url": art["url"],
            "sent_pos": round(sentiment["positive"], 3),
            "sent_neu": round(sentiment["neutral"], 3),
            "sent_neg": round(sentiment["negative"], 3),
            "base_score": round(base, 3),
            "recency_w": round(recency, 3),
            "source_w": round(credibility, 2),
            "weighted": round(base * recency * credibility, 3),
        })
    df = pd.DataFrame(records, columns=columns)
    scoring_input = pd.DataFrame() if df.empty else df
    agg, conf = aggregate_scores(scoring_input)
    n_articles = len(df)
    result = {
        "query": query,
        "resolved_query": resolve_company_query(query),
        "days": days,
        "articles": int(n_articles),
        "agg_score": float(round(agg, 3)),
        "confidence": float(round(conf, 3)),
        "bias": final_bias(agg, n_articles),
    }
    return result, df

# Wire GENERAL_FEEDS from CONFIG
# (a copy, so later edits to either list do not affect the other)
GENERAL_FEEDS_LOCAL = GENERAL_FEEDS.copy()

# All outputs for this run go under OUTPUT_BASE_DIR/<today's IST date>/.
today = ist_now().strftime("%Y-%m-%d")
out_dir = os.path.join(OUTPUT_BASE_DIR, today)
os.makedirs(out_dir, exist_ok=True)

ledger_path = os.path.join(out_dir, "bias_ledger.csv")
ledger_rows = []
all_results = []
# Process each configured query: score it, show the top rows, save its CSV,
# and record one ledger row.
for q in QUERIES:
    result, df = run_agent(q, DAYS, MAX_ARTICLES)
    all_results.append(result)
    print("\n=== Result ===")
    print(result)
    # The per-query CSV is written only when at least one article was scored.
    if not df.empty:
        display(df.sort_values("weighted", ascending=False).head(10))
        fname = f"{safe_filename(result['resolved_query']).lower()}_articles.csv"
        csv_path = os.path.join(out_dir, fname)
        df.to_csv(csv_path, index=False)
        print(f"Saved articles CSV: {csv_path}")
    ledger_rows.append({
        "run_date": today,
        "query": result["query"],
        "resolved_query": result["resolved_query"],
        "days_window": result["days"],
        "articles": result["articles"],
        "agg_score": result["agg_score"],
        "confidence": result["confidence"],
        "bias": result["bias"],
    })

ledger_df = pd.DataFrame(ledger_rows, columns=[
    "run_date","query","resolved_query","days_window","articles","agg_score","confidence","bias"
])
# Append to an existing ledger for the same day rather than overwriting it;
# re-running the cell on one day therefore adds further rows for the same queries.
if os.path.exists(ledger_path):
    existing = pd.read_csv(ledger_path)
    ledger_df = pd.concat([existing, ledger_df], ignore_index=True)
ledger_df.to_csv(ledger_path, index=False)
print(f"Ledger updated: {ledger_path}")

# Optional Telegram summary: one HTML-formatted line per query.
if ENABLE_TELEGRAM and TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
    lines = ["<b>News Sentiment Summary</b>"]
    for r in all_results:
        lines.append(f"<b>{r['resolved_query']}</b>: Articles {r['articles']}, Agg {r['agg_score']}, Conf {r['confidence']} — <b>{r['bias']}</b>")
    _ok = send_telegram("\n".join(lines), TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID)
    print("Telegram sent:", _ok)


Scoring news for TCS.NS: 0it [00:00, ?it/s]


=== Result ===
{'query': 'TCS.NS', 'resolved_query': 'Tata Consultancy Services Limited', 'days': 5, 'articles': 0, 'agg_score': 0.0, 'confidence': 0.0, 'bias': 'NEUTRAL'}
Ledger updated: outputs/2025-09-28/bias_ledger.csv





### Debug cell — how many items each feed returned (first 3 titles)

In [12]:
def debug_sources(q: str, days: int):
    """Report how many entries each feed returns for query *q*.

    Every Google News and Bing News URL built from the query's aliases is
    fetched, followed by every generic feed.

    Returns:
        ``(counts, samples)``: two dicts keyed by feed URL, holding the
        entry count and the titles of the first three entries.
    """
    aliases = expand_aliases(q)

    # Same order as the gathering pipeline: Google, then Bing, then generic.
    feed_urls = []
    for alias in aliases:
        feed_urls.extend(google_news_rss_urls(alias, days))
    for alias in aliases:
        feed_urls.extend(bing_news_rss_urls(alias))
    feed_urls.extend(GENERAL_FEEDS_LOCAL)

    counts, samples = {}, {}
    for feed_url in feed_urls:
        fetched = fetch_rss_entries(feed_url)
        counts[feed_url] = len(fetched)
        samples[feed_url] = [entry.get('title') for entry in fetched[:3]]
    return counts, samples

# Print, for every configured query, the entry count per feed URL followed by
# up to three sample titles from each feed.
for q in QUERIES:
    c, s = debug_sources(q, DAYS)
    print(f"\n[Debug counts for {q}]\n")
    for k,v in c.items():
        print(v, k)  # "<count> <feed url>"
    print("\nSample titles:")
    for k, arr in s.items():
        print("-", k)
        for t in arr:
            print("   *", t)



[Debug counts for TCS.NS]

0 https://news.google.com/rss/search?q=TCS.NS+when:5d&hl=en-IN&gl=IN&ceid=IN:en
0 https://news.google.com/rss/search?q=Tata+Consultancy+Services+Limited+when:5d&hl=en-IN&gl=IN&ceid=IN:en
0 https://news.google.com/rss/search?q=Tata+Consultancy+Services+when:5d&hl=en-IN&gl=IN&ceid=IN:en
0 https://news.google.com/rss/search?q=TCS+when:5d&hl=en-IN&gl=IN&ceid=IN:en
0 https://www.bing.com/news/search?q=TCS.NS&qft=sortbydate%3d%221%22&form=RSSMHL&format=RSS
0 https://www.bing.com/news/search?q=Tata+Consultancy+Services+Limited&qft=sortbydate%3d%221%22&form=RSSMHL&format=RSS
0 https://www.bing.com/news/search?q=Tata+Consultancy+Services&qft=sortbydate%3d%221%22&form=RSSMHL&format=RSS
0 https://www.bing.com/news/search?q=TCS&qft=sortbydate%3d%221%22&form=RSSMHL&format=RSS
0 https://feeds.reuters.com/reuters/INtopNews
0 https://feeds.reuters.com/reuters/businessNews
0 https://www.moneycontrol.com/rss/MCtopnews.xml
0 https://www.livemint.com/rss/companies
0 https://eco