In [1]:
import argparse
import hashlib
import io
import json
import os
import sys
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import numpy as np
import requests
import yfinance as yf
import feedparser
import tldextract
import yaml

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm
import dateparser

In [2]:
# -------- Paths --------
BASE_DIR = 'C:\\Users\\harsh\\OneDrive\\Desktop\\LLM Capstone\\Data Collection'
DATA_DIR = os.path.join(BASE_DIR, "data")
PRICES_DIR = os.path.join(DATA_DIR, "prices")
MACRO_DIR = os.path.join(DATA_DIR, "macro")
NEWS_DIR = os.path.join(DATA_DIR, "news")
CONFIGS_DIR = os.path.join(BASE_DIR, "configs")

os.makedirs(PRICES_DIR, exist_ok=True)
os.makedirs(MACRO_DIR, exist_ok=True)
os.makedirs(NEWS_DIR, exist_ok=True)

In [3]:
# -------- Env --------
load_dotenv()
API_NINJAS_KEY = os.getenv("API_NINJAS_KEY", "").strip()
DATA_GOV_IN_KEY = os.getenv("DATA_GOV_IN_KEY", "").strip()

In [4]:
# -------- Helpers --------
def read_yaml(path):
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def save_parquet(df: pd.DataFrame, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_parquet(path, index=False)

def append_jsonl(records, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def read_jsonl(path: str):
    if not os.path.exists(path):
        return []
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                out.append(json.loads(line))
    return out

def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def parse_pub_date(dt_str):
    if not dt_str:
        return None
    # dateparser handles a lot of RSS formats
    dt = dateparser.parse(dt_str)
    if dt is None:
        return None
    # make ISO with Z
    return dt.astimezone(timezone.utc).replace(tzinfo=timezone.utc).isoformat().replace("+00:00", "Z")

In [5]:
TICKERS = ["INFY.NS", "RELIANCE.NS", "HDFCBANK.NS", "TCS.NS", "ICICIBANK.NS"]

for symbol in TICKERS:
    try:
        df = yf.Ticker(symbol).history(period="3y", interval="1d", auto_adjust=False)
        if df.empty:
            df = yf.Ticker(symbol).history(period="max", interval="1d", auto_adjust=False)
        if df.empty:
            print(f"[prices] {symbol}: no data, skipping")
            continue

        df = df.rename_axis("date").reset_index()
        df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

        # standardize names
        df = df.rename(columns={
            "Open":"open","High":"high","Low":"low","Close":"close",
            "Adj Close":"adj_close","Volume":"volume"
        })
        if "adj_close" not in df.columns:
            df["adj_close"] = df["close"]

        df["symbol"] = symbol
        out_path = os.path.join(PRICES_DIR, f"{symbol.replace('.','_')}.parquet")
        df.to_parquet(out_path, index=False)
        print(f"[prices] {symbol}: saved {len(df)} rows")
    except Exception as e:
        print(f"[prices] {symbol}: ERROR → {e}")

[prices] INFY.NS: saved 744 rows
[prices] RELIANCE.NS: saved 744 rows
[prices] HDFCBANK.NS: saved 744 rows
[prices] TCS.NS: saved 744 rows
[prices] ICICIBANK.NS: saved 744 rows


In [12]:
DATA_DIR = os.path.join(os.getcwd(), "data", "api_ninjas")
os.makedirs(DATA_DIR, exist_ok=True)

HEADERS = {"X-Api-Key": API_NINJAS_KEY}

NSE_TICKERS = ["INFY", "RELIANCE", "HDFCBANK", "TCS", "ICICIBANK"]

def fetch_stockprice(symbols=NSE_TICKERS):
    """Fetch current price using /v1/stockprice."""
    out_path = os.path.join(DATA_DIR, "stockprice.csv")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("ticker,name,price,exchange,currency,updated\n")
        for s in symbols:
            url = f"https://api.api-ninjas.com/v1/stockprice?ticker={s}"
            try:
                r = requests.get(url, headers=HEADERS, timeout=15)
                if r.status_code != 200:
                    print(f"[stockprice] {s}: status {r.status_code}")
                    continue
                data = r.json()
                # data example: { "ticker": "AAPL", "name": "Apple Inc.", ... }
                ticker = data.get("ticker", "")
                name = data.get("name", "")
                price = data.get("price", "")
                exchange = data.get("exchange", "")
                currency = data.get("currency", "")
                updated = data.get("updated", "")
                f.write(f"{ticker},{name},{price},{exchange},{currency},{updated}\n")
                print(f"[stockprice] {s} → price {price}")
            except Exception as e:
                print(f"[stockprice] {s}: ERROR → {e}")

def fetch_marketcap(symbols=NSE_TICKERS):
    """Fetch market cap using /v1/marketcap."""
    out_path = os.path.join(DATA_DIR, "marketcap.csv")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("ticker,name,market_cap,updated\n")
        for s in symbols:
            url = f"https://api.api-ninjas.com/v1/marketcap?ticker={s}"
            try:
                r = requests.get(url, headers=HEADERS, timeout=15)
                if r.status_code != 200:
                    print(f"[marketcap] {s}: status {r.status_code}")
                    continue
                data = r.json()
                ticker = data.get("ticker", "")
                name = data.get("name", "")
                mc = data.get("market_cap", "")
                updated = data.get("updated", "")
                f.write(f"{ticker},{name},{mc},{updated}\n")
                print(f"[marketcap] {s} → market cap {mc}")
            except Exception as e:
                print(f"[marketcap] {s}: ERROR → {e}")

def fetch_crypto(symbols=["BTCUSD","ETHUSD","DOGEUSD"]):
    """Fetch crypto prices using /v1/cryptoprice."""
    out_path = os.path.join(DATA_DIR, "cryptoprice.csv")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("symbol,price,timestamp\n")
        for sym in symbols:
            url = f"https://api.api-ninjas.com/v1/cryptoprice?symbol={sym}"
            try:
                r = requests.get(url, headers=HEADERS, timeout=15)
                if r.status_code != 200:
                    print(f"[cryptoprice] {sym}: status {r.status_code}")
                    continue
                data = r.json()
                price = data.get("price", "")
                ts = data.get("timestamp", "")
                f.write(f"{sym},{price},{ts}\n")
                print(f"[cryptoprice] {sym} → {price}")
            except Exception as e:
                print(f"[cryptoprice] {sym}: ERROR → {e}")

In [14]:
fetch_stockprice()
fetch_marketcap()
fetch_crypto()

[stockprice] INFY → price 16.96
[stockprice] RELIANCE: ERROR → 'list' object has no attribute 'get'
[stockprice] HDFCBANK: ERROR → 'list' object has no attribute 'get'
[stockprice] TCS → price 2.65
[stockprice] ICICIBANK: ERROR → 'list' object has no attribute 'get'
[marketcap] INFY → market cap 70304118129
[marketcap] RELIANCE: ERROR → 'list' object has no attribute 'get'
[marketcap] HDFCBANK: ERROR → 'list' object has no attribute 'get'
[marketcap] TCS → market cap 9145813
[marketcap] ICICIBANK: ERROR → 'list' object has no attribute 'get'
[cryptoprice] BTCUSD → 115398.52000000
[cryptoprice] ETHUSD → 4640.00000000
[cryptoprice] DOGEUSD → 0.28564000


In [21]:
# -------- Macro: CPI via data.gov.in --------
def fetch_cpi():
    if not DATA_GOV_IN_KEY:
        print("[macro:cpi] Missing DATA_GOV_IN_KEY; skipping.")
        return

    # This is the resource ID from your notes; adjust if you switch datasets
    resource_id = "352b3616-9d3d-42e5-80af-7d21a2a53fab"
    base = "https://api.data.gov.in/resource/{rid}"
    limit = 100
    offset = 0
    out_path = os.path.join(MACRO_DIR, "cpi.jsonl")
    old = read_jsonl(out_path)
    # Use (financial_year, cpi_c_inflation_) as dedup keys OR (date if present)
    seen_keys = set()
    for o in old:
        key = (o.get("financial_year"), o.get("cpi_c_inflation_"))
        seen_keys.add(key)

    appended = 0
    while True:
        params = {
            "api-key": DATA_GOV_IN_KEY,
            "format": "json",
            "limit": limit,
            "offset": offset
        }
        try:
            resp = requests.get(base.format(rid=resource_id), params=params, timeout=30)
            resp.raise_for_status()
            payload = resp.json()
            rows = payload.get("records", []) or payload.get("data", [])
        except Exception as e:
            print(f"[macro:cpi] Error at offset {offset}: {e}")
            break

        if not rows:
            break

        batch = []
        for r in rows:
            # Normalize field names defensively
            fy = r.get("financial_year") or r.get("financial_year_")
            infl = r.get("cpi_c_inflation_") or r.get("cpi_c_inflation")
            rec = {
                "financial_year": fy,
                "cpi_c_inflation_": try_float(infl),
                "source": "data.gov.in",
                "country": "India",
                "fetched_at": datetime.utcnow().isoformat() + "Z"
            }
            key = (rec["financial_year"], rec["cpi_c_inflation_"])
            if key not in seen_keys:
                seen_keys.add(key)
                batch.append(rec)

        if batch:
            append_jsonl(batch, out_path)
            appended += len(batch)

        offset += limit
        time.sleep(0.2)

    print(f"[macro:cpi] Appended {appended} new rows. Done.")

def try_float(x):
    try:
        return float(str(x).replace(",", "").strip())
    except Exception:
        return None

In [22]:
def fetch_cpi_monthly():
    if not DATA_GOV_IN_KEY:
        print("[macro:cpi] Missing DATA_GOV_IN_KEY")
        return
    
    # Monthly CPI dataset (Combined, Rural, Urban)
    resource_id = "9a10e07c-79f4-4db4-8c7f-fb6e4d9b5f49"
    base_url = f"https://api.data.gov.in/resource/{resource_id}"
    
    out_path = os.path.join(MACRO_DIR, "cpi_monthly.csv")
    rows_all = []
    offset, limit = 0, 100
    
    while True:
        params = {
            "api-key": DATA_GOV_IN_KEY,
            "format": "json",
            "limit": limit,
            "offset": offset
        }
        try:
            r = requests.get(base_url, params=params, timeout=30)
            r.raise_for_status()
            payload = r.json()
            print(payload)
            rows = payload.get("records") or []
        except Exception as e:
            print("[macro:cpi] ERROR →", e)
            break
        
        if not rows:
            break
        
        for row in rows:
            # row contains: year, month, cpi_combined, cpi_rural, cpi_urban, etc.
            year = row.get("year")
            month = row.get("month")
            combined = row.get("cpi_combined")
            rural = row.get("cpi_rural")
            urban = row.get("cpi_urban")
            rows_all.append((year, month, combined, rural, urban))
        
        offset += limit
    
    if rows_all:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("year,month,cpi_combined,cpi_rural,cpi_urban\n")
            for y, m, c, r, u in rows_all:
                f.write(f"{y},{m},{c},{r},{u}\n")
        print(f"[macro:cpi] Saved {len(rows_all)} rows → {out_path}")
    else:
        print("[macro:cpi] No data fetched")

In [17]:
fetch_cpi()

  "fetched_at": datetime.utcnow().isoformat() + "Z"


[macro:cpi] Appended 6 new rows. Done.


In [23]:
fetch_cpi_monthly()

{'message': 'Meta not found', 'version': '2.2.0', 'status': 'error', 'total': 0, 'count': 0, 'limit': '100', 'offset': '0', 'field': [], 'records': []}
[macro:cpi] No data fetched


In [8]:
import csv

OUT_CSV = os.path.join(NEWS_DIR, "google_news.csv")

def build_google_news_rss(query: str, hl="en-IN", gl="IN", ceid="IN:en") -> str:
    """
    Build a Google News RSS search URL for a given query.
    Example: query="RBI repo rate" → Google News headlines focused on India.
    """
    from urllib.parse import quote_plus
    q = quote_plus(query)
    return f"https://news.google.com/rss/search?q={q}&hl={hl}&gl={gl}&ceid={ceid}"

def _read_existing_links(path: str) -> set:
    if not os.path.exists(path):
        return set()
    seen = set()
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            link = (row.get("link") or "").strip()
            if link:
                seen.add(link)
    return seen

def _ts_to_iso(entry) -> str:
    # feedparser often provides published_parsed (time.struct_time)
    ts = entry.get("published_parsed") or entry.get("updated_parsed")
    if ts:
        return datetime(*ts[:6]).isoformat()
    # fallback to raw published string
    return (entry.get("published") or entry.get("updated") or "").strip()

def fetch_google_news(
    queries,
    per_query_limit: int = 50,
    sleep_between: float = 0.3,
    out_csv: str = OUT_CSV
):
    """
    Fetch Google News RSS for a list of queries and append to CSV.
    Dedup strictly by 'link'.
    """
    # Prepare CSV (create with header if not exists)
    file_exists = os.path.exists(out_csv)
    if not file_exists:
        with open(out_csv, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["title", "link", "source_feed", "published", "fetched_at", "query"])

    seen_links = _read_existing_links(out_csv)
    added = 0

    with open(out_csv, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)

        for q in queries:
            feed_url = build_google_news_rss(q)
            feed = feedparser.parse(feed_url)
            entries = feed.entries[:per_query_limit] if getattr(feed, "entries", None) else []

            for e in entries:
                title = (e.get("title") or "").strip()
                link = (e.get("link") or "").strip()
                if not link or link in seen_links:
                    continue

                published_iso = _ts_to_iso(e)
                fetched_at = datetime.now().isoformat()
                writer.writerow([title, link, feed_url, published_iso, fetched_at, q])
                seen_links.add(link)
                added += 1

            time.sleep(sleep_between)  # be polite

    print(f"[news] Added {added} new rows → {out_csv}")

# ---- Example usage ----
QUERIES = [
    "RBI repo rate",
    "NIFTY 50",
    "Reliance Industries",
    "HDFC Bank",
    "Infosys",
    "India CPI inflation",
    "RBI MPC meeting",
]

In [9]:
fetch_google_news(QUERIES, per_query_limit=50)

[news] Added 337 new rows → C:\Users\harsh\OneDrive\Desktop\LLM Capstone\Data Collection\data\news\google_news.csv


In [19]:
# ---- Robust GDELT v2 Doc API fetcher (handles OR syntax, optional weekly slices) ----
import os, csv, time, requests, pandas as pd, re
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta

NEWS_DIR = os.path.join(os.getcwd(), "data", "news_gdelt")
os.makedirs(NEWS_DIR, exist_ok=True)
OUT_CSV = os.path.join(NEWS_DIR, "gdelt_news.csv")
HEADERS = {"User-Agent": "Mozilla/5.0 (capstone-news-bot/1.0)"}

def ensure_header(path, header):
    if not os.path.exists(path):
        with open(path, "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(header)

def read_seen_urls(path):
    if not os.path.exists(path): return set()
    seen = set()
    try:
        for chunk in pd.read_csv(path, usecols=["url"], chunksize=50000):
            seen.update(chunk["url"].dropna().astype(str).tolist())
    except Exception:
        with open(path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                u = (row.get("url") or "").strip()
                if u: seen.add(u)
    return seen

def month_windows(start_d: date, end_d: date):
    cur = date(start_d.year, start_d.month, 1)
    while cur <= end_d:
        nxt = (cur + relativedelta(months=1)) - relativedelta(days=1)
        s = max(cur, start_d); e = min(nxt, end_d)
        yield s, e
        cur = cur + relativedelta(months=1)

def week_windows(start_d: date, end_d: date):
    s = start_d
    while s <= end_d:
        e = min(s + timedelta(days=6), end_d)
        yield s, e
        s = e + timedelta(days=1)

def dtstr(d: date, end=False):
    return f"{d.strftime('%Y%m%d')}{'235959' if end else '000000'}"

def sanitize_query(q: str) -> str:
    qs = q.strip()

    # 1) wrap OR groups if not already wrapped
    if " OR " in qs and "(" not in qs:
        qs = f"({qs})"

    # 2) unquote any too-short phrase (<=3 chars) to satisfy GDELT
    #    e.g.  "RBI" -> RBI,  "GST" -> GST
    def _unquote_short(m):
        inner = m.group(1)
        return inner if len(inner) <= 3 else f"\"{inner}\""
    qs = re.sub(r'"([^"]+)"', _unquote_short, qs)

    # 3) add useful filters (drop if you want max breadth)
    if "sourcelang:" not in qs:
        qs += " sourcelang:English"
    if "sourcecountry:" not in qs:
        qs += " sourcecountry:IN"

    return qs

def fetch_gdelt_doc(query: str, start_d: date, end_d: date, maxrecords=250, timeout=30):
    params = {
        "query": query,
        "mode": "ArtList",
        "startdatetime": dtstr(start_d, False),
        "enddatetime": dtstr(end_d, True),
        "maxrecords": str(maxrecords),
        "format": "json",
    }
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "json" not in ctype:
            # likely an error page; show a short hint then skip
            print(f"[gdelt] non-JSON ({r.status_code}) for {start_d}..{end_d}: {r.text[:90].replace(chr(10),' ')}")
            return []
        data = r.json()
        return data.get("articles", []) or []
    except Exception as e:
        print(f"[gdelt] EXC {start_d}..{end_d}: {e}")
        return []

def fetch_gdelt_bulk(
    queries,
    start="2023-01-01",
    end=None,
    per_window_cap=250,
    use_weekly_slices=False,
    sleep_s=0.5,
):
    if end is None: end = date.today().isoformat()
    start_d = datetime.fromisoformat(start).date()
    end_d   = datetime.fromisoformat(end).date()

    header = ["query","title","seendate","url","domain","language","country","source","socialimage"]
    ensure_header(OUT_CSV, header)
    seen = read_seen_urls(OUT_CSV)

    total_added = 0
    with open(OUT_CSV, "a", encoding="utf-8", newline="") as f:
        wr = csv.writer(f)
        for q in queries:
            q_sane = sanitize_query(q)
            print(f"[gdelt] Query: {q_sane}")
            windows = week_windows(start_d, end_d) if use_weekly_slices else month_windows(start_d, end_d)
            for s_dt, e_dt in windows:
                # basic retry with backoff
                arts = []
                for attempt in range(3):
                    arts = fetch_gdelt_doc(q_sane, s_dt, e_dt, maxrecords=per_window_cap)
                    if arts: break
                    time.sleep(0.8 * (attempt + 1))
                added_w = 0
                for a in arts:
                    url = (a.get("url") or "").strip()
                    if not url or url in seen: continue
                    wr.writerow([
                        q_sane,
                        (a.get("title") or "").replace("\n"," ").strip(),
                        a.get("seendate",""),
                        url,
                        a.get("domain",""),
                        a.get("language",""),
                        a.get("country",""),
                        a.get("source",""),
                        a.get("socialimage",""),
                    ])
                    seen.add(url)
                    total_added += 1
                    added_w += 1
                time.sleep(sleep_s)
            print(f"[gdelt]   cumulative added this run: {total_added}")
    print(f"[gdelt] DONE. Added {total_added} new rows → {OUT_CSV}")

In [20]:
QUERIES = [
    '(RBI OR "Reserve Bank of India")',   # RBI unquoted now
    '("CPI inflation" OR "consumer price index") AND India',
    '("NIFTY 50" OR Sensex)',             # Sensex can be unquoted
    '("HDFC Bank" OR "ICICI Bank" OR SBI)',
    '("Reliance Industries" OR RIL)',
    '("Infosys" OR TCS OR Wipro)',
]

fetch_gdelt_bulk(
    QUERIES,
    start="2025-03-01",
    end="2025-09-14",
    per_window_cap=250,
    use_weekly_slices=True,
    sleep_s=0.6
)

[gdelt] Query: (RBI OR "Reserve Bank of India") sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 6538
[gdelt] Query: ("CPI inflation" OR "consumer price index") AND India sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 7019
[gdelt] Query: ("NIFTY 50" OR Sensex) sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 13038
[gdelt] Query: ("HDFC Bank" OR "ICICI Bank" OR SBI) sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 17135
[gdelt] Query: ("Reliance Industries" OR RIL) sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 18695
[gdelt] Query: ("Infosys" OR TCS OR Wipro) sourcelang:English sourcecountry:IN
[gdelt]   cumulative added this run: 21894
[gdelt] DONE. Added 21894 new rows → c:\Users\harsh\OneDrive\Desktop\LLM Capstone\Data Collection\data\news_gdelt\gdelt_news.csv


In [50]:
import os, time, json, csv, random
from datetime import datetime
from urllib.parse import urlparse, urljoin

import requests
import pandas as pd
import tldextract
from bs4 import BeautifulSoup
from urllib import robotparser

import trafilatura
from trafilatura.settings import use_config
from readability import Document

# ---------- paths
BASE = os.getcwd()
GDELT_CSV = os.path.join(BASE, "data", "news_gdelt", "gdelt_news.csv")
ART_DIR   = os.path.join(BASE, "data", "news_articles")
ART_JSONL = os.path.join(ART_DIR, "articles.jsonl")
os.makedirs(ART_DIR, exist_ok=True)

# ---------- config
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-IN,en;q=0.9",
    "Referer": "https://news.google.com/",
    "Connection": "keep-alive",
}

SLEEP_BASE = 0.8      # base delay between requests
SLEEP_JITTER = 0.5    # add a little randomness
TIMEOUT = 25

# keep this small to start; raise later
MAX_PAGES = 500

# Allow a focused set of Indian finance/major outlets (expand as needed)
ALLOW_HOST_SUBSTRINGS = [
    "reuters.com", "economictimes.indiatimes.com", "moneycontrol.com", "livemint.com",
    "business-standard.com", "thehindubusinessline.com", "financialexpress.com",
    "indiatimes.com", "cnbctv18.com", "thehindu.com", "ndtv.com", "hindustantimes.com",
    "timesofindia.indiatimes.com", "news18.com", "mintgenie.livemint.com",
]

# ---------- robots handling (cache)
robots_cache = {}

def can_fetch(url: str, user_agent: str = USER_AGENT) -> bool:
    """Respect robots.txt; if robots unreadable, default to False (skip)."""
    try:
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"
        rp = robots_cache.get(base)
        if rp is None:
            rp = robotparser.RobotFileParser()
            rp.set_url(f"{base}/robots.txt")
            try:
                rp.read()
            except Exception:
                robots_cache[base] = rp
                return False
            robots_cache[base] = rp
        return rp.can_fetch(user_agent, url)
    except Exception:
        return False

def host_allowed(url: str) -> bool:
    host = urlparse(url).netloc.lower()
    return any(dom in host for dom in ALLOW_HOST_SUBSTRINGS)

# ---------- load seen URLs from jsonl to make runs resumable
def load_seen_urls(jsonl_path: str) -> set:
    if not os.path.exists(jsonl_path):
        return set()
    seen = set()
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                u = (obj.get("url") or "").strip()
                if u:
                    seen.add(u)
            except Exception:
                continue
    return seen

# ---------- requests session
session = requests.Session()
session.headers.update(HEADERS)

# ---------- trafilatura config (parse HTML we already fetched)
t_cfg = use_config()
t_cfg.set("DEFAULT", "user_agent", USER_AGENT)
t_cfg.set("DEFAULT", "favor_reliable_domains", "true")
t_cfg.set("DEFAULT", "include_comments", "false")
t_cfg.set("DEFAULT", "no_fallback", "false")   # allow internal fallbacks

def extract_with_trafilatura(html: str, base_url: str) -> str | None:
    """Return plain text using trafilatura from HTML string."""
    if not html:
        return None
    txt = trafilatura.extract(html, config=t_cfg, url=base_url)
    # print(txt)
    if txt and len(txt.strip()) > 400:
        return txt.strip()
    # print("Nothing parsed")
    return None

def extract_with_readability(html: str) -> str | None:
    """Fallback: readability-lxml content extraction."""
    try:
        doc = Document(html)
        article_html = doc.summary(html_partial=True)
        soup = BeautifulSoup(article_html, "html.parser")
        text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all(["p", "li"]))
        text = " ".join(text.split())
        if len(text) > 400:
            return text
    except Exception:
        pass
    return None

def find_amp_link(html: str, url: str) -> str | None:
    """Look for <link rel='amphtml'> or a common AMP variant of the URL."""
    try:
        soup = BeautifulSoup(html, "html.parser")
        link = soup.find("link", rel=lambda v: v and "amphtml" in v.lower())
        if link and link.get("href"):
            amp = link["href"]
            # make absolute if relative
            if amp.startswith("//"):
                amp = urlparse(url).scheme + ":" + amp
            elif amp.startswith("/"):
                parsed = urlparse(url)
                amp = f"{parsed.scheme}://{parsed.netloc}{amp}"
            return amp
        # heuristic fallbacks: /amp or ?outputType=amp
        if url.endswith("/"):
            return url + "amp"
        return url + ("/amp" if not url.endswith("/amp") else "")
    except Exception:
        return None

def get_html(url: str) -> str | None:
    try:
        r = session.get(url, timeout=TIMEOUT, allow_redirects=True)
        if r.status_code != 200:
            return None
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "text/html" not in ctype:
            return None
        return r.text
    except Exception:
        return None

def scrape_from_gdelt(
    gdelt_csv: str = GDELT_CSV,
    out_jsonl: str = ART_JSONL,
    max_pages: int = MAX_PAGES,
    base_sleep: float = SLEEP_BASE,
    jitter: float = SLEEP_JITTER
):
    if not os.path.exists(gdelt_csv):
        print("[scrape] GDELT CSV not found:", gdelt_csv)
        return

    # Load GDELT rows (url, title, seendate, domain, query)
    try:
        df = pd.read_csv(gdelt_csv, usecols=["url","title","seendate","domain","query"])
    except Exception:
        df = pd.read_csv(gdelt_csv)

    df = df.dropna(subset=["url"]).drop_duplicates(subset=["url"])
    seen = load_seen_urls(out_jsonl)

    added = 0
    with open(out_jsonl, "a", encoding="utf-8") as out:
        for _, row in df.iterrows():
            url   = str(row.get("url") or "").strip()
            title = str(row.get("title") or "").strip()
            if not url or url in seen:
                continue
            if not host_allowed(url):
                continue
            if not can_fetch(url):
                continue

            # 1) fetch original HTML
            html = get_html(url)
            # 2) try trafilatura on original
            text = extract_with_trafilatura(html, url) if html else None
            # print(text)
            # 3) try AMP if original failed/short
            if not text:
                amp_url = find_amp_link(html or "", url)
                if amp_url and amp_url != url and can_fetch(amp_url):
                    amp_html = get_html(amp_url)
                    text = extract_with_trafilatura(amp_html, amp_url) if amp_html else None

            # 4) readability fallback
            if not text and html:
                text = extract_with_readability(html)

            if text:
                rec = {
                    "url": url,
                    "title": title,
                    "seendate": str(row.get("seendate") or ""),
                    "domain": str(row.get("domain") or ""),
                    "query": str(row.get("query") or ""),
                    "fetched_at": datetime.now().isoformat(),
                    "text": text,
                    "text_len": len(text)
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                seen.add(url)
                added += 1

            # polite sleep with jitter
            time.sleep(base_sleep + random.random() * jitter)

            if added >= max_pages:
                break

    print(f"[scrape] Added {added} articles → {out_jsonl}")

In [51]:
# ---- Run a small batch first; then increase MAX_PAGES and re-run
scrape_from_gdelt(max_pages=300, base_sleep=0.9, jitter=0.6)

[scrape] Added 300 articles → c:\Users\harsh\OneDrive\Desktop\LLM Capstone\Data Collection\data\news_articles\articles.jsonl
