In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
import re
import pandas as pd
from datetime import datetime
from pathlib import Path

# Keys read from every article object returned by the Reuters API; each one
# becomes a column in the saved CSV. reuters_search special-cases "source"
# (a dict is reduced to its name/label) and "canonical_url" (site prefix added).
REQUIRED_FIELDS = ['id', 'canonical_url', 'updated_time', 'source', 'title', 'description']

def _extract_json_text(page_source: str) -> str:
    """Pull the raw JSON text out of the page Chrome renders for an API response.

    Chrome wraps a JSON response in ``<pre>...</pre>`` (the tag may carry
    attributes), and ``driver.page_source`` returns that text HTML-serialized,
    so ``&``, ``<`` and ``>`` arrive as entities. The entities are decoded here
    so field values are not stored as e.g. ``AT&amp;T``. When no ``<pre>``
    element is found the page source is returned unchanged.
    """
    import html  # stdlib; imported locally so the notebook's import cell is untouched

    match = re.search(r"<pre[^>]*>(.*?)</pre>", page_source, re.DOTALL)
    if not match:
        return page_source
    return html.unescape(match.group(1)).strip()


def _article_to_row(article: dict, company: str) -> dict:
    """Reduce one API article object to the REQUIRED_FIELDS columns plus ``company``."""
    row = {}
    for field in REQUIRED_FIELDS:
        value = article.get(field)
        if field == "source" and isinstance(value, dict):
            row[field] = value.get("name") or value.get("label")
        elif field == "canonical_url":
            if not isinstance(value, str):
                # Missing URL: keep the row instead of raising TypeError on str + None.
                row[field] = None
            elif value.startswith(("http://", "https://")):
                # Already absolute; prefixing would produce a broken URL.
                row[field] = value
            else:
                row[field] = "https://www.reuters.com" + value
        else:
            row[field] = value
    row["company"] = company
    return row


def reuters_search(company: str, ticker: str, slug: str, page_size: int = 100, delay: float = 1.0, output_dir: str = "data/reuters_news"):
    """Download the Reuters article list for one company and save it as a CSV.

    Pages through the ``articles-by-topic-v1`` endpoint for ``/company/{slug}/``
    using a headless Chrome session, ``page_size`` articles per request, until a
    response contains no articles (or cannot be parsed).

    Args:
        company: Company name; stored in the ``company`` column of every row.
        ticker: Used for the output file name, ``{output_dir}/{ticker}.csv``.
        slug: Reuters company identifier used in the topic URL.
        page_size: Number of articles requested per API call.
        delay: Seconds to sleep between consecutive requests.
        output_dir: Directory for the CSV; created if it does not exist.

    Returns:
        ``None`` if the CSV already exists (nothing is fetched). Otherwise a
        DataFrame of the collected rows, sorted by ``updated_time`` descending;
        it is empty, and no file is written, when nothing was retrieved.
    """
    filename = f"{output_dir.rstrip('/')}/{ticker}.csv"

    # An existing CSV marks this ticker as done, so interrupted runs can resume.
    if Path(filename).exists():
        print(f"Skipping {ticker}, already file exists.")
        return

    opts = Options()
    # Headless mode: no browser window is shown. Remove this argument to watch the browser.
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--window-size=1360,900")
    # NOTE(review): this flag and the user-agent override presumably make the
    # session look like a regular desktop browser -- confirm they are still needed.
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    )

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

    offset = 0
    all_articles = []
    total_count = 0

    try:
        while True:
            query = (
                f'{{"arc-site":"reuters","offset":{offset},"orderby":"","size":"{page_size}",'
                f'"topic_url":"/company/{slug}/","uri":"/company/{slug}/","website":"reuters"}}'
            )
            base_api_url = f"https://www.reuters.com/pf/api/v3/content/fetch/articles-by-topic-v1?query={query}"
            driver.get(base_api_url)
            json_text = _extract_json_text(driver.page_source)

            try:
                data = json.loads(json_text)
            except ValueError as exc:
                # Still best-effort (pagination stops), but say so: otherwise a
                # blocked or error page looks exactly like "no more articles".
                print(f"Could not parse response for {ticker} at offset {offset}: {exc}")
                data = {}
            if not isinstance(data, dict):
                data = {}

            # "result" or "articles" may be missing or null; both mean no more data.
            articles = (data.get("result") or {}).get("articles") or []
            if not articles:
                break

            all_articles.extend(
                _article_to_row(article, company)
                for article in articles
                if isinstance(article, dict)
            )

            count = len(articles)
            total_count += count
            print(f"Retrieved {count} articles at offset {offset}. Total so far: {total_count}")

            offset += page_size
            time.sleep(delay)

    finally:
        # Always release the browser process, even if a request raised.
        driver.quit()

    df = pd.DataFrame(all_articles)
    if not df.empty:
        df["updated_time"] = pd.to_datetime(df["updated_time"], errors="coerce", utc=True)
        df = df.sort_values("updated_time", ascending=False)

        # Make sure the target directory exists so a long scrape is not lost at write time.
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Completed {ticker}, Saved {len(df)} articles for {company} → {filename}")
    else:
        print(f"No data to save for {company}.")

    return df

In [15]:
# Run the scraper once per row of the ticker file. reuters_search returns early
# for tickers whose CSV already exists, so this cell can be re-run after an
# interruption without refetching finished companies.
sp_100_tickers = pd.read_csv("data/sp100_tickers.csv")

for constituent in sp_100_tickers.itertuples(index=False):
    df = reuters_search(
        company=constituent.Name,
        ticker=constituent.Symbol,
        slug=constituent.Reuters_Company_ID,
        page_size=100,
    )

Retrieved 100 articles at offset 0. Total so far: 100
Retrieved 100 articles at offset 100. Total so far: 200
Retrieved 100 articles at offset 200. Total so far: 300
Retrieved 100 articles at offset 300. Total so far: 400
Retrieved 100 articles at offset 400. Total so far: 500
Retrieved 100 articles at offset 500. Total so far: 600
Retrieved 100 articles at offset 600. Total so far: 700
Retrieved 100 articles at offset 700. Total so far: 800
Retrieved 100 articles at offset 800. Total so far: 900
Retrieved 100 articles at offset 900. Total so far: 1000
Retrieved 100 articles at offset 1000. Total so far: 1100
Retrieved 100 articles at offset 1100. Total so far: 1200
Retrieved 100 articles at offset 1200. Total so far: 1300
Retrieved 100 articles at offset 1300. Total so far: 1400
Retrieved 100 articles at offset 1400. Total so far: 1500
Retrieved 100 articles at offset 1500. Total so far: 1600
Retrieved 100 articles at offset 1600. Total so far: 1700
Retrieved 100 articles at offset 17