In [1]:
from dotenv import load_dotenv
import pandas as pd
import os
import requests
import time

# Load variables from a local .env file into the process environment so the
# os.getenv('NYT_API') lookup further down can find the API key.
load_dotenv()

True

In [2]:
# Article Search endpoint, and the key sent with every request to it.
# The key comes from the NYT_API environment variable and is None when unset.
BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
API_KEY = os.environ.get("NYT_API")

def get_nyt_articles(begin_date, end_date, query="Apple", pages=10,
                     pause=20, max_retries=3, timeout=30):
    """Fetch articles matching ``query`` from BASE_URL, one result page at a time.

    Pages are requested in order starting at 0. Fetching stops early when a
    request fails, when a page comes back with a non-200 status (after any
    rate-limit retries), or when a page contains no documents. Whatever was
    collected up to that point is still returned.

    Args:
        begin_date: Start of the search window, format YYYYMMDD.
        end_date: End of the search window, format YYYYMMDD.
        query: Search term sent as the ``q`` parameter.
        pages: Maximum number of result pages to request.
        pause: Seconds to sleep after each successful page and before each
            retry of a rate-limited (HTTP 429) request.
        max_retries: How many times a single page is re-requested after an
            HTTP 429 before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with one row per article and the columns ``title``,
        ``snippet``, ``pub_date``, ``web_url``, ``section`` and ``source``.
        The frame is empty (no columns) when nothing was fetched.
    """
    articles = []

    for page in range(pages):
        params = {
            "q": query,
            "begin_date": begin_date,  # Format: YYYYMMDD
            "end_date": end_date,
            "page": page,
            "api-key": API_KEY
        }

        # Request the page, retrying the SAME page while the server answers 429.
        # The 429 check has to come before the generic non-200 check, otherwise
        # a rate-limit response aborts the whole fetch instead of being retried.
        response = None
        attempts = max(max_retries, 0) + 1
        for attempt in range(attempts):
            try:
                response = requests.get(BASE_URL, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Network error or timeout: keep what we already have.
                print(f"Request failed at page {page}: {exc}")
                response = None
                break
            if response.status_code != 429 or attempt == attempts - 1:
                break
            print("🚫 Rate limit hit! Resting.")
            time.sleep(pause)

        if response is None:
            break
        if response.status_code != 200:
            print(f"Failed at page {page}, status code: {response.status_code}")
            break

        # `or {}` / `or []` also cover keys that are present but JSON null,
        # which a plain .get(key, default) would pass through as None.
        payload = response.json() or {}
        docs = (payload.get("response") or {}).get("docs") or []
        if not docs:
            print(f"No docs found on page {page}")
            break

        for doc in docs:
            articles.append({
                "title": (doc.get("headline") or {}).get("main", ""),
                "snippet": doc.get("snippet", ""),
                "pub_date": doc.get("pub_date", ""),
                "web_url": doc.get("web_url", ""),
                "section": doc.get("section_name", ""),
                "source": doc.get("source", "")
            })

        print(f"[✓] Page {page + 1}: {len(docs)} articles fetched")
        # Pause after every page (including the last) to stay under the API's
        # per-minute rate limit, even if the caller invokes this back to back.
        time.sleep(pause)

    return pd.DataFrame(articles)

In [3]:
# Search window handed to the article fetch, written as YYYYMMDD integers.
begin_date, end_date = 20250716, 20250717
# Relative path of the CSV the fetched articles are written to.
# NOTE(review): the "1718" suffix does not match the 0716-0717 window — confirm the intended file name.
save_path = "../data/nyt_apple1718.csv"

# Fetch-and-save step, currently disabled; uncomment to query the API and write the CSV.
# df = get_nyt_articles(begin_date=begin_date, end_date=end_date, query="Apple", pages=10)
# df.to_csv(save_path, index=False)
# print(f"\nSaved {len(df)} articles to {save_path}")