In [1]:
from dotenv import load_dotenv
import pandas as pd
import os
import requests
import time

# Load variables from a local .env file into the process environment so the
# os.getenv('NYT_API') lookup later in the notebook can find the API key.
load_dotenv()

True

In [10]:
# NYT API key, read from the NYT_API environment variable
# (os.getenv returns None when the variable is not set).
API_KEY = os.getenv('NYT_API')
# Article Search endpoint that get_nyt_articles() sends its GET requests to.
BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

def get_nyt_articles(begin_date, end_date, query="Apple", pages=10,
                     sleep_seconds=20, max_retries=3):
    """Fetch NYT Article Search results page by page into a DataFrame.

    Args:
        begin_date: Start of the search window, format YYYYMMDD.
        end_date: End of the search window, format YYYYMMDD.
        query: Search string sent as the ``q`` request parameter.
        pages: Maximum number of result pages to request; pages are
            requested as 0, 1, ..., pages - 1.
        sleep_seconds: Pause between consecutive page requests, and the rest
            period before retrying a page that answered HTTP 429.
        max_retries: How many times a single page is retried after HTTP 429
            before the fetch gives up.

    Returns:
        pandas.DataFrame with one row per returned doc and the columns
        title, snippet, pub_date, web_url, section, source. The frame is
        empty (no columns) when nothing was fetched.

    Fetching stops early, keeping everything collected so far, when a request
    fails, a page answers with a non-200 status, or a page contains no docs.
    """
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run
    articles = []

    for page in range(pages):
        params = {
            "q": query,
            "begin_date": begin_date,  # Format: YYYYMMDD
            "end_date": end_date,
            "page": page,
            "api-key": API_KEY
        }

        # Request the page, retrying the SAME page when rate limited. The 429
        # check has to run before the generic non-200 check, otherwise it can
        # never be reached.
        response = None
        attempts = max(0, max_retries) + 1
        for attempt in range(attempts):
            try:
                response = requests.get(BASE_URL, params=params, timeout=request_timeout)
            except requests.RequestException as exc:
                # Network failure: report it and keep the articles gathered so far.
                print(f"Failed at page {page}, request error: {exc}")
                response = None
                break

            if response.status_code != 429:
                break

            if attempt < attempts - 1:
                print("🚫 Rate limit hit! Resting.")
                time.sleep(sleep_seconds)

        if response is None:
            break
        if response.status_code != 200:
            print(f"Failed at page {page}, status code: {response.status_code}")
            break

        # `or {}` / `or []` guard against explicit JSON nulls, which
        # dict.get's default argument does not cover.
        payload = response.json()
        docs = (payload.get("response") or {}).get("docs") or []
        if not docs:
            print(f"No docs found on page {page}")
            break

        for doc in docs:
            headline = doc.get("headline") or {}
            articles.append({
                "title": headline.get("main", ""),
                "snippet": doc.get("snippet", ""),
                "pub_date": doc.get("pub_date", ""),
                "web_url": doc.get("web_url", ""),
                "section": doc.get("section_name", ""),
                "source": doc.get("source", "")
            })

        print(f"[✓] Page {page + 1}: {len(docs)} articles fetched")

        # Throttle between requests to stay under the API rate limit; no
        # need to wait once the last requested page has been fetched.
        if page < pages - 1:
            time.sleep(sleep_seconds)

    return pd.DataFrame(articles)

In [11]:
# Search window for the fetch, written as YYYYMMDD integers.
begin_date = 20210719
end_date = 20250717
# Destination CSV for the fetched articles (relative to the notebook's working directory).
save_path = "../data/nyt_apple_new.csv"
# NOTE(review): this expanded query is assigned but never used — the call below
# passes the literal "Apple" instead. Confirm which search was intended.
query = "(Apple OR 'Apple Inc' OR 'Tim Cook' OR iPhone OR iPad OR MacBook OR 'Apple Watch' OR 'Apple Music' OR 'App Store' OR 'iOS' OR 'macOS' OR 'Apple Event' OR 'Apple Vision Pro' OR 'Apple TV' OR 'AAPL')"

# Request up to 100 result pages, then persist the frame without its index column.
df = get_nyt_articles(begin_date=begin_date, end_date=end_date, query="Apple", pages=100)
df.to_csv(save_path, index=False)
print(f"\nSaved {len(df)} articles to {save_path}")

[✓] Page 1: 10 articles fetched
[✓] Page 2: 10 articles fetched
[✓] Page 3: 10 articles fetched
[✓] Page 4: 10 articles fetched
[✓] Page 5: 10 articles fetched
[✓] Page 6: 10 articles fetched
[✓] Page 7: 10 articles fetched
[✓] Page 8: 10 articles fetched
[✓] Page 9: 10 articles fetched
[✓] Page 10: 10 articles fetched
[✓] Page 11: 10 articles fetched
[✓] Page 12: 10 articles fetched
[✓] Page 13: 10 articles fetched
[✓] Page 14: 10 articles fetched
[✓] Page 15: 10 articles fetched
[✓] Page 16: 10 articles fetched
[✓] Page 17: 10 articles fetched
[✓] Page 18: 10 articles fetched
[✓] Page 19: 10 articles fetched
[✓] Page 20: 10 articles fetched
[✓] Page 21: 10 articles fetched
[✓] Page 22: 10 articles fetched
[✓] Page 23: 10 articles fetched
[✓] Page 24: 10 articles fetched
[✓] Page 25: 10 articles fetched
[✓] Page 26: 10 articles fetched
[✓] Page 27: 10 articles fetched
[✓] Page 28: 10 articles fetched
[✓] Page 29: 10 articles fetched
[✓] Page 30: 10 articles fetched
[✓] Page 31: 10 art

In [12]:
def process_news_data(df, name):
    """Parse, clean and summarise the ``pub_date`` column of a news DataFrame.

    Prints the columns, a preview of the raw ``pub_date`` values, the
    earliest/latest parsed dates and article counts per year and per month.

    Args:
        df: DataFrame containing a ``pub_date`` column of date strings
            (or datetimes). It is NOT modified; a new frame is returned.
        name: Label printed in the report header.

    Returns:
        A new DataFrame with ``pub_date`` converted to timezone-naive
        datetimes, rows with unparseable dates dropped, sorted ascending by
        ``pub_date``. The original index labels are kept.
    """
    print(f"\n==== {name} ====")
    print(df.columns)

    print(df['pub_date'].head())

    # Use assign() rather than item assignment so the caller's frame is left
    # untouched and no write ever lands on a dropna() result (which would
    # trigger SettingWithCopyWarning).
    parsed = pd.to_datetime(df['pub_date'], errors='coerce')
    df = df.assign(pub_date=parsed).dropna(subset=['pub_date'])

    # Strip timezone info so the dates are plain naive timestamps.
    df = df.assign(pub_date=df['pub_date'].dt.tz_localize(None))

    df = df.sort_values('pub_date')

    print("Earliest date:", df['pub_date'].min())
    print("Latest date:", df['pub_date'].max())

    print("Articles per year:")
    print(df['pub_date'].dt.year.value_counts().sort_index())

    print("Articles per month:")
    print(df['pub_date'].dt.to_period('M').value_counts().sort_index())

    return df

In [13]:
# Reload the articles from the same CSV path the fetch cell saved to.
news_df = pd.read_csv('../data/nyt_apple_new.csv')

# Bare expression: renders the frame as this cell's output.
news_df

Unnamed: 0,title,snippet,pub_date,web_url,section,source
0,"Jeff Williams, Apple’s Chief Operating Officer...",Mr. Williams was long considered a leading can...,2025-07-08T22:27:10Z,https://www.nytimes.com/2025/07/08/technology/...,Technology,The New York Times
1,"Trump Turns Back to His Trade War, Threatening...",The president threatened both Apple and the Eu...,2025-05-23T12:46:49Z,https://www.nytimes.com/2025/05/23/us/politics...,U.S.,The New York Times
2,Businesses Plead for Tariff Breaks After Trump...,Retail executives huddled with the president a...,2025-04-22T09:03:37Z,https://www.nytimes.com/2025/04/22/us/politics...,U.S.,The New York Times
3,Apple’s New Software Focuses on Design Aesthet...,The company also introduced artificial intelli...,2025-06-09T18:02:31Z,https://www.nytimes.com/2025/06/09/technology/...,Technology,The New York Times
4,"Bad Apple, the Rise of the A.I. Empire and Ita...",“I have rarely read a judge who is so obviousl...,2025-05-09T11:00:05Z,https://www.nytimes.com/2025/05/09/podcasts/ha...,Podcasts,The New York Times
...,...,...,...,...,...,...
995,The Best Movies and TV Shows Coming to Disney+...,"“Cruel Intentions,” “Music by John Williams” a...",2024-11-01T15:13:17Z,https://www.nytimes.com/2024/11/01/arts/televi...,Arts,The New York Times
996,Make America 1897 Again,"Trump, Musk, Zuckerberg and MAGA’s new Gilded ...",2025-01-28T10:01:50Z,https://www.nytimes.com/2025/01/28/opinion/tru...,Opinion,The New York Times
997,"Police Track C.E.O.’s Killer, and Biden Aides ...","Plus, the spectacular rebirth of Notre-Dame.",2024-12-06T11:00:11Z,https://www.nytimes.com/2024/12/06/podcasts/po...,Podcasts,The New York Times
998,Friday Briefing: A White House Shake-Up,"Plus, the actors up for a Tony Award.",2025-05-01T21:05:22Z,https://www.nytimes.com/2025/05/01/briefing/tr...,Briefing,The New York Times


In [14]:
# Parse and sort pub_date, print the date-coverage summary, and rebind news_df
# to the cleaned frame returned by process_news_data.
news_df = process_news_data(news_df, "new data")


==== new data ====
Index(['title', 'snippet', 'pub_date', 'web_url', 'section', 'source'], dtype='object')
0    2025-07-08T22:27:10Z
1    2025-05-23T12:46:49Z
2    2025-04-22T09:03:37Z
3    2025-06-09T18:02:31Z
4    2025-05-09T11:00:05Z
Name: pub_date, dtype: object
Earliest date: 2023-12-22 10:03:15
Latest date: 2025-07-16 11:07:17
Articles per year:
pub_date
2023      1
2024    304
2025    695
Name: count, dtype: int64
Articles per month:
pub_date
2023-12      1
2024-01      1
2024-02      6
2024-03     17
2024-04      7
2024-05      8
2024-06     17
2024-07     14
2024-08     29
2024-09     34
2024-10     45
2024-11     50
2024-12     76
2025-01    115
2025-02     98
2025-03     99
2025-04    122
2025-05    119
2025-06     87
2025-07     55
Freq: M, Name: count, dtype: int64
