In [10]:
import pandas as pd

daily_news = pd.read_csv("../data/gnews_partial.csv")
stock_df = pd.read_csv('../data/stock_data_4years.csv', skiprows=2)

stock_df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']

daily_news.head()

Unnamed: 0,title,description,content,publishedAt,source,url
0,Today in Apple history,"On June 18, 1993, Apple CEO John Sculley gave ...","June 18, 1993: John Sculley steps down as Appl...",2025-06-18T14:30:22Z,Cult of Mac,https://www.cultofmac.com/apple-history/john-s...
1,The AI Trade Is Back in Play: 2 Stocks to Buy ...,Consider Apple (NASDAQ:AAPL) and another cheap...,The AI (artificial intelligence) trade is gett...,2025-06-13T23:00:00Z,The Motley Fool Canada,https://www.fool.ca/2025/06/13/the-ai-trade-is...
2,China Tariff Easing Spurs Stock Market Surge A...,The “dream scenario” for tech investors helped...,Topline Stocks surged across the board Monday ...,2025-05-12T04:00:00Z,Forbes,https://www.forbes.com/sites/dereksaul/2025/05...
3,Apple Faces New iPhone Fiasco as Update Bricks...,"Apple (AAPL) faces another wave of challenges,...","Apple (AAPL) faces another wave of challenges,...",2025-04-27T16:35:02Z,Business Insider,https://markets.businessinsider.com/news/stock...
4,2 Reasons I'm Considering Apple Stock for a $2...,Apple (NASDAQ:AAPL) stock looks like a deep-va...,The stock market has continued to experience a...,2025-04-23T14:00:00Z,The Motley Fool Canada,https://www.fool.ca/2025/04/23/2-reasons-im-co...


In [None]:
import requests
import pandas as pd
import time
import os
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv('NYT_API')
BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

start_date = 20220901
end_date = 20230228

def get_nyt_articles(query="Apple", begin_date=start_date, end_date=end_date, pages=10):
    articles = []

    for page in range(pages):
        params = {
            "q": query,
            "begin_date": begin_date,  # Format: YYYYMMDD
            "end_date": end_date,
            "page": page,
            "api-key": API_KEY
        }

        response = requests.get(BASE_URL, params=params)
        if response.status_code != 200:
            print(f"Failed at page {page}, status code: {response.status_code}")
            break

        docs = response.json().get("response", {}).get("docs", [])
        if not docs:
            print(f"No docs found on page {page}")
            break

        for doc in docs:
            articles.append({
                "title": doc.get("headline", {}).get("main", ""),
                "snippet": doc.get("snippet", ""),
                "pub_date": doc.get("pub_date", ""),
                "web_url": doc.get("web_url", ""),
                "section": doc.get("section_name", ""),
                "source": doc.get("source", "")
            })

        print(f"[✓] Page {page + 1}: {len(docs)} articles fetched")
        time.sleep(20)  # Rate limit: 10 req/min
        
        if response.status_code == 429:
            print("🚫 Rate limit hit! Resting.")
            time.sleep(20)
            continue

    return pd.DataFrame(articles)

# Run the function
df = get_nyt_articles(query="Apple", pages=50)
df.to_csv("nyt_apple22092402.csv", index=False)
print(f"\nSaved {len(df)} articles to nyt_apple22092402.csv")

Failed at page 0, status code: 429

Saved 0 articles to nyt_apple22092402.csv


In [45]:
apple_news = pd.read_csv('nyt_apple240111.csv')
apple_news.head()

Unnamed: 0,title,snippet,pub_date,web_url,section,source
0,7 Apple Picking Tips From the Guy Behind Apple...,Brian Frange has been pontificating on apples ...,2024-09-10T09:05:34Z,https://www.nytimes.com/2024/09/10/style/apple...,Style,The New York Times
1,What to Do With That Apple-Picking Haul,Both the fruit and cider go into this supremel...,2024-10-11T15:00:13Z,https://www.nytimes.com/2024/10/11/dining/what...,Food,The New York Times
2,How Healthy Are Apples?,They’re America’s most popular fruit. Here’s w...,2024-09-23T09:00:25Z,https://www.nytimes.com/2024/09/23/well/eat/ap...,Well,The New York Times
3,"U.S. Sues Apple, Accusing It of Maintaining an...",The lawsuit caps years of regulatory scrutiny ...,2024-03-21T14:30:39Z,https://www.nytimes.com/2024/03/21/technology/...,Technology,The New York Times
4,Apple’s Quarterly Profit Down Because of Tax P...,Sales for the iPhone maker were up 6 percent t...,2024-10-31T21:10:57Z,https://www.nytimes.com/2024/10/31/technology/...,Technology,The New York Times


In [46]:
apple_news['pub_date'].min()

'2024-01-03T18:21:13Z'

In [47]:
apple_news['pub_date'].max()

'2024-11-29T12:00:14Z'

In [48]:
print(apple_news.columns)

print(apple_news['pub_date'].head())

# Convert to datetime
apple_news['pub_date'] = pd.to_datetime(apple_news['pub_date'])

# Sort by date
apple_news = apple_news.sort_values('pub_date')

# View the range of dates
print("Earliest date:", apple_news['pub_date'].min())
print("Latest date:", apple_news['pub_date'].max())

# count articles per year or month
print(apple_news['pub_date'].dt.year.value_counts().sort_index())
print(apple_news['pub_date'].dt.to_period('M').value_counts().sort_index())

Index(['title', 'snippet', 'pub_date', 'web_url', 'section', 'source'], dtype='object')
0    2024-09-10T09:05:34Z
1    2024-10-11T15:00:13Z
2    2024-09-23T09:00:25Z
3    2024-03-21T14:30:39Z
4    2024-10-31T21:10:57Z
Name: pub_date, dtype: object
Earliest date: 2024-01-03 18:21:13+00:00
Latest date: 2024-11-29 12:00:14+00:00
pub_date
2024    520
Name: count, dtype: int64
pub_date
2024-01     20
2024-02     24
2024-03     48
2024-04     25
2024-05     32
2024-06     41
2024-07     35
2024-08     55
2024-09     58
2024-10    107
2024-11     75
Freq: M, Name: count, dtype: int64


  print(apple_news['pub_date'].dt.to_period('M').value_counts().sort_index())


In [49]:
apple_news.shape

(520, 6)

In [51]:
df_2122 = pd.read_csv('nyt_apple2122.csv')
df_2223 = pd.read_csv('nyt_apple2223.csv')
df_2325 = pd.read_csv('nyt_apple2325.csv')
df_0721 = pd.read_csv('nyt_apple0721.csv')
df_050822 = pd.read_csv('nyt_apple22052208.csv')
df_011124 = pd.read_csv('nyt_apple240111.csv')

In [52]:
# Function to inspect and process each dataset
def process_news_data(df, name):
    print(f"\n==== {name} ====")
    print(df.columns)
    
    # Check first 5 dates
    print(df['pub_date'].head())
    
    # Convert to datetime
    df['pub_date'] = pd.to_datetime(df['pub_date'], errors='coerce')
    
    # Drop NaT if any failed conversions
    df = df.dropna(subset=['pub_date'])
    
    # Sort by date
    df = df.sort_values('pub_date')
    
    # Print date range
    print("Earliest date:", df['pub_date'].min())
    print("Latest date:", df['pub_date'].max())
    
    # Articles per year
    print("Articles per year:")
    print(df['pub_date'].dt.year.value_counts().sort_index())
    
    # Articles per month
    print("Articles per month:")
    print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
    
    return df  # Return cleaned/sorted version if needed later

# Process each file
df_2122 = process_news_data(df_2122, "NYT Apple 2021-2022")
df_2223 = process_news_data(df_2223, "NYT Apple 2022-2023")
df_2325 = process_news_data(df_2325, "NYT Apple 2023-2025")
df_0721 = process_news_data(df_0721, "NYT Apple 07 2021")
df_050822 = process_news_data(df_050822, "NYT Apple 2022 05 - 2022 08")
df_011124 = process_news_data(df_011124, "NYT Apple 2024 01 - 2024 11")


==== NYT Apple 2021-2022 ====
Index(['title', 'snippet', 'pub_date', 'web_url', 'section', 'source'], dtype='object')
0    2022-03-11T10:00:21Z
1    2021-09-10T15:38:13Z
2    2022-03-08T19:35:06Z
3    2022-03-01T21:37:14Z
4    2022-02-10T21:07:40Z
Name: pub_date, dtype: object
Earliest date: 2021-07-20 09:00:07+00:00
Latest date: 2022-04-26 19:29:18+00:00
Articles per year:
pub_date
2021    562
2022    438
Name: count, dtype: int64
Articles per month:
pub_date
2021-07     37
2021-08     70
2021-09    128
2021-10    118
2021-11    121
2021-12     88
2022-01    117
2022-02    103
2022-03    137
2022-04     81
Freq: M, Name: count, dtype: int64

==== NYT Apple 2022-2023 ====
Index(['title', 'snippet', 'pub_date', 'web_url', 'section', 'source'], dtype='object')
0    2023-10-31T07:00:16Z
1    2023-11-29T14:00:10Z
2    2023-10-30T19:00:02Z
3    2023-12-18T19:12:16Z
4    2023-11-12T10:00:21Z
Name: pub_date, dtype: object
Earliest date: 2022-04-28 21:11:37+00:00
Latest date: 2023-12-20 20:00

  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())
  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())


In [53]:
full_df = pd.concat([df_2122, df_2223, df_2325, df_0721, df_050822, df_011124], ignore_index=True)
full_df['pub_date'] = pd.to_datetime(full_df['pub_date'], errors='coerce')
full_df = full_df.sort_values('pub_date')
full_df = full_df.drop_duplicates(subset='web_url')

In [54]:
full_df

Unnamed: 0,title,snippet,pub_date,web_url,section,source
3000,What Happens if There’s a Covid Outbreak at th...,Kara Swisher finds out why the 2020 Tokyo Game...,2021-07-19 09:00:10+00:00,https://www.nytimes.com/2021/07/19/opinion/swa...,Opinion,The New York Times
3001,This Conversation Will Change How You Think Ab...,Modern work culture is built on a broken model...,2021-07-20 09:00:07+00:00,https://www.nytimes.com/2021/07/20/opinion/ezr...,Opinion,The New York Times
3002,‘The New Bauhaus’ Review: Rethinking an Approa...,This documentary on the interdisciplinary arti...,2021-07-20 11:00:05+00:00,https://www.nytimes.com/2021/07/20/movies/the-...,Movies,The New York Times
1,Apple delays its return to office as the Delta...,Employees are now expected to come back to the...,2021-07-20 15:42:40+00:00,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times
3004,Can Apple’s AirTags Find Lost Pets?,We look at the pros and cons of using Apple’s ...,2021-07-20 16:06:15+00:00,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times
...,...,...,...,...,...,...
2995,Jimmy Fallon Fans the Flames of Burning MAGA Hats,"People torched the hats in videos, apparently ...",2025-07-16 06:42:23+00:00,https://www.nytimes.com/2025/07/16/arts/televi...,Arts,The New York Times
2996,"How to Keep Love Alive, With Rob Delaney of ‘D...",The Emmy-nominated actor talks about his role ...,2025-07-16 09:00:29+00:00,https://www.nytimes.com/2025/07/16/podcasts/ro...,Podcasts,The New York Times
2997,Project 2025’s Other Project,Inside the plan from the Heritage Foundation t...,2025-07-16 10:00:10+00:00,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times
2998,"Tariffs Push Prices Up, and the Supreme Court’...","Plus, why that e-book cost your library $50.",2025-07-16 10:00:12+00:00,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times


In [55]:
full_df = process_news_data(full_df, "NYT Apple 2021-2025")


==== NYT Apple 2021-2025 ====
Index(['title', 'snippet', 'pub_date', 'web_url', 'section', 'source'], dtype='object')
3000   2021-07-19 09:00:10+00:00
3001   2021-07-20 09:00:07+00:00
3002   2021-07-20 11:00:05+00:00
1      2021-07-20 15:42:40+00:00
3004   2021-07-20 16:06:15+00:00
Name: pub_date, dtype: datetime64[ns, UTC]
Earliest date: 2021-07-19 09:00:10+00:00
Latest date: 2025-07-16 11:07:17+00:00
Articles per year:
pub_date
2021     565
2022    1021
2023     868
2024     590
2025     655
Name: count, dtype: int64
Articles per month:
pub_date
2021-07     51
2021-08     70
2021-09    125
2021-10    117
2021-11    118
2021-12     84
2022-01    115
2022-02    101
2022-03    134
2022-04     82
2022-05    128
2022-06    132
2022-07    133
2022-08    103
2022-09     20
2022-10     20
2022-11     26
2022-12     27
2023-01     28
2023-02     36
2023-03     40
2023-04     48
2023-05     48
2023-06     89
2023-07     82
2023-08     76
2023-09    123
2023-10     97
2023-11    150
2023-12   

  print(df['pub_date'].dt.to_period('M').value_counts().sort_index())


In [None]:
start_date = 20220901
end_date = 20230228

np.int64(0)