In [25]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os

from dotenv import load_dotenv
from datetime import datetime

In [7]:
# Load environment variables through python-dotenv (presumably from a local .env
# file); the fetch cell later reads NYT_API_KEY via os.getenv.
load_dotenv()

True

In [40]:
def nyt_search(start_date, end_date, key, pages=1, t_sleep=6, q=None, news_desk=None):
    """Fetch consecutive result pages from the NYT Article Search endpoint.

    Parameters
    ----------
    start_date, end_date : str
        Sent as ``begin_date`` / ``end_date``. The notebook passes eight-digit
        date strings such as ``"20201001"``.
    key : str
        API key, sent as the ``api-key`` query parameter.
    pages : int
        Number of pages to request; pages are numbered from 0.
    t_sleep : float
        Seconds to pause after every request (including the last one, so
        back-to-back calls stay spaced out).
    q : str, optional
        Free-text query, sent as ``q`` when provided.
    news_desk : str, optional
        Filter expression, sent as ``fq`` when provided.

    Returns
    -------
    list
        One decoded JSON payload per requested page, in page order. The HTTP
        status is not inspected, so error payloads are returned as-is.

    Raises
    ------
    ValueError
        If ``key`` is missing or empty.
    """
    if not key:
        raise ValueError("nyt_search requires a non-empty API key")

    endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

    # Let requests build and percent-encode the query string; optional filters
    # are only included when the caller actually supplied them.
    shared_params = {
        "facet_field": "day_of_week",
        "facet": "true",
        "begin_date": start_date,
        "end_date": end_date,
        "api-key": key,
    }
    if news_desk is not None:
        shared_params["fq"] = news_desk
    if q is not None:
        shared_params["q"] = q

    responses = []
    for page in range(pages):
        page_params = {**shared_params, "page": page}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        reply = requests.get(endpoint, params=page_params, timeout=30)
        responses.append(reply.json())
        time.sleep(t_sleep)

    return responses

def nyt_search_responses_unwrap(responses):
    """Turn a sequence of Article Search payloads into column-oriented lists.

    Every article found under ``payload['response']['docs']`` contributes one
    entry to each of the five output lists, so the lists stay aligned by
    position.

    Returns a dict with the keys "Date", "Source", "Headline",
    "Lead Paragraph" and "URL" (in that order). "Date" holds the first ten
    characters of the article's ``pub_date``.
    """
    # (output column, how to pull that value out of one article document)
    extractors = (
        ("Date", lambda doc: doc['pub_date'][:10]),
        ("Source", lambda doc: doc['source']),
        ("Headline", lambda doc: doc['headline']['main']),
        ("Lead Paragraph", lambda doc: doc['lead_paragraph']),
        ("URL", lambda doc: doc['web_url']),
    )

    result = {column: [] for column, _ in extractors}

    for payload in responses:
        for doc in payload['response']['docs']:
            for column, pull in extractors:
                result[column].append(pull(doc))

    return result
    
    

In [9]:
# --- Search configuration -------------------------------------------------

# Number of result pages requested for every date window.
pages = 50

# Accumulator for the per-window results collected by the fetch cell.
responses = []

base_url = "https://api.nytimes.com/svc/search/v2/articlesearch"

# Filter expression listing the news desks to include in the search.
news_desk = "news_desk:(\"Business\" \"Politics\" \"Financial\" \"Foreign\" \"Jobs\" \"Personal Investing\")"

# Parallel lists: start_dates[i] and end_dates[i] bound the i-th search window.
start_dates = [
    "20201001",
    "20201011",
    "20201021",
    "20201031",
    "20201110",
    "20201120",
    "20201130",
    "20201210",
    "20201220",
    "20201230",
    "20210109",
]
end_dates = [
    "20201010",
    "20201020",
    "20201030",
    "20201109",
    "20201119",
    "20201129",
    "20201209",
    "20201219",
    "20201229",
    "20210108",
    "20210119",
]

In [10]:
# Download every configured date window, one nyt_search call per window.

# Read the key once and fail fast with a clear message instead of letting a
# missing key surface as an obscure error deep inside the request code.
api_key = os.getenv("NYT_API_KEY")
if not api_key:
    raise RuntimeError("NYT_API_KEY is not set; add it to the environment or the .env file")

# The two lists are consumed pairwise, so a length mismatch is a config error.
if len(start_dates) != len(end_dates):
    raise ValueError("start_dates and end_dates must have the same length")

# Start from an empty list so re-running this cell cannot append a second
# copy of every window to the results.
responses = []
for i, (window_start, window_end) in enumerate(zip(start_dates, end_dates)):
    responses.append(
        nyt_search(
            start_date=window_start,
            end_date=window_end,
            key=api_key,
            pages=pages,
            news_desk=news_desk,
        )
    )
    print(f"Search period {i} completed")

Search period 0 completed
Search period 1 completed
Search period 2 completed
Search period 3 completed
Search period 4 completed
Search period 5 completed
Search period 6 completed
Search period 7 completed
Search period 8 completed
Search period 9 completed
Search period 10 completed


In [28]:
# Flatten the list of per-window page lists into one flat list of payloads.
# A plain comprehension keeps window/page order and, unlike np.ravel on a
# nested list of dicts, does not depend on every window having the same
# number of pages.
flattened_responses = [payload for window in responses for payload in window]

In [41]:
# Collapse the page payloads into one dict of aligned column lists
# (Date, Source, Headline, Lead Paragraph, URL).
raw_dictionary = nyt_search_responses_unwrap(flattened_responses)

In [48]:
# Build the per-article table: each key of raw_dictionary becomes a column.
nyt_df = pd.DataFrame(raw_dictionary)

In [49]:
# Parse the Date strings into datetimes, make Date the index, and order the
# rows by date.
nyt_df = (
    nyt_df
    .assign(Date=pd.to_datetime(nyt_df["Date"]))
    .set_index("Date")
    .sort_index(axis=0)
)

In [51]:
# One row per date: all of that day's headlines joined with ','.
df_headlines = (
    nyt_df.groupby(['Date'])['Headline']
    .apply(','.join)
    .to_frame()
)

# Same grouping for the lead paragraphs, joined with ';'.
df_lead_paragraph = (
    nyt_df.groupby(['Date'])['Lead Paragraph']
    .apply(';'.join)
    .to_frame()
)

In [52]:
# Place the per-date headline and lead-paragraph frames side by side (axis=1);
# join='inner' keeps only the dates present in both.
concat_df = pd.concat([df_headlines, df_lead_paragraph], axis=1, join='inner')

In [54]:

# Write the per-article table to CSV in the working directory; index=True
# includes the index in the file.
nyt_df.to_csv(r'nyt_full_data.csv', index = True)

In [55]:

# Write the per-date concatenated text table to CSV in the working directory;
# index=True includes the index in the file.
concat_df.to_csv(r'nyt_concatenated_data.csv', index = True)