In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os

from dotenv import load_dotenv
from datetime import datetime, timedelta

In [2]:
# Populate the process environment so the NYT_API_* keys read later via
# os.getenv are available (presumably sourced from a local .env file — confirm
# that file exists before running the fetch cell).
load_dotenv()

True

In [25]:
def nyt_search(start_date, end_date, key, pages=1, t_sleep=6, q=None, news_desk=None):
    """Fetch consecutive result pages from the NYT Article Search endpoint.

    Parameters
    ----------
    start_date, end_date : str
        Sent as ``begin_date`` / ``end_date``.
    key : str
        API key, sent as ``api-key``. Must be non-empty.
    pages : int, default 1
        Number of pages to request, starting at page 0.
    t_sleep : float, default 6
        Seconds to pause after every request.
    q : str, optional
        Sent as the ``q`` parameter when provided.
    news_desk : str, optional
        Sent as the ``fq`` (filter query) parameter when provided.

    Returns
    -------
    list
        The decoded JSON body of each request, in page order. The HTTP status
        is not inspected, so error payloads are returned as-is.

    Raises
    ------
    ValueError
        If ``key`` is empty or ``None``.
    """
    # Fail early: requests silently drops None-valued params, which would
    # otherwise send an unauthenticated request.
    if not key:
        raise ValueError("nyt_search requires a non-empty API key")

    endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

    # Parameters shared by every page; requests handles URL encoding.
    query = {
        "facet_field": "day_of_week",
        "facet": "true",
        "begin_date": start_date,
        "end_date": end_date,
        "api-key": key,
    }
    # Optional filters are only sent when the caller supplied them.
    if q is not None:
        query["q"] = q
    if news_desk is not None:
        query["fq"] = news_desk

    responses = []
    for page in range(pages):
        # A timeout keeps one stalled connection from hanging the whole run.
        reply = requests.get(endpoint, params={**query, "page": page}, timeout=30)
        responses.append(reply.json())
        # Pause after every request (including the last) so back-to-back
        # calls to this function stay spaced out as well.
        time.sleep(t_sleep)

    return responses

def nyt_search_responses_unwrap(responses):
    """Collect selected article fields from search payloads into column lists.

    ``responses`` is an iterable of decoded payloads, each holding its
    articles under ``['response']['docs']``. The result is a dict mapping each
    output column name to a list with one entry per article, in encounter
    order. A missing key in a payload or article raises ``KeyError``.
    """
    # (output column, how to read it from one article), in output order.
    extractors = (
        ("Date", lambda doc: doc['pub_date'][:10]),  # first 10 chars; presumably the date part
        ("Source", lambda doc: doc['source']),
        ("Section", lambda doc: doc['news_desk']),
        ("Headline", lambda doc: doc['headline']['main']),
        ("Lead Paragraph", lambda doc: doc['lead_paragraph']),
        ("URL", lambda doc: doc['web_url']),
    )

    table = {column: [] for column, _ in extractors}

    for payload in responses:
        for doc in payload['response']['docs']:
            for column, pull in extractors:
                table[column].append(pull(doc))

    return table

In [4]:
# Search configuration shared by the cells below.
base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch'

# Filter-query string naming the news desks to search.
news_desk = 'news_desk:("Business" "Politics" "Financial" "Foreign" "Jobs" "Personal Investing")'

# Number of result pages requested for each search period.
pages = 50

# Accumulators filled in by later cells (three independent lists).
start_dates, end_dates, responses = [], [], []

In [5]:
# First and last calendar day of the search range (both inclusive).
start_dt = datetime.strptime("12-14-2019", "%m-%d-%Y")
end_dt = datetime.strptime("01-16-2021", "%m-%d-%Y")

# Number of days from start_dt through end_dt inclusive. Derived from the two
# dates (400 for the values above) so it cannot drift from them.
day_count = (end_dt - start_dt).days + 1

# One datetime per day in the range, in chronological order.
date_list = [start_dt + timedelta(days=offset) for offset in range(day_count)]

In [6]:
# Split the date range into consecutive search windows of 8 days each
# (a start day plus the following 7). The lists are rebuilt from scratch so
# re-running this cell does not append duplicate periods.
window_days = 8

start_dates = []
end_dates = []

for first in range(0, day_count, window_days):
    # Clamp the final window to the last available day so a range whose
    # length is not a multiple of the window size cannot index past date_list.
    last = min(first + window_days - 1, day_count - 1)
    start_dates.append(date_list[first].strftime("%Y%m%d"))
    end_dates.append(date_list[last].strftime("%Y%m%d"))

In [7]:
# Each environment variable holds one API key; every key is used for a block
# of 10 consecutive search periods (presumably to stay within a per-key
# request quota — confirm against the current NYT limits).
key_blocks = [
    ("NYT_API_1", "ONE"),
    ("NYT_API_2", "TWO"),
    ("NYT_API_3", "THREE"),
    ("NYT_API_4", "FOUR"),
    ("NYT_API_5", "FIVE"),
]
periods_per_key = 10

# Resolve every key up front so a missing variable fails before any request
# is sent, rather than part-way through a long run.
api_keys = {env_var: os.getenv(env_var) for env_var, _ in key_blocks}
missing_keys = [env_var for env_var, value in api_keys.items() if not value]
if missing_keys:
    raise RuntimeError(f"Missing API key environment variable(s): {', '.join(missing_keys)}")

# Start from an empty list so re-running this cell does not duplicate results.
responses = []

for block_idx, (env_var, block_label) in enumerate(key_blocks):
    first_period = block_idx * periods_per_key

    for i in range(first_period, first_period + periods_per_key):
        responses.append(nyt_search(start_date=start_dates[i], end_date=end_dates[i], key=api_keys[env_var], pages=pages, news_desk=news_desk))
        print(f"Search period {i+1} completed")

    print(f"Search block {block_label} completed")

    # Pause between blocks; nothing follows the last block, so no pause there.
    if block_idx < len(key_blocks) - 1:
        time.sleep(10)

Search period 1 completed
Search period 2 completed
Search period 3 completed
Search period 4 completed
Search period 5 completed
Search period 6 completed
Search period 7 completed
Search period 8 completed
Search period 9 completed
Search period 10 completed
Search block ONE completed
Search period 11 completed
Search period 12 completed
Search period 13 completed
Search period 14 completed
Search period 15 completed
Search period 16 completed
Search period 17 completed
Search period 18 completed
Search period 19 completed
Search period 20 completed
Search block TWO completed
Search period 21 completed
Search period 22 completed
Search period 23 completed
Search period 24 completed
Search period 25 completed
Search period 26 completed
Search period 27 completed
Search period 28 completed
Search period 29 completed
Search period 30 completed
Search block THREE completed
Search period 31 completed
Search period 32 completed
Search period 33 completed
Search period 34 completed
Search p

In [8]:
# Flatten the per-period lists of page payloads into one flat list, keeping
# period and page order. A plain comprehension avoids coercing the nested
# list of dicts into an object ndarray, which only works when every period
# holds the same number of pages.
flattened_responses = [payload for period_pages in responses for payload in period_pages]

In [26]:
# Turn the flat sequence of API payloads into a dict of parallel column lists
# (one list per output column, one entry per article).
raw_dictionary = nyt_search_responses_unwrap(flattened_responses)

In [27]:
# Build the article table: one column per key of raw_dictionary.
nyt_df = pd.DataFrame(data=raw_dictionary)

In [28]:
# Parse the Date column, make it the index, and order rows by date.
nyt_df = (
    nyt_df
    .assign(Date=pd.to_datetime(nyt_df["Date"]))
    .set_index("Date")
    .sort_index()
)

In [30]:
# Collapse each day's articles into one string per day: headlines joined with
# ',' and lead paragraphs joined with ';'. Each result is a one-column frame
# indexed by Date.
articles_by_date = nyt_df.groupby(['Date'])
df_headlines = articles_by_date['Headline'].apply(','.join).to_frame()
df_lead_paragraph = articles_by_date['Lead Paragraph'].apply(';'.join).to_frame()

In [31]:
# Place the two per-day frames side by side, keeping only dates found in both.
concat_df = pd.concat(
    [df_headlines, df_lead_paragraph],
    axis='columns',
    join='inner',
)

In [33]:

# Write the per-article table, including its Date index, to CSV.
nyt_df.to_csv('nyt_full_data.csv', index=True)

In [34]:

# Write the per-day concatenated text table, including its Date index, to CSV.
concat_df.to_csv('nyt_concatenated_data.csv', index=True)