In [1]:
import os

# NYT API key, read from the environment so the credential is never stored in
# the notebook. Export NYT_API_KEY before starting the kernel; an empty string
# here means the variable was not set.
key = os.environ.get('NYT_API_KEY', '')

In [2]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

In [3]:
# Date window for the scrape: from five years before today up to today.
# NOTE(review): the recorded outputs in this notebook run from 1971 to 2021,
# which does not match a five-year window -- confirm the intended span.
end = datetime.date.today()
start = end + relativedelta(years=-5)

In [4]:
# One [year, month, day, minute] list of strings for every month start between
# `start` and `end`. The values are taken from the Timestamp attributes rather
# than strftime("%-m ..."), because the "-" (no zero padding) flag is a
# platform-specific extension that is not available on every OS.
months_in_range = [
    [str(ts.year), str(ts.month), str(ts.day), str(ts.minute)]
    for ts in pd.date_range(start, end, freq='MS')
]

In [5]:
# Print the generated [year, month, day, minute] entries to sanity-check the range.
print(months_in_range)

[['1971', '8', '1', '0'], ['1971', '9', '1', '0'], ['1971', '10', '1', '0'], ['1971', '11', '1', '0'], ['1971', '12', '1', '0'], ['1972', '1', '1', '0'], ['1972', '2', '1', '0'], ['1972', '3', '1', '0'], ['1972', '4', '1', '0'], ['1972', '5', '1', '0'], ['1972', '6', '1', '0'], ['1972', '7', '1', '0'], ['1972', '8', '1', '0'], ['1972', '9', '1', '0'], ['1972', '10', '1', '0'], ['1972', '11', '1', '0'], ['1972', '12', '1', '0'], ['1973', '1', '1', '0'], ['1973', '2', '1', '0'], ['1973', '3', '1', '0'], ['1973', '4', '1', '0'], ['1973', '5', '1', '0'], ['1973', '6', '1', '0'], ['1973', '7', '1', '0'], ['1973', '8', '1', '0'], ['1973', '9', '1', '0'], ['1973', '10', '1', '0'], ['1973', '11', '1', '0'], ['1973', '12', '1', '0'], ['1974', '1', '1', '0'], ['1974', '2', '1', '0'], ['1974', '3', '1', '0'], ['1974', '4', '1', '0'], ['1974', '5', '1', '0'], ['1974', '6', '1', '0'], ['1974', '7', '1', '0'], ['1974', '8', '1', '0'], ['1974', '9', '1', '0'], ['1974', '10', '1', '0'], ['1974', '11',

In [6]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.

    Args:
        date: Sequence of strings whose first two items are the year and the
            month number (e.g. ['2020', '8', '1', '0']); further items are
            ignored.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    '''
    # The base path already ends with a slash, so segments are joined with a
    # single '/' each (no empty path segment after "v1").
    url = 'https://api.nytimes.com/svc/archive/v1/' + date[0] + '/' + date[1] + '.json'
    # Use the notebook-level `key` instead of a second hard-coded copy of the
    # credential; a timeout keeps a stalled connection from hanging the loop.
    response = requests.get(url, params={'api-key': key}, timeout=120)
    # Surface rate-limit/auth failures here rather than as a confusing
    # KeyError later when the error payload is parsed.
    response.raise_for_status()
    payload = response.json()
    # Pause between calls -- presumably to stay under the API rate limit; confirm.
    time.sleep(6)
    return payload


def is_valid(article, date, range_start=None, range_end=None):
    '''An article is only worth checking if it is in range, and has a headline.

    Args:
        article: Mapping describing one article.
        date: Publication date to test.
        range_start: Exclusive lower bound; defaults to the notebook-level `start`.
        range_end: Exclusive upper bound; defaults to the notebook-level `end`.

    Returns:
        True when `date` lies strictly between the bounds and the article has
        a dict-valued 'headline' containing a 'main' entry, otherwise False.
    '''
    lower = start if range_start is None else range_start
    upper = end if range_end is None else range_end
    is_in_range = lower < date < upper
    # .get() so a record without any 'headline' key is rejected instead of
    # raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Args:
        response: Decoded JSON body of an Archive API call; the articles are
            read from response['response']['docs'].

    Returns:
        A DataFrame with one row per article accepted by `is_valid` and the
        columns headline, date, abstract, snippet, lead_paragraph,
        print_section, print_page, news_desk, word_count, subsection_name,
        document_type, type_of_material, section_name, byline, keywords
        (in that order).
    '''
    # Imported explicitly: a bare `import dateutil` is not guaranteed to load
    # the `parser` submodule on every dateutil version.
    from dateutil import parser as date_parser

    # Fields copied as-is from each article; a missing field is stored as None.
    passthrough_fields = (
        'abstract', 'snippet', 'lead_paragraph', 'print_section', 'print_page',
        'news_desk', 'word_count', 'subsection_name', 'document_type',
        'type_of_material', 'section_name', 'byline',
    )
    # Insertion order here fixes the column order of the resulting frame.
    data = {column: [] for column in ('headline', 'date') + passthrough_fields + ('keywords',)}

    for article in response['response']['docs']:
        date = date_parser.parse(article['pub_date']).date()
        # Skip articles outside the date range or without a main headline.
        if not is_valid(article, date):
            continue
        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        for field in passthrough_fields:
            data[field].append(article.get(field))
        # Keep only the keyword values tagged as subjects; an article without
        # a 'keywords' entry gets an empty list.
        data['keywords'].append([
            keyword['value']
            for keyword in article.get('keywords', [])
            if keyword['name'] == 'subject'
        ])
    return pd.DataFrame(data)


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    For every entry in `dates` the month is downloaded with `send_request`,
    converted with `parse_response` and written to
    headlines/<year>-<month>.csv.

    Args:
        dates: List of date entries whose first two items are the year and
            month as strings.

    Returns:
        None. Progress and the total article count are printed.
    '''
    # Nothing to download: report an empty run instead of failing on dates[0].
    if len(dates) == 0:
        print('Number of articles collected: 0')
        return
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # Creates the output directory if needed; no error when it already exists.
    os.makedirs('headlines', exist_ok=True)
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        csv_path = 'headlines/' + date[0] + '-' + date[1] + '.csv'
        df.to_csv(csv_path, index=False)
        print('Saving ' + csv_path + '...')
    print('Number of articles collected: ' + str(total))


In [7]:
# Download and save every month in the range. send_request pauses 6 seconds
# after each call, so the run time grows with the number of months.
get_data(months_in_range)

Date range: ['1971', '8', '1', '0'] to ['2021', '7', '1', '0']
Saving headlines/1971-8.csv...
Saving headlines/1971-9.csv...
Saving headlines/1971-10.csv...
Saving headlines/1971-11.csv...
Saving headlines/1971-12.csv...
Saving headlines/1972-1.csv...
Saving headlines/1972-2.csv...
Saving headlines/1972-3.csv...
Saving headlines/1972-4.csv...
Saving headlines/1972-5.csv...
Saving headlines/1972-6.csv...
Saving headlines/1972-7.csv...
Saving headlines/1972-8.csv...
Saving headlines/1972-9.csv...
Saving headlines/1972-10.csv...
Saving headlines/1972-11.csv...
Saving headlines/1972-12.csv...
Saving headlines/1973-1.csv...
Saving headlines/1973-2.csv...
Saving headlines/1973-3.csv...
Saving headlines/1973-4.csv...
Saving headlines/1973-5.csv...
Saving headlines/1973-6.csv...
Saving headlines/1973-7.csv...
Saving headlines/1973-8.csv...
Saving headlines/1973-9.csv...
Saving headlines/1973-10.csv...
Saving headlines/1973-11.csv...
Saving headlines/1973-12.csv...
Saving headlines/1974-1.csv..

Saving headlines/1993-5.csv...
Saving headlines/1993-6.csv...
Saving headlines/1993-7.csv...
Saving headlines/1993-8.csv...
Saving headlines/1993-9.csv...
Saving headlines/1993-10.csv...
Saving headlines/1993-11.csv...
Saving headlines/1993-12.csv...
Saving headlines/1994-1.csv...
Saving headlines/1994-2.csv...
Saving headlines/1994-3.csv...
Saving headlines/1994-4.csv...
Saving headlines/1994-5.csv...
Saving headlines/1994-6.csv...
Saving headlines/1994-7.csv...
Saving headlines/1994-8.csv...
Saving headlines/1994-9.csv...
Saving headlines/1994-10.csv...
Saving headlines/1994-11.csv...
Saving headlines/1994-12.csv...
Saving headlines/1995-1.csv...
Saving headlines/1995-2.csv...
Saving headlines/1995-3.csv...
Saving headlines/1995-4.csv...
Saving headlines/1995-5.csv...
Saving headlines/1995-6.csv...
Saving headlines/1995-7.csv...
Saving headlines/1995-8.csv...
Saving headlines/1995-9.csv...
Saving headlines/1995-10.csv...
Saving headlines/1995-11.csv...
Saving headlines/1995-12.csv...

Saving headlines/2015-4.csv...
Saving headlines/2015-5.csv...
Saving headlines/2015-6.csv...
Saving headlines/2015-7.csv...
Saving headlines/2015-8.csv...
Saving headlines/2015-9.csv...
Saving headlines/2015-10.csv...
Saving headlines/2015-11.csv...
Saving headlines/2015-12.csv...
Saving headlines/2016-1.csv...
Saving headlines/2016-2.csv...
Saving headlines/2016-3.csv...
Saving headlines/2016-4.csv...
Saving headlines/2016-5.csv...
Saving headlines/2016-6.csv...
Saving headlines/2016-7.csv...
Saving headlines/2016-8.csv...
Saving headlines/2016-9.csv...
Saving headlines/2016-10.csv...
Saving headlines/2016-11.csv...
Saving headlines/2016-12.csv...
Saving headlines/2017-1.csv...
Saving headlines/2017-2.csv...
Saving headlines/2017-3.csv...
Saving headlines/2017-4.csv...
Saving headlines/2017-5.csv...
Saving headlines/2017-6.csv...
Saving headlines/2017-7.csv...
Saving headlines/2017-8.csv...
Saving headlines/2017-9.csv...
Saving headlines/2017-10.csv...
Saving headlines/2017-11.csv...


In [8]:
# Load one saved month back from disk to spot-check the CSV output.
df = pd.read_csv('headlines/2020-8.csv')

In [9]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,headline,date,abstract,snippet,lead_paragraph,print_section,print_page,news_desk,word_count,subsection_name,document_type,type_of_material,section_name,byline,keywords
0,A Better Year for Trump’s Family Business (Las...,2020-08-01,"In 2019, the Trump Organization showed improve...","In 2019, the Trump Organization showed improve...",Before the coronavirus ripped through the coun...,A,25.0,Investigative,1187,,article,News,Business Day,"{'original': 'By Ben Protess, Steve Eder and M...","['Coronavirus (2019-nCoV)', 'Hotels and Travel..."
1,Court Frees Michigan Teen Who Was Held for Ski...,2020-08-01,The release came more than a week after a judg...,The release came more than a week after a judg...,The Michigan Court of Appeals on Friday freed ...,,,Express,543,,article,News,U.S.,"{'original': 'By Aimee Ortiz', 'person': [{'fi...","['Juvenile Delinquency', 'Decisions and Verdic..."
2,Federal Agents Don’t Need Army Fatigues,2020-08-01,"If you’re an officer of the law, dress like on...","If you’re an officer of the law, dress like on...","Masked men, clad indistinguishably from soldie...",A,18.0,Editorial,517,,article,Editorial,Opinion,"{'original': 'By The Editorial Board', 'person...","['Demonstrations, Protests and Riots', 'Camouf..."
3,Canada’s Key Role in Creating a Once Awaited V...,2020-08-01,An American researcher created the polio vacci...,An American researcher created the polio vacci...,Canadians don’t have to go back to 1918 and th...,,,Foreign,1104,Canada,article,News,World,"{'original': 'By Ian Austen', 'person': [{'fir...",['Vaccination and Immunization']
4,Quotation of the Day: China Wields Its Securit...,2020-08-01,"Quotation of the Day for Saturday, August 1, 2...","Quotation of the Day for Saturday, August 1, 2...",“They are running short of confidence to face ...,A,3.0,Summary,42,,article,Quote,Today’s Paper,"{'original': '', 'person': [], 'organization':...",[]
