In [1]:
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
from dateutil.relativedelta import relativedelta
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import os

In [2]:
# NYT API key. Prefer supplying it through the NYT_API_KEY environment variable
# so the credential does not have to live in the notebook. The literal fallback
# is kept only so existing runs keep working; it should be rotated and removed.
API_KEY = os.environ.get('NYT_API_KEY') or 'jYRG0WYljyNdIX8Cq8kNcbpaCS8oLwrc'

In [3]:
def send_request(date):
    """Fetch one month from the NYT Archive API and return the decoded JSON.

    Args:
        date: Two-item sequence of strings ``[year, month]`` (for example
            ``['1975', '11']``) used to build the request path.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body is not JSON.
    """
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    # base_url already ends with '/', so no extra separator before the year.
    url = base_url + date[0] + '/' + date[1] + '.json'
    try:
        # NOTE(review): verify=False disables TLS certificate checks while the
        # API key is being sent; kept to preserve existing behaviour, but it
        # should be dropped once the local certificate store is fixed.
        reply = requests.get(
            url,
            params={'api-key': API_KEY},
            verify=False,
            timeout=60,
        )
        # Treat 4xx/5xx answers as failures instead of handing their error
        # payload to the parser as if it were article data.
        reply.raise_for_status()
        return reply.json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the API key.
        print('Request for ' + date[0] + '-' + date[1] + ' failed: ' + type(error).__name__)
        return None
    finally:
        # Pause after every attempt, successful or not (presumably to stay
        # under the API rate limit — confirm against the current quota).
        time.sleep(6)

def is_valid(article, date, start_date=None, end_date=None):
    """Decide whether an article belongs in the Vietnam headline data set.

    An article qualifies when its date lies inside the inclusive window, its
    ``headline`` is a dict with a ``'main'`` entry, and the text ``'Vietnam'``
    occurs in the main headline, the lead paragraph, or any keyword value.

    Args:
        article: Dict describing one article from the API response.
        date: Publication date to compare against the window.
        start_date: Lower bound of the window. Defaults to the module-level
            ``start``.
        end_date: Upper bound of the window. Defaults to the module-level
            ``end``.

    Returns:
        ``True`` if the article should be kept, ``False`` otherwise.
    """
    lower = start if start_date is None else start_date
    upper = end if end_date is None else end_date
    if not lower <= date <= upper:
        return False

    # Check the headline shape before indexing into it, so malformed records
    # are rejected instead of raising.
    headline = article.get('headline')
    if not isinstance(headline, dict) or 'main' not in headline:
        return False

    # Fields may be present but null; `or` substitutes an empty value so the
    # substring tests below never run against None.
    keywords = article.get('keywords') or []
    return (
        'Vietnam' in (headline['main'] or '')
        or 'Vietnam' in (article.get('lead_paragraph') or '')
        or any('Vietnam' in (keyword.get('value') or '') for keyword in keywords)
    )

def parse_response(response):
    """Turn one Archive API payload into a DataFrame of matching articles.

    Args:
        response: Decoded JSON payload; articles are read from
            ``response['response']['docs']``.

    Returns:
        A DataFrame with one row per article accepted by ``is_valid`` and the
        columns headline, date, web_url, doc_type, lead_paragraph,
        material_type, author, section, subsection and keywords. The
        ``keywords`` cell holds the list of keyword values containing
        ``'Vietnam'``.

    Raises:
        KeyError: If the payload lacks ``response``/``docs`` or an article
            lacks ``pub_date``.
    """
    # Import the submodule explicitly: `import dateutil` on its own does not
    # load `dateutil.parser` on older python-dateutil releases.
    from dateutil import parser as date_parser

    columns = [
        'headline', 'date', 'web_url', 'doc_type', 'lead_paragraph',
        'material_type', 'author', 'section', 'subsection', 'keywords',
    ]

    records = []
    for article in response['response']['docs']:
        date = date_parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue

        # byline may be a dict, missing, or present with a non-dict value;
        # only a dict carries the 'original' author string.
        byline = article.get('byline')
        author = byline.get('original') if isinstance(byline, dict) else None

        vietnam_keywords = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if 'Vietnam' in (keyword.get('value') or '')
        ]

        records.append({
            'headline': article['headline']['main'],
            'date': date,
            'web_url': article.get('web_url'),
            'doc_type': article.get('document_type'),
            'lead_paragraph': article.get('lead_paragraph'),
            'material_type': article.get('type_of_material'),
            'author': author,
            'section': article.get('section_name'),
            'subsection': article.get('subsection_name'),
            'keywords': vietnam_keywords,
        })

    # Passing `columns` keeps the column order stable and yields the full
    # header even when no article matched.
    return pd.DataFrame(records, columns=columns)

def get_data(dates):
    """Download, filter and save one CSV per month under ``headlines/``.

    Months whose CSV already exists are skipped, so the function can be re-run
    to resume an interrupted collection.

    Args:
        dates: Sequence of ``[year, month]`` string pairs, in the order they
            should be fetched.

    Returns:
        None. Progress and the number of collected articles are printed.
    """
    if len(dates) == 0:
        # Nothing to do; avoids indexing into an empty sequence below.
        print('No months to collect.')
        return

    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('headlines', exist_ok=True)
    for date in dates:
        print('Working on ' + str(date) + '...')
        csv_path = 'headlines/' + date[0] + '-' + date[1] + '.csv'
        if os.path.exists(csv_path):
            # This month was saved by an earlier run.
            print('Already have ' + csv_path + ', skipping.')
            continue

        response = send_request(date)
        if response is None:
            # No file is written, so a later run will retry this month.
            print('No data returned for ' + str(date) + ', skipping.')
            continue

        df = parse_response(response)
        total += len(df)
        print('Saving ' + csv_path + '...')
        df.to_csv(csv_path, index=False)
    print('Number of articles collected: ' + str(total))


In [140]:
# Inclusive date window for the collection run. is_valid() compares every
# article's publication date against these two module-level names.
start, end = datetime.date(1975, 11, 1), datetime.date(1975, 12, 31)

In [141]:
# [year, month] string pairs (month without zero padding) for every calendar
# month touched by the start..end window. Monthly periods are used instead of
# month-start timestamps so the first month is kept even when `start` is not
# the 1st, and no platform-specific strftime flag ("%-m") is needed.
months = [
    [str(period.year), str(period.month)]
    for period in pd.period_range(start, end, freq='M')
]

In [142]:
get_data(months)

Date range: ['1975', '11'] to ['1975', '12']
Working on ['1975', '11']...
Saving headlines/1975-11.csv...
Working on ['1975', '12']...
Saving headlines/1975-12.csv...
Number of articles collected: 88


In [143]:
from bs4 import BeautifulSoup
import selenium 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.common.proxy import Proxy, ProxyType
# from selenium.webdriver.chrome.options import Options
import time 
import pandas as pd

In [144]:
# Location of the chromedriver binary. Set CHROMEDRIVER_PATH to point at a
# different install without editing the notebook.
chrome_driver_path = os.environ.get('CHROMEDRIVER_PATH', '/usr/local/bin/chromedriver')

service = Service(chrome_driver_path)

# Browser session shared by every get_body() call.
driver = webdriver.Chrome(service=service)

In [145]:
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.chrome.options import Options

In [146]:
def get_body(url):
    """Load an article page in the shared browser and return its paragraph text.

    Args:
        url: Address of the article page. Anything that is not a non-empty
            string (for example the NaN pandas produces for an empty CSV cell)
            is treated as "no article".

    Returns:
        The text of every ``<p>`` element that is not known page boilerplate,
        each followed by a single space; ``''`` when there is no usable URL.
    """
    if not isinstance(url, str) or not url.strip():
        return ''

    boilerplate = {
        'Advertisement',
        'Supported by',
        'Send any friend a story',
        'As a subscriber, you have 10 gift articles to give each month. Anyone can read what you share.',
    }

    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    kept = []
    for tag in soup.find_all('p'):
        text = tag.get_text()
        # The "By" prefix test is meant to drop the byline.
        # NOTE(review): it also drops any body paragraph that starts with "By".
        if text in boilerplate or text.startswith("By"):
            continue
        kept.append(text)

    # Pause between page loads (presumably to avoid hammering the site).
    time.sleep(8)
    return ''.join(text + ' ' for text in kept)

In [149]:
# Load one month of saved headlines for body scraping.
# NOTE(review): the get_data() run shown in this notebook covers 1975-11 and
# 1975-12, so this 1973-12 file must already exist from an earlier run.
df = pd.read_csv(filepath_or_buffer="headlines/1973-12.csv")

In [148]:
# Scrape the text of every article. Mapping over the URL column calls
# get_body once per row without building a full row object for each article.
df['article_body'] = df['web_url'].map(get_body)

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=126.0.6478.115)
Stacktrace:
0   chromedriver                        0x0000000100b1fa20 chromedriver + 4389408
1   chromedriver                        0x0000000100b1832c chromedriver + 4358956
2   chromedriver                        0x0000000100734afc chromedriver + 281340
3   chromedriver                        0x000000010071e154 chromedriver + 188756
4   chromedriver                        0x000000010071e090 chromedriver + 188560
5   chromedriver                        0x00000001007af41c chromedriver + 783388
6   chromedriver                        0x000000010076bed0 chromedriver + 507600
7   chromedriver                        0x000000010076c8a8 chromedriver + 510120
8   chromedriver                        0x0000000100ae73a4 chromedriver + 4158372
9   chromedriver                        0x0000000100aebe08 chromedriver + 4177416
10  chromedriver                        0x0000000100acd064 chromedriver + 4051044
11  chromedriver                        0x0000000100aec6f4 chromedriver + 4179700
12  chromedriver                        0x0000000100ac0064 chromedriver + 3997796
13  chromedriver                        0x0000000100b0a0bc chromedriver + 4300988
14  chromedriver                        0x0000000100b0a238 chromedriver + 4301368
15  chromedriver                        0x0000000100b17f24 chromedriver + 4357924
16  libsystem_pthread.dylib             0x000000018d846f94 _pthread_start + 136
17  libsystem_pthread.dylib             0x000000018d841d34 thread_start + 8


In [90]:
# quit() ends the whole WebDriver session and stops the chromedriver process;
# close() would only close the current browser window.
driver.quit()

In [92]:
# Persist the frame back to the month's CSV, overwriting the previous
# contents of that file; the index is not written.
df.to_csv(path_or_buf="headlines/1973-12.csv", index=False)