In [2]:
import requests
import time
import dateutil.parser
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [8]:
# NYT API key. Read from the NYT_API_KEY environment variable when it is set,
# so a real credential never has to be saved inside the notebook; the literal
# 'key' placeholder remains the fallback.
API_KEY = os.environ.get('NYT_API_KEY', 'key')

# Inclusive publication-date window checked by is_valid(), in YYYY-MM-DD format.
start = dateutil.parser.parse('2023-01-01').date()
end = dateutil.parser.parse('2023-12-31').date()

In [9]:
def send_request(date):
    """Request one month from the NYT Archive API and return the decoded JSON.

    Args:
        date: Two-item sequence ``(year, month)``. Items may be strings or
            integers, e.g. ``('2023', '1')`` or ``(2023, 1)``.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server replies with an HTTP error status, or the body is not JSON.
    """
    # Build the path with single slashes only; the previous concatenation
    # produced '.../archive/v1//<year>/<month>.json'.
    url = f'https://api.nytimes.com/svc/archive/v1/{date[0]}/{date[1]}.json'
    try:
        # Certificate verification is left at the requests default (enabled)
        # so the API key is not sent over an unauthenticated connection. The
        # timeout stops a stalled connection from blocking the notebook.
        reply = requests.get(url, params={'api-key': API_KEY}, timeout=60)
        # Without this an error reply (e.g. a rate-limit fault document)
        # would be decoded and returned as if it were article data.
        reply.raise_for_status()
        return reply.json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is printed: requests error messages embed
        # the full URL, which includes the api-key query parameter.
        print(f"Error requesting {url}: {type(exc).__name__}")
        return None
    finally:
        # Pause after every call, failed ones included, so a caller looping
        # over months does not retry immediately.
        # NOTE(review): presumably sized for the API's per-minute rate limit;
        # confirm 6 seconds against the current NYT developer guidance.
        time.sleep(6)

def is_valid(article, date):
    """Return True when an article is in the date window and mentions Vietnam.

    An article qualifies when all of the following hold:
      * ``date`` lies within the module-level ``start``/``end`` window
        (both bounds inclusive);
      * ``article['headline']`` is a dict that has a ``'main'`` entry;
      * the string ``'Vietnam'`` occurs in the main headline, in the lead
        paragraph, or in the ``'value'`` of at least one keyword.

    Missing or ``None`` fields are treated as empty instead of raising.

    Args:
        article: Dict describing one article.
        date: Publication date, comparable with ``start`` and ``end``.

    Returns:
        bool: Whether the article should be kept.
    """
    if not (start <= date <= end):
        return False

    # Check the headline shape BEFORE reading headline['main']; previously the
    # guard was computed but the lookup happened regardless, so a malformed
    # headline raised instead of being rejected.
    headline = article.get('headline')
    if not isinstance(headline, dict) or 'main' not in headline:
        return False

    # `or ''` covers fields that are present but set to None.
    if 'Vietnam' in (headline['main'] or ''):
        return True
    if 'Vietnam' in (article.get('lead_paragraph') or ''):
        return True
    return any(
        'Vietnam' in (keyword.get('value') or '')
        for keyword in (article.get('keywords') or [])
    )

def parse_response(response):
    """Turn one Archive API payload into a DataFrame of matching articles.

    Every article under ``response['response']['docs']`` that passes
    ``is_valid`` becomes one row. The ``keywords`` column keeps only the
    keyword values containing ``'Vietnam'``; ``full_text`` is filled with an
    empty string as a placeholder.

    Args:
        response: Decoded JSON payload as returned by ``send_request``. May be
            ``None`` (failed request) or lack the ``response``/``docs``
            entries, in which case an empty frame is returned.

    Returns:
        pandas.DataFrame with the columns headline, date, web_url, doc_type,
        lead_paragraph, material_type, author, section, subsection, keywords
        and full_text (in that order).
    """
    columns = [
        'headline', 'date', 'web_url', 'doc_type', 'lead_paragraph',
        'material_type', 'author', 'section', 'subsection', 'keywords',
        'full_text',
    ]

    # send_request() returns None on failure, so tolerate a missing payload
    # or missing nesting levels instead of raising on the subscript.
    payload = response.get('response') if isinstance(response, dict) else None
    articles = payload.get('docs') if isinstance(payload, dict) else None

    rows = []
    for article in articles or []:
        pub_date = article.get('pub_date')
        if not pub_date:
            # Without a publication date the range check cannot be applied.
            continue
        date = dateutil.parser.parse(pub_date).date()
        if not is_valid(article, date):
            continue

        # Guard against a byline that is present but not a dict (e.g. None
        # or an empty list), which would break a chained .get().
        byline = article.get('byline')
        author = byline.get('original') if isinstance(byline, dict) else None

        vietnam_keywords = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if 'Vietnam' in (keyword.get('value') or '')
        ]

        # One record per article keeps all columns in step by construction.
        rows.append({
            'headline': article['headline']['main'],
            'date': date,
            'web_url': article.get('web_url'),
            'doc_type': article.get('document_type'),
            'lead_paragraph': article.get('lead_paragraph'),
            'material_type': article.get('type_of_material'),
            'author': author,
            'section': article.get('section_name'),
            'subsection': article.get('subsection_name'),
            'keywords': vietnam_keywords,
            'full_text': '',  # Placeholder for full text
        })

    return pd.DataFrame(rows, columns=columns)

def get_article_text(url, driver_path='path_to_chromedriver'):
    """Load a page in headless Chrome and return the text of its <p> tags.

    A fresh Chrome session is started for each call and always shut down
    before returning.

    Args:
        url: Address of the page to load.
        driver_path: Filesystem path to the chromedriver executable. The
            default is a placeholder and must be replaced with a real path.

    Returns:
        str: Text of every ``<p>`` element joined by single spaces, or an
        empty string when loading or parsing the page fails (the error is
        printed).
    """
    options = Options()
    # Request headless mode through a Chrome argument. The `headless`
    # attribute setter was deprecated and later removed in Selenium 4, after
    # which assigning it no longer has any effect on the browser.
    options.add_argument('--headless=new')
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        # Wait up to 10 seconds for the <body> element to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        full_text = ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Best effort: report the problem and fall back to an empty text.
        print(f"Error retrieving text from {url}: {e}")
        full_text = ''
    finally:
        # Always release the browser process, even after an error.
        driver.quit()

    return full_text
