Obtain NYT URL's

In [3]:
#Import necessary libraries
import time
import json
import requests
import urllib
import lxml.html
from bs4 import BeautifulSoup
import newspaper
import trafilatura
import calendar
import os

In [4]:
nyt_api_key = "AUtPPNuICzon3X9uEaX2k4Dgs0YGBA25"

In [5]:
"Standard values"
news_desk = "politics"
query_list = ["Donald Trump", "Joe Biden"]
filter_list =["headline", "lead_paragraph"]
begin_date = "20230601"
end_date="20160101"

In [6]:
def create_url(tags=query_list,news_desk=news_desk, section = "Washington", filters=filter_list, begin_date="20160101",
 end_date="20230601", page="0"):
    """
    Create request url for API based on query search parameters passed to the
        function.
    
    Inputs:
        tags (lst): list of tags (strings) to look for. The tags to filter for
            are looked in the filters defined in the "filters" parameter.
        filters (lst): list of filters where to look tags. They can be "headline",
            "lead_paragraph" and/or "body"
        begin_date (str): 8 digits (YYYYMMDD) string that specify the begin date
            or from when to start looking for articles.
        end_date (str): 8 digits (YYYYMMDD) string that specify the end date or
            until when to stop looking for articles.
        page (str): number of page string that states where to look for articles.

    Return (str): URL string with query to send request to NYT Article Search 
        API
    """

    endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json?" 
    tags_copy = tags[:]
    filters_copy = filters[:]

    #headline = "(headline=" + " OR ".join(filters_copy) + ")"
    #news_desk = "(news_desk)" + news_desk

    for i,tag in enumerate(tags_copy):
        tags_copy[i] = "\"" + tag + "\""
    for i, fil in enumerate(filters_copy):
        filters_copy[i] = fil + ":(" + " OR ".join(tags_copy) + ")"
    
    fq = "fq=" + " OR ".join(filters_copy) + " AND news_desk=" + news_desk + " AND section=" + section
    fq = "fq=" + "news_desk=" + news_desk + "AND section=" + section
    url = endpoint + fq + "&begin_date=" + begin_date + "&end_date=" + end_date +\
            "&page=" + page + "&sort=oldest" + "&api-key=" + nyt_api_key

    return url

create_url()

'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=news_desk=politicsAND section=Washington&begin_date=20160101&end_date=20230601&page=0&sort=oldest&api-key=AUtPPNuICzon3X9uEaX2k4Dgs0YGBA25'

In [7]:
def make_request(tags=query_list,news_desk=news_desk, section = "Washington", filters=filter_list, begin_date="20210101",
 end_date="20230101", page="0"):
    """
    Make a GET request to the NYT Article Search API with a request delay of 6
        seconds to avoid reaching request limit of 60 requests per minute.

    Inputs:
        tags (lst): list of tags (strings) to look for. The tags to filter for
            are looked in the body, headline and byline of the articles.
        filters (lst): list of filters where to look tags. They can be "headline",
            "lead_paragraph" and/or "body"
        begin_date (str): 8 digits (YYYYMMDD) string that specify the begin date
            or from when to start looking for articles.
        end_date (str): 8 digits (YYYYMMDD) string that specify the end date or
            until when to stop looking for articles.
        page (str): number of page string that states where to look for articles.
    
    Return (Response): API request response with specified query parameters
    """

    url = create_url(tags, news_desk, section, filters, begin_date, end_date, page)
    time.sleep(1)
    resp = requests.get(url)

    return resp



In [62]:
test = make_request()

In [63]:
resp_json = json.loads(test.text)
resp_json

{'status': 'OK',
 'copyright': 'Copyright (c) 2023 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'abstract': 'It isn’t voting that needs fixing, but what comes after it and the public’s confidence in the results.\xa0',
    'web_url': 'https://www.nytimes.com/2022/01/03/opinion/voting-rights-democrats.html',
    'snippet': 'It isn’t voting that needs fixing, but what comes after it and the public’s confidence in the results.\xa0',
    'lead_paragraph': 'With their legislative agenda stymied for now, Democrats reportedly are hoping to take another crack at election reform. The Senate majority leader, Chuck Schumer, and President Biden have both identified voting rights legislation as a top priority.',
    'print_section': 'A',
    'print_page': '17',
    'source': 'The New York Times',
    'multimedia': [{'rank': 0,
      'subtype': 'xlarge',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2022/01/03/multimedia/03levin

In [8]:
def get_json(tags=query_list,news_desk=news_desk, section = "Washington", filters=filter_list, begin_date="20210101",
 end_date="20230101"):
    """
    Create json files from articles that meet query search parameters for a
    specific year because of API restrictions with numbers of pages

    Inputs:
        tags (lst): list of tags to look for. The tags to filter for are looked
            in the body, headline and byline of the articles.
        filters (lst): list of filters where to look tags. They can be "headline",
            "lead_paragraph" and/or "body"
        begin_date (str): 8 digits (YYYYMMDD) string that specify the begin date
            or from when to start looking for articles.
        end_date (str): 8 digits (YYYYMMDD) string that specify the end date or
            until when to stop looking for articles.
    """
    year = begin_date[:4]
    month_name = calendar.month_name[int(begin_date[4:6])]
    resp = make_request(tags,news_desk, section , filters, begin_date,
    end_date, page)
    resp_json = json.loads(resp.text)
    current_dir = "/Users/jpmartinezclaeys/Desktop/U Chicago/James Turk - RA/newsfaces" #Update with os capabilities
    file_name = os.path.join(current_dir, "raw_data", year, month_name,
                            "nyt_0.json")
    
    with open(file_name, "w") as f:
        json.dump(resp_json, f, indent=1)
        f.close()

    # Get number of articles that match our query search parameters
    hits = resp_json["response"]["meta"]["hits"]

    # Get maximum number of pages we can query
    max_pages = int(hits / 10)

    # Query everything and save the jsons
    for page_n in range(1, max_pages + 1):
        page_str = str(page_n)
        resp = make_request(tags,news_desk, section, filters, begin_date,
        end_date, page_str)
        resp_json = json.loads(resp.text)
        name = "nyt_" + page_str + ".json"
        file_name = os.path.join(current_dir, "test_nyt", year, month_name,
                                name)
        
        with open(file_name, "w") as f:
            json.dump(resp_json, f, indent=1)
            f.close()

In [11]:
def create_dirs(tags=query_list,  filters=filter_list, begin_date="20230101",
 end_date="20230101"):
    """
    Create directories and all json files from articles that meet query search
        parameters

    Inputs:
        tags (lst): list of tags to look for. The tags to filter for are looked
            in the filter sections defined by the "filters" argument.
        filters (lst): list of filters where to look tags. They can be
            "headline", "lead_paragraph" and/or "body"
        begin_date (str): 8 digits (YYYYMMDD) string that specify the begin date
            or from when to start looking for articles.
        end_date (str): 8 digits (YYYYMMDD) string that specify the end date or
            until when to stop looking for articles.
    """
    
    current_dir = "/Users/jpmartinezclaeys/Desktop/U Chicago/James Turk - RA/newsfaces"
    path = os.path.join(current_dir, "test_nyt")
    
    begin_year = int(begin_date[:4])
    end_year = int(end_date[:4])
    first_month = 1
    last_month = 12

    # We create JSONs for each year and month because the NYT API has a page
    # limit of 200 pages per query search
    for year in range(begin_year, end_year + 1):
        if year == end_year:
            last_month = int(end_date[4:6])

        for month in range(first_month, last_month + 1):
            year_str = str(year)
            month_dir = calendar.month_name[month]
            new_dir = os.path.join(path, year_str, month_dir)
            if os.path.exists(new_dir) == False:
                os.makedirs(new_dir)
            
            month_str = str(month) 
            if len(month_str) == 1:
                month_str = "0" + month_str
            
            _, day = calendar.monthrange(year, month)
            begin_new = year_str + month_str + "01"
            end_new = year_str + month_str + str(day)
            get_json(tags, filters, begin_new, end_new)

In [10]:
get_json(tags=query_list,news_desk=news_desk, section = "Washington", filters=filter_list, begin_date="20230101",
 end_date="20230101", page="0")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jpmartinezclaeys/Desktop/U Chicago/James Turk - RA/newsfaces/raw_data/2023/January/nyt_0.json'

In [None]:
create_dirs

In [48]:
#Trafilatura

downloaded = trafilatura.fetch_url('https://www.nytimes.com/politics/first-draft/2016/01/05/ted-cruz-shrugs-off-donald-trumps-questioning-his-citizenship/')
trafilatura.extract(downloaded, include_images=True)

#downloaded = trafilatura.fetch_url("https://www.nytimes.com/2016/01/06/opinion/campaign-stops/purity-disgust-and-donald-trump.html")
#trafilatura.extract(downloaded, include_images=True)

'Ted Cruz Shrugs Off Donald Trump’s Questioning His Citizenship\n![ ](https://static01.nyt.com/images/2016/01/06/us/05firsdraft-trump/05firsdraft-trump-tmagArticle.jpg)\nDonald J. Trump is increasingly taking jabs at Senator Ted Cruz of Texas, who is riding momentum in Iowa less than a month before the caucuses there. But so far, Mr. Cruz is resisting the bait.\nIn the latest in a series of barbs against Mr. Cruz, Mr. Trump said there were questions about whether Mr. Cruz, who was born in Canada but whose mother was a United States citizen, was eligible to seek the presidency, making the comments in interviews with a New Hampshire television station and separately with The Washington Post.\nThe issue could end up tying Mr. Cruz up in court, Mr. Trump told The Post.\nThe Constitution restricts the presidency to a “natural born citizen,” but many legal scholars have said that this would apply to Mr. Cruz, although a similar issue has never been tested in the courts. Mr. Cruz, who was bor