In [1]:
# import other libraries
import time
import string
import pandas as pd
import numpy as np
import requests
import os
import random 
from dotenv import load_dotenv

In [2]:
# Read the local .env file into the process environment, then look up the NYT API key.
# NYT_key is None when the "NYT_Key" variable is not set.
load_dotenv()

NYT_key = os.getenv("NYT_Key")


In [3]:
def nyt_wrapper(q, key, bdate, edate, fq, pg, s):
    """
    
    Function sends one request to the New York Times Article Search API with the designated inputs and 
    returns the response body decoded from JSON.
    
    The inputs are handed to requests as a parameter dictionary rather than being pasted into the URL, 
    so every value is URL-encoded (a query containing "&" or "=" can no longer corrupt the request).
    
    Inputs:
    q = Query (string)
    key = NYT API key (string)
    bdate = earliest published date for the articles (string). Must be in YYYYMMDD format.
    edate = latest published date for the articles (string). Must be in YYYYMMDD format.
    fq = filter query (string)
    pg = page in the results (integer)
    s = how the results should be organized (string)
    
    Output: 
    The JSON response from the NYT, decoded into Python objects
    
    Raises:
    requests.HTTPError if the API answers with a 4xx/5xx status (e.g. bad key or rate limit)
    requests.Timeout if no response arrives within 30 seconds
    
    """
    search_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    search_params = {
        "q": q,
        "api-key": key,
        "begin_date": bdate,
        "fq": fq,
        "end_date": edate,
        "page": pg,
        "sort": s,
    }
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    url_response = requests.get(search_url, params=search_params, timeout=30)
    # Fail here, with the HTTP status, instead of passing an error payload downstream.
    url_response.raise_for_status()
    
    return url_response.json()
    

In [4]:
# Running log of every page pulled in this session. It is filled as pages arrive,
# so pages already downloaded are still available if a later request raises.
nyt_pull = []

def multiple_pull_requests(q, key, bdate, edate, fq, pgstart, pgend, s, pause=12):
    """
    
    Function compiles multiple pull requests from the NYT API to create a list of json files/dictionaries. Uses the 
    previously created wrapper to pull the api request. The only thing that changes between each pull request 
    is the page number. Function starts with pg number equal to input pgstart and requests every page up to, 
    but not including, pgend.
    
    Between two consecutive pull requests, function pauses for `pause` seconds so that it doesn't trigger 
    the API limit. There is no pause before the first request or after the last one.
    
    Each call returns a new list holding only the pages pulled by that call. Every page is also appended 
    to the module-level list nyt_pull.
    
    
    Inputs:
    q = Query (string)
    key = NYT API key (string)
    bdate = earliest published date for the articles (string). Must be in YYYYMMDD format.
    edate = latest published date for the articles (string). Must be in YYYYMMDD format.
    fq = filter query (string)
    pgstart = the page that the pulls should start with (integer)
    pgend = the page that the pulls should stop at (integer). This page itself is not pulled.
    s = how the results should be organized (string)
    pause = seconds to wait between two requests (number). Defaults to 12.
    
    Output: 
    List of multiple Json files from the NYT, one per page, in page order
    
    """
        
    pages = []
    for pg in range(pgstart, pgend):
        if pg != pgstart:
            # Wait only between requests; a delay after the final page would be wasted time.
            time.sleep(pause)
        pgresults = nyt_wrapper(q, key, bdate, edate, fq, pg, s) #each pull is one page of results
        pages.append(pgresults)
        nyt_pull.append(pgresults)
    return pages
    
    
# Search settings for the pull
query = "SAG-AFTRA"
begin_date, end_date = "20230615", "20231108"  # YYYYMMDD strings
filterq = "source:(The New York Times)"
pg_start, pg_end = 0, 5  # pulls pages 0 through 4; the end page itself is not requested
sort = "relevance"

# Run the paged pull with the settings above.
NYT_results = multiple_pull_requests(
    q=query,
    key=NYT_key,
    bdate=begin_date,
    edate=end_date,
    fq=filterq,
    pgstart=pg_start,
    pgend=pg_end,
    s=sort,
)

In [5]:
# Running collection of every article record produced by clean_NYT in this session.
clean_NYT_articles = []

def clean_NYT(nyt_json):
    '''
    Function to extract columns of interest from input nyt_json. Function iterates through each json file and 
    collects information on every article listed under ["response"]["docs"], however many there are on that page 
    (a page may hold fewer articles than a full page, or none at all).
    
    Each call returns a new list holding only the articles found in nyt_json. Every record is also appended 
    to the module-level list clean_NYT_articles.
    
    Inputs:
    nyt_json: list of nested json files (dictionaries)
    
    outputs: 
    list of dictionaries, each dictionary containing information about a single article
    
    Raises:
    KeyError if a page has no ["response"]["docs"] entry or an article lacks one of the extracted fields
    '''
    records = []
    for page in nyt_json:
        # Iterate over the articles actually present instead of assuming a fixed count per page.
        for article in page["response"]["docs"]:
            record = {
                "abstract": article["abstract"],
                "weblink": article["web_url"],
                "lead_par": article["lead_paragraph"],
                "source": article["source"],
                "headline": article["headline"]["main"],
                "Authors": article["byline"]["original"],
                "keywords": [keyword["value"] for keyword in article["keywords"]],
            }
            records.append(record)
            clean_NYT_articles.append(record)
    return records

# Reduce the raw API pages to one dictionary per article.
clean_NYT_data = clean_NYT(nyt_json=NYT_results)

In [6]:
# Turn the article dictionaries into a table (one row per article) and preview the first rows.
NYT_dataframe = pd.DataFrame(data=clean_NYT_data)

NYT_dataframe.head(n=5)

Unnamed: 0,abstract,weblink,lead_par,source,headline,Authors,keywords
0,The SAG-AFTRA union told its members not to dr...,https://www.nytimes.com/2023/10/20/arts/sag-af...,"Barbie, Ken and Wednesday Addams costumes are ...",The New York Times,Halloween Is Tricky for Actors on Strike,By Christine Hauser,"[Strikes, Actors and Actresses, Organized Labo..."
1,"SAG-AFTRA, which has been on strike for 114 da...",https://www.nytimes.com/2023/11/04/business/me...,The negotiating committee of the actors’ union...,The New York Times,"Actors’ Union Says It Receives ‘Last, Best and...",By Nicole Sperling,"[SAG-AFTRA, Actors and Actresses, Strikes, Org..."
2,The entertainment companies are growing optimi...,https://www.nytimes.com/2023/10/29/business/me...,Following several productive days at the negot...,The New York Times,Studios Said to See Progress in Talks With Str...,By Brooks Barnes,"[Organized Labor, Movies, Actors and Actresses..."
3,Talks aimed at ending the strike by the actors...,https://www.nytimes.com/2023/10/21/business/ac...,The major entertainment studios and the union ...,The New York Times,Actors and Hollywood Studios to Restart Negoti...,By Nicole Sperling,"[Strikes, Actors and Actresses, Organized Labo..."
4,The Hollywood cliché of performers working as ...,https://www.nytimes.com/2023/10/16/dining/acto...,"In January, Francesca Xuereb took the leap man...",The New York Times,"As the Strike Wears On, Actors Turn to a Famil...",By Meghan McCarron,"[Television, Movies, Organized Labor, Restaura..."


In [7]:
# Save the article table as a comma-separated UTF-8 file, leaving out the row index.
NYT_dataframe.to_csv(
    "NYT_SAG-AFTRA_Strike_Articles.csv",
    sep=',',
    index=False,
    encoding='utf-8',
)