## 01_NYT_Data

This script collects headlines and related metadata for New York Times articles about NYC's Specialized High Schools Admissions Test (SHSAT), focusing on articles published from 2018 to the present. It collects the data via API calls, wrangles it into a uniform structure, and saves the results to /data.

In [1]:
#set library
import requests
from dotenv import load_dotenv
import pandas as pd
import os
import time
import numpy as np
from bs4 import BeautifulSoup
import json

### NY Times 

In [2]:
#populate the environment via python-dotenv, then read the key sent with each NYT request
#(api_key is None when the variable is not set)
load_dotenv()
api_key = os.getenv('georgetown_api_key')

In [3]:
#endpoint that every article search request is sent to
base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

In [4]:
#query settings for the article search
#NOTE(review): the multi-word phrases are not quoted -- confirm the API matches them as phrases
search = (
    "SHSAT"
    " OR Specialized High Schools Admissions Test"
    " OR Specialized High Schools Admissions Exam"
)
start, end = "20180101", "20251001"  #date window, written as YYYYMMDD

#pagination state: index of the first page to request and the keep-going flag
pg = 0
counter = True

#collects one list of article records per page fetched
nyt_json = []

In [5]:
#api query with pagination
#Requests one page at a time until the API returns a page with no articles.
#Each non-empty page (a list of article records) is appended to nyt_json.
#On a failed request the loop stops with pg and nyt_json untouched for that page,
#so re-running the cell retries the same page.
while counter:
    #request (the timeout keeps a stalled connection from hanging the notebook)
    response = requests.get(
        base_url,
        params={
            "q": search,
            "api-key": api_key,
            "begin_date": start,
            "end_date": end,
            "page": pg,
        },
        timeout=30,
    )

    #check status
    if response.status_code != 200:
        print(f"query failed (status {response.status_code}) on page {pg}")
        break

    #parse the body once; a missing or null "response"/"docs" is treated as an empty page
    docs = (response.json().get("response") or {}).get("docs") or []

    #pagination
    if not docs:
        #empty page: results are exhausted, so stop without storing it or sleeping
        counter = False
    else:
        #save results
        nyt_json.append(docs)
        pg = pg + 1

        #pause to respect rate limits (only needed when another request follows)
        time.sleep(13)

### Cleaning Output

In [35]:
#empty frame that fixes the column layout of the cleaned output
nyt_df = pd.DataFrame(columns=["link", "headline", "author", "date_published", "abstract"])

In [65]:
#Flatten the paged API output (nyt_json: one list of stories per page) into
#one row per story, using the column layout already defined on nyt_df.
rows = []
for page in nyt_json:
    #skip if page is blank
    if not page:
        continue

    #loop through each story
    for story in page:
        #headline/byline may be missing or null; fall back to None instead of raising
        headline = story.get("headline") or {}
        byline = story.get("byline") or {}

        rows.append({
            "link": story.get("web_url"),
            "headline": headline.get("main"),
            "author": byline.get("original"),
            "date_published": story.get("pub_date"),
            "abstract": story.get("abstract"),
        })

#Build the frame in one step rather than concatenating per story. The table is
#rebuilt from nyt_json each time, so re-running this cell does not duplicate rows,
#and the rows get a plain 0..n-1 index.
nyt_df = pd.DataFrame(rows, columns=nyt_df.columns)


In [67]:
#write the cleaned table to disk, leaving out the row index
nyt_df.to_csv(path_or_buf="../data/nyt_results.csv", index=False)