## 01_NYT_Data

This script collects headlines and related metadata from New York Times articles about NYC's Specialized High Schools Admissions Test (SHSAT), focusing on articles published from 2018 to the present. It collects the data via API calls, wrangles it into a uniform structure, and saves the results into /data.

In [12]:
#import libraries
import requests
from dotenv import load_dotenv
import pandas as pd
import os
import time
import numpy as np
from bs4 import BeautifulSoup
import json

### NY Times 

In [2]:
#read variables from the .env file into the environment, then look up the key
#that is sent as "api-key" with every request (None when the variable is unset)
load_dotenv()
api_key = os.getenv('georgetown_api_key')

In [3]:
# endpoint that every article search request in this notebook is sent to
base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

In [4]:
#container that collects the results returned by the api
nyt_json = []

#search terms sent as the "q" parameter
search = "SHSAT OR Specialized High Schools Admissions Test OR Specialized High Schools Admissions Exam"

#date window for the search, written as YYYYMMDD strings
start, end = "20180101", "20251001"

#pagination state: first page to request and the flag that keeps the loop running
pg = 0
counter = True

In [5]:
#api query with pagination
#Requests one page of results per iteration, starting at page `pg`, and appends
#each non-empty page (a list of article records) to `nyt_json`. The loop stops
#when a page comes back with no docs or when a request fails. On failure `pg`
#and `counter` are left untouched, so re-running this cell resumes at the page
#that failed.
while counter:
    #request
    try:
        response = requests.get(
            base_url,
            params={
                "q": search,
                "api-key": api_key,
                "begin_date": start,
                "end_date": end,
                "page": pg,
            },
            timeout=30,  #without a timeout a stalled connection blocks forever
        )
    except requests.RequestException as err:
        #connection errors and timeouts are handled like a failed query
        print("query failed:", err)
        break

    #check status
    if response.status_code != 200:
        print("query failed")
        break

    #parse the body once; a missing or null "response"/"docs" counts as an empty page
    docs = (response.json().get("response") or {}).get("docs") or []

    #pagination
    if docs:
        #save results
        nyt_json.append(docs)
        pg = pg + 1

        #pause to respect rate limits (only needed when another request follows)
        time.sleep(13)
    else:
        #an empty page means there is nothing left to fetch; it is not stored
        counter = False

In [14]:
#serialize the collected results as indented JSON (non-ASCII characters kept
#as-is) and write them to the data folder
output_path = "../data/nyt_results.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(nyt_json, ensure_ascii=False, indent=2))