## 02_NYPost_Data

This scripts collects headlines and related metadata from the New York Post that are related to NYC's Specialized High School Exam (SHSAT), with a focus on articles between the years of 2018-Present. It collects data via webscraping, wrangles data into a uniform structure, and saves the results into /data.

In [1]:
#libraries
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup 
import time 

### NY Post

In [2]:
#set url
base_url = "https://nypost.com/search/SHSAT+OR+Specialized+High+Schools+Admissions+Test+OR+Specialized+High+Schools+Admissions+Exam/"

In [3]:
#set parameters
pg = 1
stop = False
headline_count = None

#set storage
nyp_pages = []

In [4]:
#web scrape with pagination
while stop == False:
    #create query url
    query = base_url + "page/" + str(pg) + "/?orderby=relevance"

    #scrape page
    response = requests.get(query)

    #check status
    if response.status_code != 200:
        print("query failed")
        break
    
    #retrieve html
    pg_soup = BeautifulSoup(response.content, 'html.parser')

    #save results
    nyp_pages.append(pg_soup)

    #see how many total results there are
    if pg == 1:
        headline_count = int(pg_soup.find("div", class_ = "search-results__header").find("h2").find("em").text)
    
    #see total page results
    page_results = len(pg_soup.find("div", class_ = "page__content search-results").find("div", "search-results__stories").find_all("div", class_ = "search-results__story"))

    #update
    headline_count = headline_count - page_results
    if headline_count > 0:
        pg = pg + 1
    else: 
        stop = True

    #pause to respect rate limits
    time.sleep(13)
    

### Cleaning Output

In [5]:
nyp_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published"]
)

In [6]:
for page in nyp_pages:
    #get results for each page
    pg_results = page.find("div", class_ = "page__content search-results").find("div", "search-results__stories").find_all("div", class_ = "search-results__story")

    #create temp dictionary
    pg_dict = dict.fromkeys(nyp_df.columns)
    
    #loop through each story
    for story in pg_results:
        pg_dict["link"] = [story.find("a")["href"]]
        pg_dict["headline"] = [story.find("h3").get_text(strip = True)]
        pg_dict["author"] = [story.find("span").get_text().split('\xa0')[0].strip().replace("By", "").strip()]
        pg_dict["date_published"] = [story.find("span").get_text().split('\xa0')[1].strip().replace('\n', '').replace('\t', '').replace("|", "")]

        nyp_df = pd.concat([nyp_df, pd.DataFrame(pg_dict)])


In [7]:
nyp_df

Unnamed: 0,link,headline,author,date_published
0,https://nypost.com/2025/06/21/us-news/nyc-soci...,NYC socialist mayoral candidate Zohran Mamdani...,Rich Calder,"June 21, 2025 5:00pm"
0,https://nypost.com/2024/11/16/us-news/nyc-stud...,NYC students' futures could be derailed over d...,Deirdre Bardolf,"November 16, 2024 10:11am"
0,https://nypost.com/2024/12/14/us-news/fight-in...,Fight intensifies over entrance exam for NYC's...,Deirdre Bardolf,"December 14, 2024 12:04pm"
0,https://nypost.com/2024/12/19/us-news/contract...,Contract for controversial entrance exam into ...,Aneeta Bhole,"December 19, 2024 12:11am"
0,https://nypost.com/2024/12/04/opinion/ny-publi...,Activist sneak attack could kill admissions te...,Yiatin Chu and Lisa Marks,"December 4, 2024 8:50pm"
...,...,...,...,...
0,https://nypost.com/2012/09/09/how-to-land-your...,How to land your top pick,Susan Edelman,"September 9, 2012 4:00am"
0,https://nypost.com/2012/09/09/11-30/,11-30,Post Staff Report,"September 9, 2012 4:00am"
0,https://nypost.com/2012/09/09/why-some-schools...,Why some schools rocket to the top,Post Staff Report,"September 9, 2012 4:00am"
0,https://nypost.com/2011/08/27/the-top-10-2/,The top 10,Post Staff Report,"August 27, 2011 12:07am"


In [8]:
nyp_df.to_csv("../data/nyp_results.csv", index = False)