## 03_NYDaily_Data

This scripts collects headlines and related metadata from the New York Daily News that are related to NYC's Specialized High School Exam (SHSAT), with a focus on articles between the years of 2018-Present. It collects data via webscraping, wrangles data into a uniform structure, and saves the results into /data.

In [11]:
#libraries
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup 
import time 

### NY Daily News Post

In [12]:
#set url
base_url = "https://www.nydailynews.com/page/"

In [13]:
#set parameters
pg = 1
stop = False
headline_count = None

#set storage
nydn_pages = []

In [14]:
#web scrape with pagination
while stop == False:
    #create query url
    query = base_url + str(pg) + "/?s=SHSAT+OR+Specialized+High+Schools+Admissions+Test+OR+Specialized+High+Schools+Ad&order=desc&orderby=relevance&category_name&obit__spotlight&obit__site_name"

    #scrape page
    response = requests.get(query)

    #check status
    if response.status_code != 200:
        print("query failed")
        break
    
    #retrieve html
    pg_soup = BeautifulSoup(response.content, 'html.parser')

    #save results
    nydn_pages.append(pg_soup)

    #see how many total results there are
    if pg == 1:
        result_text = pg_soup.find("div", class_ = "sort-filter").find("span", class_ = "results").get_text()
        headline_count = int("".join([char for char in result_text if char.isdigit()]))
    
    #see total page results
    page_results = len(pg_soup.find("div", class_ = "content-wrapper").find("div", class_ = "search-content filter-open load-more-wrapper").find_all("article"))

    #update
    headline_count = headline_count - page_results
    if headline_count > 0:
        pg = pg + 1
    else: 
        stop = True

    #pause to respect rate limits
    time.sleep(13)
    

### Cleaning Output

In [15]:
nydn_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published"]
)

In [16]:
for page in nydn_pages:
    #get results for each page
    pg_results = page.find("div", class_ = "content-wrapper").find("div", class_ = "search-content filter-open load-more-wrapper").find_all("article")

    #create temp dictionary
    pg_dict = dict.fromkeys(nydn_df.columns)
    
    #loop through each story
    for story in pg_results:
        pg_dict["link"] = [story.find("a")["href"]]
        pg_dict["headline"] = [story.find("h2").find("a", class_ = "article-title").find("span").get_text(strip = True)]
        pg_dict["author"] = [story.find("div", class_ = "entry-meta").find("div", class_ = "byline").find("a").get_text()]
        pg_dict["date_published"] = [story.find("div", class_ = "entry-meta").find("time")["datetime"]]

        nydn_df = pd.concat([nydn_df, pd.DataFrame(pg_dict)])


In [17]:
nydn_df

Unnamed: 0,link,headline,author,date_published
0,https://www.nydailynews.com/2025/08/07/zohran-...,"Zohran Mamdani, once against the specialized h...",Cayla Bamberger,2025-08-07 17:43:04
0,https://www.nydailynews.com/2024/12/18/nyc-edu...,NYC education panel approves digital version o...,Cayla Bamberger,2024-12-18 23:36:05
0,https://www.nydailynews.com/2024/11/23/push-to...,Push to digitize NYC entrance exam for special...,Cayla Bamberger,2024-11-23 16:00:52
0,https://www.nydailynews.com/2024/05/07/putting...,Putting the test to the test: NYC’s Specialize...,New York Daily News Editorial Board,2024-05-07 04:00:46
0,https://www.nydailynews.com/2024/05/15/city-st...,City still paying big for ex-mayor’s shortcuts,Richard Steier,2024-05-15 05:00:11
0,https://www.nydailynews.com/2023/02/14/embattl...,Embattled principal at LaGuardia High for arts...,Cayla Bamberger,2023-02-14 18:03:18
0,https://www.nydailynews.com/2021/05/03/women-c...,Women candidates for NYC mayor share vision fo...,Shant Shahrigian,2021-05-03 19:31:04
0,https://www.nydailynews.com/2021/04/07/andrew-...,Andrew Yang goes on defensive at teachers’ uni...,Shant Shahrigian,2021-04-07 19:25:42
0,https://www.nydailynews.com/2021/02/07/nyc-may...,NYC mayoral wannabe Maya Wiley backs tax hikes...,Shant Shahrigian,2021-02-07 19:07:28
0,https://www.nydailynews.com/2020/05/27/readers...,"Readers sound off on energy, opening churches ...",Voice of the People,2020-05-26 23:00:00


In [18]:
#save data
nydn_df.to_csv("../data/nydn_results.csv", index = False)