## 02_NYPost_Data

This script collects headlines and related metadata from New York Post articles about NYC's Specialized High Schools Admissions Test (SHSAT), with a focus on articles published from 2018 to the present. It collects the data via web scraping, wrangles it into a uniform structure, and saves the results into /data.

In [1]:
#libraries
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup 
import time 

### NY Post

In [2]:
#search endpoint; the three search phrases are chained with "+OR+" inside the URL path
base_url = (
    "https://nypost.com/search/"
    "SHSAT"
    "+OR+Specialized+High+Schools+Admissions+Test"
    "+OR+Specialized+High+Schools+Admissions+Exam/"
)

In [3]:
#pagination state used by the scraping loop
pg, stop = 0, False       #page number to request / flag that ends the loop
headline_count = None     #total number of search hits; unknown until a page is parsed

#one parsed BeautifulSoup document per fetched results page
nyp_pages = list()

In [None]:
#web scrape with pagination
#
#Requests the search-result pages one at a time and appends every parsed page
#to nyp_pages. The loop ends when
#  (a) a request does not come back with HTTP 200,
#  (b) a page contains no stories, or
#  (c) the number of distinct stories seen reaches the total in the results header.

#identifiers of the stories seen so far; distinct stories are counted (instead of
#summing per-page counts) so that a page served twice cannot end the crawl early
seen_stories = set()

while not stop:
    #create query url
    query = f"{base_url}page/{pg}/?orderby=relevance"

    #scrape page; the timeout keeps a stalled connection from hanging the loop
    response = requests.get(query, timeout = 30)

    #check status
    if response.status_code != 200:
        print("query failed")
        break

    #retrieve html
    pg_soup = BeautifulSoup(response.content, 'html.parser')

    #save results
    nyp_pages.append(pg_soup)

    #see how many total results there are (read once, from the first page parsed)
    if headline_count is None:
        count_text = pg_soup.find("div", class_ = "search-results__header").find("h2").find("em").text
        #drop commas in case the total is rendered with a thousands separator
        headline_count = int(count_text.replace(",", ""))

    #collect this page's stories; a page without the results container counts as empty
    content = pg_soup.find("div", class_ = "page__content search-results")
    stories_box = content.find("div", "search-results__stories") if content is not None else None
    stories = stories_box.find_all("div", class_ = "search-results__story") if stories_box is not None else []

    #register each story by its link, falling back to its markup when it has no link
    for story in stories:
        anchor = story.find("a")
        link = anchor.get("href") if anchor is not None else None
        seen_stories.add(link if link is not None else str(story))

    #update
    if not stories or len(seen_stories) >= headline_count:
        stop = True
    else:
        pg = pg + 1

        #pause to respect rate limits (only needed when another request follows)
        time.sleep(13)
    

query failed


### Cleaning Output

In [55]:
#empty results table: one row per story will be added during cleaning
story_fields = ["link", "headline", "author"]
nyp_df = pd.DataFrame(columns = story_fields)

In [57]:
#Flatten every saved results page into one row per story.
#Rows are gathered in a plain list and converted to a DataFrame once, which
#avoids re-copying the whole table on every story (pd.concat inside the loop).
story_rows = []

for page in nyp_pages:
    #get results for each page; skip pages that lack the results container
    content = page.find("div", class_ = "page__content search-results")
    stories_box = content.find("div", "search-results__stories") if content is not None else None
    if stories_box is None:
        continue

    #loop through each story
    for story in stories_box.find_all("div", class_ = "search-results__story"):
        anchor = story.find("a")
        title = story.find("h3")
        #first <span> in the story; presumably the byline - verify against the page markup
        byline = story.find("span")

        #a missing element is recorded as None instead of aborting the whole pass
        story_rows.append({
            "link": anchor.get("href") if anchor is not None else None,
            "headline": title.get_text() if title is not None else None,
            "author": byline.get_text() if byline is not None else None,
        })

#append the new rows to whatever nyp_df already holds, with a fresh 0..n-1 index
if story_rows:
    scraped_df = pd.DataFrame(story_rows, columns = nyp_df.columns)
    if nyp_df.empty:
        nyp_df = scraped_df
    else:
        nyp_df = pd.concat([nyp_df, scraped_df], ignore_index = True)


In [60]:
#keep only the first occurrence of each fully identical row
repeat_mask = nyp_df.duplicated(keep = "first")
nyp_df = nyp_df[~repeat_mask]

In [61]:
#write the cleaned table to the data folder, leaving out the row index
output_path = "../data/nyp_results.csv"
nyp_df.to_csv(output_path, index = False)