## 01 Data Collection

This script collects headlines and related metadata from NYC-local news outlets for articles that cover NYC's Specialized High Schools Admissions Test (SHSAT). It collects data via either API calls or web scraping, wrangles the data into a uniform structure, and saves the results to /data.

In [None]:
#api calls
from dotenv import load_dotenv

#webscraping
import requests
import os
import time
from bs4 import BeautifulSoup

#selenium
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

#data wrangling
import pandas as pd


#### New York Times API

In [None]:
#load api key
load_dotenv()
api_key = os.environ.get('nyt_api_key')

In [None]:
# set url for api
base_url =  "https://api.nytimes.com/svc/search/v2/articlesearch.json"

In [None]:
#setting parameters
# search is sent as the API's "q" parameter.
# NOTE(review): the multi-word phrases are not quoted — confirm whether the API
# matches them as exact phrases or as independent words.
search = "SHSAT OR Specialized High Schools Admissions Test OR Specialized High Schools Admissions Exam"
# start/end are sent as "begin_date"/"end_date" (written here as YYYYMMDD strings)
start = "20120101"
end = "20251118"
# pg is sent as the "page" parameter and incremented after each non-empty page
pg = 0
# counter keeps the pagination loop running until an empty page comes back
counter = True

#data storage
# one entry per results page; each entry is that page's list of docs
nyt_json = []

In [None]:
#api query with pagination
# Requests one results page at a time until the API returns an empty page or a
# request fails. Each non-empty page's list of docs is appended to nyt_json.
while counter:
    #request (timeout so a stalled connection cannot hang the notebook)
    response = requests.get(
        base_url,
        params = {
            "q": search,
            "api-key": api_key,
            "begin_date": start,
            "end_date": end,
            "page": pg,
        },
        timeout = 30,
    )

    #check status
    if response.status_code != 200:
        print("query failed")
        break

    #parse the body once; tolerate a missing or null "response"/"docs"
    docs = (response.json().get("response") or {}).get("docs") or []

    #pagination: an empty page means there is nothing left to fetch
    if not docs:
        counter = False
        break

    #save results
    nyt_json.append(docs)
    pg = pg + 1

    #pause to respect rate limits (only needed when another request follows)
    time.sleep(13)

In [None]:
#create empty df
nyt_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Flatten every story on every page into one row, then build the frame in a
# single step; concatenating inside the loop copies the whole frame each time
# and leaves every row with index label 0.
nyt_rows = []
for page in nyt_json:
    #skip if page is blank
    if not page:
        continue

    #loop through each story
    for story in page:
        #guard against a missing or null headline/byline object
        headline = story.get("headline") or {}
        byline = story.get("byline") or {}

        nyt_rows.append({
            "link": story.get("web_url"),
            "headline": headline.get("main"),
            "author": byline.get("original"),
            "date_published": story.get("pub_date"),
            "snippet": story.get("abstract"),
        })

#rebuild from scratch so re-running this cell does not duplicate rows
nyt_df = pd.DataFrame(nyt_rows, columns = nyt_df.columns)


In [None]:
#check results
nyt_df.shape

In [None]:
#save results
nyt_df.to_csv("../data/nyt_results.csv", index = False)

#### Functions for Webscraping

In [None]:
def query_url(newsout: str, pg):
    """
    returns specific query url for each
    outlet and page

    newsout: outlet code, one of "NYP", "NYDN", "NY1",
             "Brooklyn" or "City"
    pg: page number; converted with str() and inserted
        into the outlet's search url

    raises ValueError if newsout is not a known outlet code
    """
    #(text before the page number, text after the page number) per outlet
    url_parts = {
        "NYP": ("https://nypost.com/search/SHSAT/page/", "/?orderby=relevance"),
        "NYDN": ("https://www.nydailynews.com/page/", "/?s=SHSAT&orderby=date&order=desc"),
        "NY1": ("https://ny1.com/nyc/all-boroughs/search#SHSAT/", "/publishDate%20desc"),
        "Brooklyn": ("https://brooklyneagle.com/page/", "/?s=SHSAT"),
        "City": (
            "https://www.city-journal.org/search?top=true&limit=12&page-number=",
            "&search=SHSAT&types%5B%5D=article&dates=&sort=desc",
        ),
    }

    #fail with a clear message instead of an UnboundLocalError on a bad code
    if newsout not in url_parts:
        raise ValueError(
            f"unknown news outlet {newsout!r}; expected one of {sorted(url_parts)}"
        )

    prefix, suffix = url_parts[newsout]
    return prefix + str(pg) + suffix


In [None]:
def total_results(newsout: str, pg_soup):
    """
    returns the total number of headlines the
    outlet reports for the search

    newsout: outlet code, "NYP" or "NYDN"
    pg_soup: parsed html of a search results page

    raises ValueError if newsout has no result-count parser
    """
    if newsout == "NYP":
        #the count is the text of the <em> inside the results header's <h2>
        count_text = pg_soup.find("div", class_ = "search-results__header").find("h2").find("em").text
        return int(count_text)

    if newsout == "NYDN":
        result_text = pg_soup.find("div", class_ = "sort-filter").find("span", class_ = "results").get_text()
        #keep only the digits so surrounding words/separators do not break int()
        return int("".join(char for char in result_text if char.isdigit()))

    #previously this fell through to an UnboundLocalError
    raise ValueError(f"total_results does not support news outlet {newsout!r}")

In [None]:
def page_results(newsout: str, pg_soup):
    """
    returns how many headlines are listed
    on the current results page

    newsout: outlet code, "NYP" or "NYDN"
    pg_soup: parsed html of a search results page

    raises ValueError if newsout has no per-page parser
    """
    if newsout == "NYP":
        stories = pg_soup.find("div", class_ = "page__content search-results").find("div", "search-results__stories").find_all("div", class_ = "search-results__story")

    elif newsout == "NYDN":
        stories = pg_soup.find("div", class_ = "content-wrapper").find("div", class_ = "search-content filter-open load-more-wrapper").find_all("article")

    else:
        #previously this fell through to an UnboundLocalError
        raise ValueError(f"page_results does not support news outlet {newsout!r}")

    #local is named differently from the function to avoid shadowing it
    return len(stories)


In [None]:
def web_scraping(newsout: str):
    """
    scrapes newspaper outlets based on
    newsout arg and returns list
    this is only for static news outlets

    newsout: outlet code understood by query_url,
             total_results and page_results
    returns: list with one parsed html object per
             page that was fetched successfully

    stops when the reported total has been collected,
    when a page lists no results, or when a request
    does not return status 200
    """

    #set parameters
    pg = 1
    headline_count = None

    #storage
    pages = []

    #scraping with pagination
    while True:
        #base_url
        query = query_url(newsout, pg)

        #scrape page (timeout so a stalled connection cannot hang the run)
        response = requests.get(query, timeout = 30)

        #check status
        if response.status_code != 200:
            print("query failed")
            break

        #retrieve html
        pg_soup = BeautifulSoup(response.content, 'html.parser')

        #save results
        pages.append(pg_soup)

        #see how many total results there are
        if pg == 1:
            headline_count = total_results(newsout, pg_soup)

        #see total results on current page
        page_result = page_results(newsout, pg_soup)

        #update
        headline_count = headline_count - page_result

        #done when everything is collected; also stop on an empty page, since
        #the remaining count would never shrink and the loop would keep paging
        if headline_count <= 0 or page_result == 0:
            break

        pg = pg + 1

        #pause to respect rate limits (only needed when another request follows)
        time.sleep(13)

    #return scraped results
    return pages


#### New York Post

In [None]:
#scrape data
nyp_pages = web_scraping("NYP")

In [None]:
#create empty df
nyp_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per story and build the frame once at the end; concatenating
# inside the loop copies the whole frame on every story.
nyp_rows = []
for page in nyp_pages:
    #get results for each page
    pg_results = page.find("div", class_ = "page__content search-results").find("div", "search-results__stories").find_all("div", class_ = "search-results__story")

    #loop through each story
    for story in pg_results:
        #the first <span> is split on a non-breaking space into byline and date
        meta_parts = story.find("span").get_text().split('\xa0')

        #drop only a leading "By" label; str.replace would also remove "By"
        #from inside a name (e.g. "Byron")
        author = meta_parts[0].strip()
        if author.split(None, 1)[:1] == ["By"]:
            author = author[2:].strip()

        #store None instead of raising IndexError when the date half is absent
        if len(meta_parts) > 1:
            date_published = meta_parts[1].strip().replace('\n', '').replace('\t', '').replace("|", "")
        else:
            date_published = None

        nyp_rows.append({
            "link": story.find("a")["href"],
            "headline": story.find("h3").get_text(strip = True),
            "author": author,
            "date_published": date_published,
            "snippet": story.find("p").get_text(strip = True),
        })

#rebuild from scratch so re-running this cell does not duplicate rows
nyp_df = pd.DataFrame(nyp_rows, columns = nyp_df.columns)


In [None]:
#check results
nyp_df.shape

In [None]:
#save results
nyp_df.to_csv("../data/nyp_results.csv", index = False)

#### New York Daily News

In [None]:
#scrape data
nydn_pages = web_scraping("NYDN")

In [None]:
#create empty df
nydn_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per story and build the frame once at the end; concatenating
# inside the loop copies the whole frame on every story.
nydn_rows = []
for page in nydn_pages:
    #get results for each page
    pg_results = page.find("div", class_ = "content-wrapper").find("div", class_ = "search-content filter-open load-more-wrapper").find_all("article")

    #loop through each story
    for story in pg_results:
        #byline and timestamp are both looked up inside the entry-meta block
        entry_meta = story.find("div", class_ = "entry-meta")
        byline = entry_meta.find("div", class_ = "byline") if entry_meta is not None else None
        byline_link = byline.find("a") if byline is not None else None
        time_tag = entry_meta.find("time") if entry_meta is not None else None
        excerpt = story.find("div", class_ = "excerpt")

        #a story without a byline, timestamp or excerpt yields None for that
        #column instead of aborting the whole extraction with AttributeError
        nydn_rows.append({
            "link": story.find("a")["href"],
            "headline": story.find("h2").find("a", class_ = "article-title").find("span").get_text(strip = True),
            "author": byline_link.get_text() if byline_link is not None else None,
            "date_published": time_tag.get("datetime") if time_tag is not None else None,
            "snippet": excerpt.get_text(strip = True) if excerpt is not None else None,
        })

#rebuild from scratch so re-running this cell does not duplicate rows
nydn_df = pd.DataFrame(nydn_rows, columns = nydn_df.columns)


In [None]:
#check results
nydn_df.shape

In [None]:
#save data
nydn_df.to_csv("../data/nydn_results.csv", index = False)

#### Chalkbeat

In [None]:
#start selenium
driver = webdriver.Chrome()

In [None]:
#navigate to chalkbeat search
query = "https://www.chalkbeat.org/search/?query=SHSAT"

driver.get(query)

In [None]:
#storing html per page
pages = []

In [None]:
#walk the search results page by page, saving each page's html
try:
    while True:
        #append page html to list
        pages.append(driver.page_source)

        #find next button; only the lookup can raise NoSuchElementException
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.next_btn")
        except NoSuchElementException:
            print("finished!")
            break

        #a next button that is present but not displayed also ends the walk
        if not next_button.is_displayed():
            break

        next_button.click()
        time.sleep(3)
finally:
    #always close the browser, even if a click or page read raises
    driver.quit()

In [None]:
#create empty df
chalkbeat_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per story and build the frame once at the end; concatenating
# inside the loop copies the whole frame on every story.
chalkbeat_rows = []
for pg in pages:
    #get results for each page
    pg_results = BeautifulSoup(pg, 'html.parser').find_all("div", class_ = "queryly_item_row")

    #loop through each story
    for story in pg_results:
        chalkbeat_rows.append({
            "link": "https://www.chalkbeat.org" + story.find("a")["href"],
            "headline": story.find("div", class_ = "queryly_item_title").get_text(strip = True),
            #no author is read from the search results
            "author": None,
            #the date element is matched by its exact inline style string
            "date_published": story.find("div", style = "margin-top:6px;color:#555;font-size:12px;").get_text(strip = True),
            "snippet": story.find("div", class_ = "queryly_item_description").get_text(strip = True),
        })

#rebuild from scratch so re-running this cell does not duplicate rows
chalkbeat_df = pd.DataFrame(chalkbeat_rows, columns = chalkbeat_df.columns)


In [None]:
#check results
chalkbeat_df.shape

In [None]:
#save data
chalkbeat_df.to_csv("../data/chalkbeat_results.csv", index = False)

#### Spectrum NY 1

In [None]:
#start selenium
driver = webdriver.Chrome()

In [None]:
#storing html per page
pages = []

In [None]:
#collect the html of every result card on search pages 1-5
try:
    for pg in range(1,6):
        #find search page
        query = query_url("NY1", pg)
        driver.get(query)

        #let page load
        time.sleep(2)

        #get article html; find_elements returns an empty list when nothing
        #matches, so no NoSuchElementException handling is needed here
        hits = driver.find_elements(By.CLASS_NAME, "hit")

        #extract hits
        page_html = []
        for h in hits:
            try:
                page_html.append(h.get_attribute("outerHTML"))
            except StaleElementReferenceException:
                #the element reference went stale before it was read; skip it
                continue

        #store
        if page_html:
            pages.append(page_html)

        #rest
        time.sleep(3)
finally:
    #always close the browser, even if a page load raises
    driver.quit()

In [None]:
#create empty df
spectrum_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per result card and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every card.
spectrum_rows = []
for pg in pages:
    #loop through each story
    for result in pg:
        #each entry is the html string of a single result card
        story = BeautifulSoup(result,'html.parser')

        spectrum_rows.append({
            "link": "https://ny1.com" + story.find("a")["href"],
            "headline": story.find("div", class_ = "title").get_text(strip = True),
            #no author is read from the result card
            "author": None,
            "date_published": story.find("span").get_text(strip = True),
            "snippet": story.find("div", class_ = "description").get_text(strip = True),
        })

#rebuild from scratch so re-running this cell does not duplicate rows
spectrum_df = pd.DataFrame(spectrum_rows, columns = spectrum_df.columns)


In [None]:
#check results
spectrum_df.shape

In [None]:
#save data
spectrum_df.to_csv("../data/spectrum_results.csv", index = False)

#### amNY

In [None]:
#run query (timeout so a stalled connection cannot hang the notebook)
response = requests.get(
    "https://www.amny.com/?s=SHSAT",
    timeout = 30
)
#fail here on an error status instead of parsing an error page downstream
response.raise_for_status()

In [None]:
#get html
pg_soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
#extract all articles
stories = pg_soup.find("main").find_all("article")

In [None]:
#create empty df
amny_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per article and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every article.
amny_rows = []
for story in stories:
    #the first link supplies both the url and (via its title attribute) the headline
    anchor = story.find("a")
    posted_on = story.find("span", class_ = "posted-on")
    time_tag = posted_on.find("time") if posted_on is not None else None

    #.get returns None for a missing attribute instead of raising KeyError
    amny_rows.append({
        "link": anchor.get("href") if anchor is not None else None,
        "headline": anchor.get("title") if anchor is not None else None,
        #author and snippet are not read from the result list
        "author": None,
        "date_published": time_tag.get_text() if time_tag is not None else None,
        "snippet": None,
    })

#rebuild from scratch so re-running this cell does not duplicate rows
amny_df = pd.DataFrame(amny_rows, columns = amny_df.columns)


In [None]:
#check results
amny_df.shape

In [None]:
#save data
amny_df.to_csv("../data/amny_results.csv", index = False)

#### Brooklyn Daily Eagle

In [None]:
#start selenium/driver, using uc to avoid cloudflare issues
driver = uc.Chrome()

In [None]:
#html storage
pages = []

In [None]:
#scraping pages
# Loads search pages 1-10 and stores each page's html.
try:
    for num in range(1, 11):
        #go to page
        driver.get(query_url("Brooklyn", num))
        #grab html
        pages.append(driver.page_source)
        #rest
        time.sleep(3)
finally:
    #always close the browser, even if a page load raises
    driver.quit()

In [None]:
#create empty df
brooklyn_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per article and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every article.
brooklyn_rows = []
for pg in pages:
    #get bs4 obj
    pg_soup = BeautifulSoup(pg, "html.parser")

    #identify article list; skip just this page when the container is missing
    #so one unusable page does not discard every page after it
    main_container = pg_soup.find("div", class_ = "main-container")
    if main_container is None:
        continue
    article_list = main_container.find_all("article")

    #loop through each story
    for result in article_list:
        anchor = result.find("a")
        #one lookup serves both the date and the snippet, so the guard and the
        #element that is actually used can no longer disagree
        meta = result.find(class_ = "meta")
        #snippet is the text node that follows the meta element, if any
        snippet_node = meta.find_next_sibling(string = True) if meta is not None else None

        brooklyn_rows.append({
            "link": anchor.get("href") if anchor is not None else None,
            "headline": anchor.get("title") if anchor is not None else None,
            #no author is read from the result list
            "author": None,
            "date_published": meta.get_text(strip = True) if meta is not None else None,
            "snippet": snippet_node.strip() if snippet_node is not None else None,
        })

#rebuild from scratch so re-running this cell does not duplicate rows
brooklyn_df = pd.DataFrame(brooklyn_rows, columns = brooklyn_df.columns)


In [None]:
#check results
brooklyn_df.shape

In [None]:
#save data
brooklyn_df.to_csv("../data/brooklyn_results.csv", index = False)

#### Queens Chronicle

In [None]:
#base url
# Search url with 25 results per request (l=25); the page offset ("&o=") is
# appended separately.
# NOTE(review): tncms_csrf_token looks like a token copied from a browser
# session — presumably it expires; verify the search still works with or
# without it before re-running.
base = "https://www.qchron.com/search/?tncms_csrf_token=c4d400dd4c6760f075f26f98b75ee774e2ece127677b3054cf9964704481bab4.0e7fd1a70149b3d138a0&l=25&sort=relevance&f=html&t=article%2Cvideo%2Cyoutube%2Ccollection&app=editorial&nsa=eedition&q=SHSAT"

In [None]:
#get url for each page using offset
query_urls = [base + "&o=" + str(num) for num in range(0, 125, 25)]

In [None]:
#storage
pages = []

In [None]:
#start selenium
driver = webdriver.Chrome()

In [None]:
#webscraping
# Loads each url in query_urls and stores that page's list of result cards
# (already-parsed "card-container" divs, not raw html) in pages.
for pg_url in query_urls:
    #query page
    driver.get(pg_url)
    #get page source
    pg = driver.page_source
    #extract html: every div with class "card-container" on the page
    response = BeautifulSoup(pg, 'html.parser').find_all("div", class_ = "card-container")
    #append
    pages.append(response)
    #rest
    time.sleep(3)

In [None]:
#close selenium
driver.quit()

In [None]:
#create empty df
queens_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per result card and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every card.
queens_rows = []
for pg in pages:
    #loop through each story
    for result in pg:
        anchor = result.find("a")
        href = anchor.get("href") if anchor is not None else None
        headline = result.find("div", class_ = "card-headline")
        #check the <li> before looking inside it; the old guard called .find
        #on the <li> lookup itself and raised when the <li> was missing
        date_item = result.find("li", class_ = "card-date")
        time_tag = date_item.find("time") if date_item is not None else None
        lead = result.find("div", class_ = "card-lead")

        queens_rows.append({
            "link": ("https://www.qchron.com/" + href) if href else None,
            "headline": headline.get_text(strip = True) if headline is not None else None,
            #no author is read from the result card
            "author": None,
            "date_published": time_tag.get("datetime") if time_tag is not None else None,
            "snippet": lead.get_text(strip = True) if lead is not None else None,
        })

#rebuild from scratch so re-running this cell does not duplicate rows
queens_df = pd.DataFrame(queens_rows, columns = queens_df.columns)


In [None]:
#check results
queens_df.shape

In [None]:
#save data
queens_df.to_csv("../data/queens_results.csv", index = False)

#### NY Amsterdam News

In [None]:
#start selenium
driver = webdriver.Chrome()

In [None]:
#get website
query = "https://amsterdamnews.com/"
driver.get(query)

In [None]:
#find search button
search_button = driver.find_element(By.ID, "search-toggle")
search_button.click()

In [None]:
#type the search term into the site's search box
# NOTE: this has only worked with the browser window expanded. Presumably the
# search box is not interactable in a narrow layout — TODO confirm; calling
# driver.maximize_window() right after start-up may remove the manual step.
search_box = driver.find_element(By.ID, "search-form-2")
search_box.send_keys("SHSAT")

In [None]:
#enter results
search_box.send_keys(Keys.ENTER)  

In [None]:
#find popout where the search results are
results = driver.find_element(By.CLASS_NAME, "jetpack-instant-search__search-results")

In [None]:
#scroll the results panel 20 times, 1000px per step with a 1s pause,
#presumably so that further results load into the panel
# NOTE(review): the step count is fixed, so this does not detect whether the
# results have actually run out — confirm 20 steps is enough for this search.
for _ in range(20):
    driver.execute_script("arguments[0].scrollBy(0, 1000);", results)
    time.sleep(1)

In [None]:
#grabs page results
news_page = driver.page_source

In [None]:
#quits selenium
driver.quit()

In [None]:
#pulls out list of articles
article_list = BeautifulSoup(news_page, 'html.parser').find("div", class_ = "jetpack-instant-search__search-results").find("ol", class_ = "jetpack-instant-search__search-results-list is-format-expanded").find_all("li")

In [None]:
#create empty df
nyamsterdam_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per list item and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every item.
nyamsterdam_rows = []
for art in article_list:

    #the title link sits inside the item's first <div>; check both levels,
    #since a <div> without a link used to raise instead of yielding None
    first_div = art.find("div")
    title_link = first_div.find("a") if first_div is not None else None
    href = title_link.get("href") if title_link is not None else None

    #the second breadcrumb piece is stored as the publication date
    crumbs = art.find_all(class_ = "jetpack-instant-search__path-breadcrumb-piece")
    content = art.find("div", class_ = "jetpack-instant-search__search-result-expanded__content")

    nyamsterdam_rows.append({
        "link": ("https:" + href) if href else None,
        "headline": title_link.get_text() if title_link is not None else None,
        #no author is read from the search results
        "author": None,
        "date_published": crumbs[1].get_text(strip = True) if len(crumbs) > 1 else None,
        "snippet": content.get_text(strip = True) if content is not None else None,
    })

#rebuild from scratch so re-running this cell does not duplicate rows
nyamsterdam_df = pd.DataFrame(nyamsterdam_rows, columns = nyamsterdam_df.columns)

In [None]:
#remove empties
nyamsterdam_df = nyamsterdam_df.dropna(axis = 0, how = "all")

In [None]:
#check results
nyamsterdam_df.shape

In [None]:
#save data
nyamsterdam_df.to_csv("../data/nyamsterdam_results.csv", index = False)

#### City Journal

In [None]:
#empty storage
pages = []

In [None]:
#start selenium
driver = webdriver.Chrome()

In [None]:
#web scraping
# Loads search pages 1-2 and stores each page's html.
try:
    for pg in range(1,3):
        query = query_url("City", pg)
        driver.get(query)
        pages.append(driver.page_source)
        time.sleep(3)
finally:
    #always close the browser, even if a page load raises
    driver.quit()

In [None]:
#create empty df
city_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per article card and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every card.
city_rows = []
for news_page in pages:
    #cards are matched by their full class string
    article_list = BeautifulSoup(news_page, 'html.parser').find_all("div", class_="m_card horizontal with-thumbnail vertical")
    for art in article_list:

        #look each element up once; a card missing one of them yields None
        #for that column instead of aborting the extraction
        title_link = art.find("a", class_ = "title")
        date_tag = art.find("div", class_ = "date")
        authors_tag = art.find("span", class_ = "authors")
        subtitle_tag = art.find("p", class_ = "subtitle")

        city_rows.append({
            "link": title_link.get("href") if title_link is not None else None,
            "headline": title_link.get_text() if title_link is not None else None,
            "author": authors_tag.get_text() if authors_tag is not None else None,
            "date_published": date_tag.get_text() if date_tag is not None else None,
            "snippet": subtitle_tag.get_text() if subtitle_tag is not None else None,
        })

#rebuild from scratch so re-running this cell does not duplicate rows
city_df = pd.DataFrame(city_rows, columns = city_df.columns)

In [None]:
#check results
city_df.shape

In [None]:
#save data
city_df.to_csv("../data/cityjournal_results.csv", index = False)

#### Gotham Gazette

In [None]:
#get api key for google custom search
api_key = os.environ.get('cse_api_key')

In [None]:
# set url for api
base_url =  "https://www.googleapis.com/customsearch/v1?"
pages = []


In [None]:
#identify starting point
start_page = 1
counter = True

In [None]:
#query the custom search api page by page
# Each response's "items" list is appended to pages; the loop follows the
# "nextPage" start index until the api stops returning one.
while counter:
    #request (timeout so a stalled connection cannot hang the notebook)
    response = requests.get(
        base_url,
        params = {
            "key": api_key,
            "cx": "016666263354593363178:zqqt8gavlkw",
            "q": "SHSAT",
            "start": start_page,
        },
        timeout = 30,
    )
    #check status
    if response.status_code != 200:
        print("query failed")
        break

    #parse the body once
    payload = response.json()

    #append results whenever there are any; the old condition also required
    #a nextPage entry, which silently dropped the final page of results
    items = payload.get("items")
    if items:
        pages.append(items)

    #continue loop?
    next_page = (payload.get("queries") or {}).get("nextPage")
    if next_page:
        #update
        start_page = next_page[0]["startIndex"]
        #rest (only needed when another request follows)
        time.sleep(3)
    else:
        counter = False

In [None]:
#create empty df
gotham_df = pd.DataFrame(
    columns = ["link", "headline" , "author", "date_published", "snippet"]
)

In [None]:
#extract data
# Collect one row per search hit and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every hit.
gotham_rows = []
for pg in pages:
    #skip if page is blank
    if not pg:
        continue

    #loop through each story
    for story in pg:
        #fall back to an empty metatag dict when a hit has no pagemap/metatags,
        #instead of raising KeyError
        metatags = (story.get("pagemap") or {}).get("metatags") or [{}]
        meta = metatags[0]

        #split the snippet once: the text before the first " ..." is stored as
        #the date and the text after it as the snippet
        # NOTE(review): when there is no " ..." the whole snippet lands in both
        # columns (unchanged from before) — confirm that is acceptable.
        snippet_parts = story.get("snippet", "").split(" ...")

        gotham_rows.append({
            "link": story.get("link"),
            "headline": meta.get("og:title"),
            "author": meta.get("author"),
            "date_published": snippet_parts[0],
            "snippet": snippet_parts[1] if len(snippet_parts) > 1 else snippet_parts[0],
        })

#rebuild from scratch so re-running this cell does not duplicate rows
gotham_df = pd.DataFrame(gotham_rows, columns = gotham_df.columns)

In [None]:
#save data
gotham_df.to_csv("../data/gotham_results.csv", index = False)