I used skills from week 1 of the advanced scraping module to scrape CUNY undergraduate demographic data from the New York State Education Department's website, which has a static url: https://data.nysed.gov/lists.php?start=67&type=higher

In [2]:
## import libraries
import pandas as pd
from random import randrange
from bs4 import BeautifulSoup
import time
import requests

In [3]:
#url to scrape
url = "https://data.nysed.gov/lists.php?start=67&type=higher"
## need to soup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [4]:
## target each university on the page
uni_list = soup.find_all("div", class_="title")
uni_list

[<div class="title"><a href="profile.php?instid=800000052570">CANISIUS UNIVERSITY OF BUFFALO, NY </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000042817">CATHOLIC MED CTR BROOK/QUEENS SCH NR </a></div>,
 <div class="title"><a href="profile.php?instid=800000065326">CAYUGA COMM COLLEGE - FULTON CENTER </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000054552">CAYUGA COUNTY COMM COLLEGE </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000050956">CAZENOVIA COLLEGE </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000040731">CENTRAL CITY BUSINESS INSTITUTE 665 </a></div>,
 <div class="title"><a href="profile.php?instid=80000005297

In [5]:
## extract all a tags in the list of divs
atags = []
for div_element in uni_list:
    for atag in div_element.find_all("a"):
        atags.append(atag)
## extract just the CUNYs
cuny_atags = []
for atag in atags:
    if "CUNY" in atag.text:
        cuny_atags.append(atag)
cuny_atags

[<a href="profile.php?instid=800000047621">CUNY BERNARD M. BARUCH COLLEGE </a>,
 <a href="profile.php?instid=800000043781">CUNY BROOKLYN COLLEGE </a>,
 <a href="profile.php?instid=800000047619">CUNY CENTRAL ADMINISTRATION </a>,
 <a href="profile.php?instid=800000047047">CUNY CITY COLLEGE </a>,
 <a href="profile.php?instid=800000041831">CUNY COLLEGE OF STATEN ISLAND </a>,
 <a href="profile.php?instid=800000047620">CUNY GRADUATE SCHOOL </a>,
 <a href="profile.php?instid=800000047622">CUNY HUNTER COLLEGE </a>,
 <a href="profile.php?instid=800000047317">CUNY JOHN JAY COL CRIM JUSTICE </a>,
 <a href="profile.php?instid=800000043091">CUNY LAW SCHOOL AT QUEENS  </a>,
 <a href="profile.php?instid=800000046077">CUNY LEHMAN COLLEGE </a>,
 <a href="profile.php?instid=800000045393">CUNY NYC COLLEGE OF TECHNOLOGY </a>,
 <a href="profile.php?instid=800000043093">CUNY QUEENS COLLEGE </a>,
 <a href="profile.php?instid=800000089728">CUNY SCHOOL OF PROFESSIONAL STUDIES </a>,
 <a href="profile.php?instid

In [6]:
## extract the hrefs
cuny_hrefs = []
for atag2 in cuny_atags:
    cuny_hrefs.append(atag2.get("href"))
    
cuny_hrefs

['profile.php?instid=800000047621',
 'profile.php?instid=800000043781',
 'profile.php?instid=800000047619',
 'profile.php?instid=800000047047',
 'profile.php?instid=800000041831',
 'profile.php?instid=800000047620',
 'profile.php?instid=800000047622',
 'profile.php?instid=800000047317',
 'profile.php?instid=800000043091',
 'profile.php?instid=800000046077',
 'profile.php?instid=800000045393',
 'profile.php?instid=800000043093',
 'profile.php?instid=800000089728',
 'profile.php?instid=800000073513',
 'profile.php?instid=800000042570']

In [7]:
# extract the college names too
cuny_names = []
for tag in cuny_atags:
    cuny_name = tag.text.strip()
    cuny_names.append(cuny_name)

(cuny_names)

['CUNY BERNARD M. BARUCH COLLEGE',
 'CUNY BROOKLYN COLLEGE',
 'CUNY CENTRAL ADMINISTRATION',
 'CUNY CITY COLLEGE',
 'CUNY COLLEGE OF STATEN ISLAND',
 'CUNY GRADUATE SCHOOL',
 'CUNY HUNTER COLLEGE',
 'CUNY JOHN JAY COL CRIM JUSTICE',
 'CUNY LAW SCHOOL AT QUEENS',
 'CUNY LEHMAN COLLEGE',
 'CUNY NYC COLLEGE OF TECHNOLOGY',
 'CUNY QUEENS COLLEGE',
 'CUNY SCHOOL OF PROFESSIONAL STUDIES',
 'CUNY STELLA AND CHARLES GUTTMAN COMM',
 'CUNY YORK COLLEGE']

In [8]:
## separate the institution id from the rest
cuny_ids = [href.split('=')[1] for href in cuny_hrefs]
cuny_ids

['800000047621',
 '800000043781',
 '800000047619',
 '800000047047',
 '800000041831',
 '800000047620',
 '800000047622',
 '800000047317',
 '800000043091',
 '800000046077',
 '800000045393',
 '800000043093',
 '800000089728',
 '800000073513',
 '800000042570']

In [9]:
year = ["2022","2023","2024"]
base_url = "https://data.nysed.gov/highered-enrollment.php"
links = [f"{base_url}?year={y}&instid={cuny_id}"for y in year for cuny_id in cuny_ids]
links

['https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047621',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043781',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047619',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047047',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000041831',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047620',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047622',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047317',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043091',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000046077',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000045393',
 'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043093',
 'https://data.nysed.gov/hig

In [48]:
year_ = {y: links for y in years}
year_dict

{'2022': ['https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047621',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043781',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047619',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047047',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000041831',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047620',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047622',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047317',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043091',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000046077',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000045393',
  'https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000043093',
  'https

In [10]:
len(links)

45

In [42]:

df_list = []
broken_links = []
total_links = len(links)

for counter, cuny_id in enumerate(cuny_ids, start=1):
    target_link = links[counter - 1]
    print(f"Scraping {counter} of {total_links}")
    try:
        data = pd.read_html(target_link)
        df = data[0]
        df["INSTID"] = cuny_id
        df["COLLEGE"] = cuny_names[counter - 1]
        df["YEAR"] = year_value 
        df_list.append(df)
    except Exception as e:
        print(f"{target_link}: {e}")
        broken_links.append(target_link)
    finally:
        snooze = randrange(5, 10)
        print(f"Snoozing for {snooze} seconds")
        time.sleep(snooze)

print(f"Done scraping all units")

Scraping 1 of 45
Snoozing for 9 seconds
Scraping 2 of 45
Snoozing for 9 seconds
Scraping 3 of 45
https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000047619: No tables found
Snoozing for 7 seconds
Scraping 4 of 45
Snoozing for 8 seconds
Scraping 5 of 45
Snoozing for 6 seconds
Scraping 6 of 45
Snoozing for 5 seconds
Scraping 7 of 45
Snoozing for 7 seconds
Scraping 8 of 45
Snoozing for 8 seconds
Scraping 9 of 45
Snoozing for 8 seconds
Scraping 10 of 45
Snoozing for 8 seconds
Scraping 11 of 45
Snoozing for 6 seconds
Scraping 12 of 45
Snoozing for 6 seconds
Scraping 13 of 45
https://data.nysed.gov/highered-enrollment.php?year=2022&instid=800000089728: No tables found
Snoozing for 8 seconds
Scraping 14 of 45
Snoozing for 5 seconds
Scraping 15 of 45
Snoozing for 7 seconds
Done scraping all units


In [38]:
final_df = pd.concat(df_list, ignore_index = True)
final_df = final_df.drop(columns=['Unnamed: 4'])
final_df = final_df[final_df['SUBGROUP'] != 'Total']
final_df = final_df[final_df['SUBGROUP'] != 'Gender']
final_df = final_df[final_df['SUBGROUP'] != 'Race and Ethnicity']

In [40]:
final_df

Unnamed: 0,SUBGROUP,FULL-TIME,PART-TIME,TOTAL,INSTID,COLLEGE,YEAR
0,All Students,12455,3404,15859,800000047621,CUNY BERNARD M. BARUCH COLLEGE,
1,Degree/Certificate Seeking,12400,3083,15483,800000047621,CUNY BERNARD M. BARUCH COLLEGE,
2,First-Time,2514,18,2532,800000047621,CUNY BERNARD M. BARUCH COLLEGE,
3,Transfer-In,1132,366,1498,800000047621,CUNY BERNARD M. BARUCH COLLEGE,
4,Continuing/Returning,8754,2699,11453,800000047621,CUNY BERNARD M. BARUCH COLLEGE,
...,...,...,...,...,...,...,...
268,Native Hawaiian or Other Pacific Islander,16,9,25,800000042570,CUNY YORK COLLEGE,
269,White,248,205,453,800000042570,CUNY YORK COLLEGE,
270,Multiracial,148,48,196,800000042570,CUNY YORK COLLEGE,
271,Unknown race and ethnicity,0,0,0,800000042570,CUNY YORK COLLEGE,
