I'm scraping 2023-2024 CUNY undergraduate demographic data from the New York State Education Department's website: https://data.nysed.gov/lists.php?start=67&type=higher

In [1]:
## import libraries
import pandas as pd
from random import randrange
from bs4 import BeautifulSoup
import time
import requests

In [5]:
#url to scrape
url = "https://data.nysed.gov/lists.php?start=67&type=higher"
## need to soup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [7]:
## target each university on the page
uni_list = soup.find_all("div", class_="title")
uni_list

[<div class="title"><a href="profile.php?instid=800000052570">CANISIUS UNIVERSITY OF BUFFALO, NY </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000042817">CATHOLIC MED CTR BROOK/QUEENS SCH NR </a></div>,
 <div class="title"><a href="profile.php?instid=800000065326">CAYUGA COMM COLLEGE - FULTON CENTER </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000054552">CAYUGA COUNTY COMM COLLEGE </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000050956">CAZENOVIA COLLEGE </a></div>,
 <div class="title">HIGHER EDUCATION</div>,
 <div class="title"><a href="profile.php?instid=800000040731">CENTRAL CITY BUSINESS INSTITUTE 665 </a></div>,
 <div class="title"><a href="profile.php?instid=80000005297

In [9]:
## extract all a tags in the list of divs
atags = []
for div_element in uni_list:
    for atag in div_element.find_all("a"):
        atags.append(atag)
## extract just the CUNYs
cuny_atags = []
for atag in atags:
    if "CUNY" in atag.text:
        cuny_atags.append(atag)
cuny_atags

[<a href="profile.php?instid=800000047621">CUNY BERNARD M. BARUCH COLLEGE </a>,
 <a href="profile.php?instid=800000043781">CUNY BROOKLYN COLLEGE </a>,
 <a href="profile.php?instid=800000047619">CUNY CENTRAL ADMINISTRATION </a>,
 <a href="profile.php?instid=800000047047">CUNY CITY COLLEGE </a>,
 <a href="profile.php?instid=800000041831">CUNY COLLEGE OF STATEN ISLAND </a>,
 <a href="profile.php?instid=800000047620">CUNY GRADUATE SCHOOL </a>,
 <a href="profile.php?instid=800000047622">CUNY HUNTER COLLEGE </a>,
 <a href="profile.php?instid=800000047317">CUNY JOHN JAY COL CRIM JUSTICE </a>,
 <a href="profile.php?instid=800000043091">CUNY LAW SCHOOL AT QUEENS  </a>,
 <a href="profile.php?instid=800000046077">CUNY LEHMAN COLLEGE </a>,
 <a href="profile.php?instid=800000045393">CUNY NYC COLLEGE OF TECHNOLOGY </a>,
 <a href="profile.php?instid=800000043093">CUNY QUEENS COLLEGE </a>,
 <a href="profile.php?instid=800000089728">CUNY SCHOOL OF PROFESSIONAL STUDIES </a>,
 <a href="profile.php?instid

In [11]:
## extract the hrefs
cuny_hrefs = []
for atag2 in cuny_atags:
    cuny_hrefs.append(atag2.get("href"))
    
cuny_hrefs

['profile.php?instid=800000047621',
 'profile.php?instid=800000043781',
 'profile.php?instid=800000047619',
 'profile.php?instid=800000047047',
 'profile.php?instid=800000041831',
 'profile.php?instid=800000047620',
 'profile.php?instid=800000047622',
 'profile.php?instid=800000047317',
 'profile.php?instid=800000043091',
 'profile.php?instid=800000046077',
 'profile.php?instid=800000045393',
 'profile.php?instid=800000043093',
 'profile.php?instid=800000089728',
 'profile.php?instid=800000073513',
 'profile.php?instid=800000042570']

In [13]:
## separate the institution id from the rest
cuny_ids = [href.split('=')[1] for href in cuny_hrefs]
cuny_ids

['800000047621',
 '800000043781',
 '800000047619',
 '800000047047',
 '800000041831',
 '800000047620',
 '800000047622',
 '800000047317',
 '800000043091',
 '800000046077',
 '800000045393',
 '800000043093',
 '800000089728',
 '800000073513',
 '800000042570']

In [21]:
#set up base parts of the url
base_url = "https://data.nysed.gov/highered-enrollment.php?year="
year = ["2022",
       "2023",
       "2024"]
end_url = "&instid="

In [23]:
links = [f"{base_url}{year}{end_url}{cuny_ids}" for cuny_id in cuny_ids]
links

["https://data.nysed.gov/highered-enrollment.php?year=['2022', '2023', '2024']&instid=['800000047621', '800000043781', '800000047619', '800000047047', '800000041831', '800000047620', '800000047622', '800000047317', '800000043091', '800000046077', '800000045393', '800000043093', '800000089728', '800000073513', '800000042570']",
 "https://data.nysed.gov/highered-enrollment.php?year=['2022', '2023', '2024']&instid=['800000047621', '800000043781', '800000047619', '800000047047', '800000041831', '800000047620', '800000047622', '800000047317', '800000043091', '800000046077', '800000045393', '800000043093', '800000089728', '800000073513', '800000042570']",
 "https://data.nysed.gov/highered-enrollment.php?year=['2022', '2023', '2024']&instid=['800000047621', '800000043781', '800000047619', '800000047047', '800000041831', '800000047620', '800000047622', '800000047317', '800000043091', '800000046077', '800000045393', '800000043093', '800000089728', '800000073513', '800000042570']",
 "https://dat