In [1]:
# Import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import html


In [2]:
# Goal: build a project that collects information about every course offered at UW–Madison

# Project started on Nov. 25, 2024

# Sources
Source 1 : https://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1/3796917

# Step 1 
Gather basic information from https://guide.wisc.edu/courses/

In [59]:
# Setup for course lists
url = "https://guide.wisc.edu/courses/"
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# The department lists (ul) live inside <div id="atozindex">;
# the site is laid out A-Z with one ul per letter.
atoz_div = soup.find('div', id='atozindex')

# Each entry looks like:
#   <li><a href="/courses/acct_i_s/">Accounting and Information Systems (ACCT I S)</a></li>
# The href carries the ID used for site navigation; the link text carries
# "<department name> (<abbreviation>)". The abbreviation is always the LAST
# parenthesised group, because names may contain parentheses themselves,
# e.g. "Art Education (Department of Art) (ART ED)".
href_pattern = re.compile(r'/courses/(.*?)/')
label_pattern = re.compile(r'(.+?) \(([^)]+)\)')

# Gathers id, abbreviations, name
data = []
if atoz_div:
    for ul in atoz_div.find_all('ul'):
        for item in ul.find_all('li'):
            link = item.find('a', href=True)
            if link is None:
                continue

            # Read the parsed anchor instead of running a regex over str(item):
            # get_text() already returns decoded text, so entities such as
            # "ANAT&amp;PHY" come back as "ANAT&PHY" without html.unescape.
            href_match = href_pattern.fullmatch(link['href'])
            label_match = label_pattern.fullmatch(link.get_text().strip())
            if not (href_match and label_match):
                continue

            data.append({
                "ID": href_match.group(1),           # used for website nav
                "ABBREV": label_match.group(2),      # short code, e.g. "ACCT I S"
                "DEPARTMENT": label_match.group(1),  # full department name
            })

# Explicit columns keep the frame usable even if nothing was scraped.
df = pd.DataFrame(data, columns=["ID", "ABBREV", "DEPARTMENT"])

# Gather the link for each department using ID (one vectorised assignment).
df["URL"] = "https://guide.wisc.edu/courses/" + df["ID"] + "/"

for d in df["DEPARTMENT"]:
    print(d)

df

Accounting and Information Systems
Actuarial Science
African American Studies
African Cultural Studies
Agricultural and Applied Economics
Agroecology
Agronomy
Air Force Aerospace Studies
American Indian Studies
Anatomy
Anatomy & Physiology
Anesthesiology
Animal Sciences
Anthropology
Applied Biotechnology
Art Department
Art Education (Department of Art)
Art History
Asian American Studies
Asian Languages and Cultures
Asian Languages and Cultures: Languages
Astronomy
Atmospheric and Oceanic Sciences
Biochemistry
Biological Systems Engineering
Biology
Biology Core Curriculum
Biomedical Engineering
Biomedical Sciences and Technologies
Biomolecular Chemistry
Biostatistics and Medical Informatics
Botany
Cell and Regenerative Biology
Chemical and Biological Engineering
Chemistry
Chicana/o and Latina/o Studies
Civil and Environmental Engineering
Civil Society and Community Studies
Classics
Collaborative Nursing Program
Communication Arts
Communication Sciences and Disorders
Community and Enviro

Unnamed: 0,ID,ABBREV,DEPARTMENT,URL
0,acct_i_s,ACCT I S,Accounting and Information Systems,https://guide.wisc.edu/courses/acct_i_s/
1,act_sci,ACT SCI,Actuarial Science,https://guide.wisc.edu/courses/act_sci/
2,afroamer,AFROAMER,African American Studies,https://guide.wisc.edu/courses/afroamer/
3,african,AFRICAN,African Cultural Studies,https://guide.wisc.edu/courses/african/
4,a_a_e,A A E,Agricultural and Applied Economics,https://guide.wisc.edu/courses/a_a_e/
...,...,...,...,...
184,surgery,SURGERY,Surgery,https://guide.wisc.edu/courses/surgery/
185,surg_sci,SURG SCI,Surgical Sciences,https://guide.wisc.edu/courses/surg_sci/
186,theatre,THEATRE,Theatre and Drama,https://guide.wisc.edu/courses/theatre/
187,urb_r_pl,URB R PL,Urban and Regional Planning,https://guide.wisc.edu/courses/urb_r_pl/


# Step 2
Make code (def) so you can access all the possible courses from a specific DEPARTMENT

In [44]:
def gather_courses(department):
    """Scrape the course listing of one department into a DataFrame.

    Looks ``department`` up in the module-level ``df`` (exact match against the
    DEPARTMENT column), downloads that row's URL and reads every element with
    class "courseblock" on the page.

    Parameters
    ----------
    department : str
        Department name exactly as it appears in ``df["DEPARTMENT"]``.

    Returns
    -------
    pandas.DataFrame or None
        One row per course block with the columns ABBREV, NAME, ID, CREDITS
        and DESCRIPTION. A field that is missing from a block, or that does not
        match the expected format, is left empty for that row instead of
        aborting the whole scrape. Returns None (after printing
        "Not in DataFrame") when the department is not in ``df``.

    Raises
    ------
    requests.HTTPError
        If the department page answers with an error status.
    """
    # First, check if the department name is actually in the dataframe
    info = df.loc[df["DEPARTMENT"] == department]
    if info.empty:
        print("Not in DataFrame")
        return None

    dep_row = info.iloc[0]
    dep_abbrev = str(dep_row["ABBREV"])

    # Get the html loaded from the department url
    response = requests.get(dep_row["URL"], timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # "<subject code(s)> <number> — <title>". The subject part is matched lazily
    # and the number as 1-3 digits, so a number below 100 is captured whole
    # instead of making the entire title fail to match.
    title_pattern = r"([A-Za-z&\/\s0-9]+?)\s*(\d{1,3})\s*—\s*(.+)"
    # Credits are kept as text because they can be a range such as "1-3".
    credits_pattern = r"([0-9\-]+)\scredit"

    courses = []
    for course in soup.find_all(class_="courseblock"):
        course_details = {}

        # TITLE and ID
        course_title = _courseblock_text(course, 'courseblocktitle')
        if course_title is not None:
            # Titles contain zero-width spaces that would break the regex.
            course_title = course_title.replace("\u200B", "")
            match = re.match(title_pattern, course_title)
            if match:
                course_details["ABBREV"] = dep_abbrev + " " + match.group(2)
                course_details["NAME"] = match.group(3)
                course_details["ID"] = int(match.group(2))

        # CREDITS (range)
        course_credits = _courseblock_text(course, 'courseblockcredits')
        if course_credits is not None:
            match = re.match(credits_pattern, course_credits)
            if match:
                course_details["CREDITS"] = match.group(1)

        # DESC - None when the block has no description paragraph
        course_details["DESCRIPTION"] = _courseblock_text(course, "courseblockdesc noindent")

        courses.append(course_details)

    return pd.DataFrame(courses)


def _courseblock_text(block, css_class):
    """Return the stripped text of the <p> with ``css_class`` inside ``block``, or None if absent."""
    node = block.find('p', class_=css_class)
    return node.text.strip() if node is not None else None
        
# Example run: scrape the course list of a single department and display it.
# NOTE(review): the saved output below was produced at execution count 44, i.e.
# before the scraping cell was last run (execution count 59), and its ABBREV
# values look malformed ("Spanish and Portuguese) (SPANISH 101"). Restart the
# kernel, run all cells in order, and confirm that "Spanish" still matches a
# DEPARTMENT value exactly — TODO confirm.
courses_df = gather_courses("Spanish")

courses_df

Unnamed: 0,ABBREV,NAME,ID,CREDITS,DESCRIPTION
0,Spanish and Portuguese) (SPANISH 101,FIRST SEMESTER SPANISH,101,4,All basic language skills: listening comprehen...
1,Spanish and Portuguese) (SPANISH 102,SECOND SEMESTER SPANISH,102,4,Continuation of SPANISH 101. All basic languag...
2,Spanish and Portuguese) (SPANISH 103,FIRST YEAR INTENSIVE SPANISH,103,6,Accelerated introduction to Spanish language p...
3,Spanish and Portuguese) (SPANISH 203,THIRD SEMESTER SPANISH,203,4,"Intermediate-level language review, Hispanic r..."
4,Spanish and Portuguese) (SPANISH 204,FOURTH SEMESTER SPANISH,204,4,"Intermediate-level language review, extensive ..."
...,...,...,...,...,...
96,Spanish and Portuguese) (SPANISH 861,SEMINAR-MODERN SPANISH LITERATURE,861,3,"Literary, cultural, or theoretical topics rele..."
97,Spanish and Portuguese) (SPANISH 882,SEMINAR IN HISPANIC CULTURE,882,3,"Seminar focusing on literary, cultural and/or ..."
98,Spanish and Portuguese) (SPANISH 899,INDEPENDENT READING,899,1-3,Directed study projects for graduate students ...
99,Spanish and Portuguese) (SPANISH 982,INTERDEPARTMENTAL SEMINAR IN THE LATIN-AMERICA...,982,1-3,Interdisciplinary inquiry in Latin American so...


# Step 3
Make a CSV of all courses after implementing all of Step 2