In [1]:
# Import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import html


In [2]:
# Goal: collect information about every course offered at UW-Madison

# Project started on Nov. 25, 2024

# Sources
Source 1 : https://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1/3796917

# Step 1 
Gather basic information from https://guide.wisc.edu/courses/

In [70]:
# Setup for course lists
url = "https://guide.wisc.edu/courses/"
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from silently parsing an HTTP error page as if it were the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# The department lists (ul) are inside <div id="atozindex">
data = []
atoz_div = soup.find('div', id='atozindex')

# Each entry looks like:
#   <li><a href="/courses/acct_i_s/">Accounting and Information Systems (ACCT I S)</a></li>
# The href carries the ID used for site navigation; the link text carries the
# department name followed by its abbreviation in the trailing parentheses.
href_pattern = re.compile(r'/courses/([^/]+)/')
text_pattern = re.compile(r'(.+?) \(([^)]+)\)')

# Gathers id, abbreviations, name
if atoz_div:
    # The site is set up from A-Z with each letter having a ul for the courses
    for ul in atoz_div.find_all('ul'):
        for item in ul.find_all('li'):
            link = item.find('a', href=True)
            if link is None:
                continue

            # Read the attribute and text from the parsed tag instead of running a
            # regex over the re-serialized HTML: get_text() already returns decoded
            # text, so entities such as "&amp;" (e.g. "ANAT&PHY") need no html.unescape.
            href_match = href_pattern.fullmatch(link['href'])
            text_match = text_pattern.fullmatch(link.get_text(strip=True))
            if not (href_match and text_match):
                continue

            dep_id = href_match.group(1)      # used for website nav
            dep_name = text_match.group(1)    # full department name
            dep_abbrev = text_match.group(2)  # abbreviation, e.g. "ACCT I S"

            # The department page URL is derived from the ID
            dep_url = f"https://guide.wisc.edu/courses/{dep_id}/"

            data.append({"ID": dep_id, "ABBREV": dep_abbrev, "DEPARTMENT": dep_name, "URL": dep_url})

# Passing the columns explicitly keeps the schema intact even when nothing was
# scraped, so later lookups on df["DEPARTMENT"] do not raise KeyError.
df = pd.DataFrame(data, columns=["ID", "ABBREV", "DEPARTMENT", "URL"])

df

Unnamed: 0,ID,ABBREV,DEPARTMENT,URL
0,acct_i_s,ACCT I S,Accounting and Information Systems,https://guide.wisc.edu/courses/acct_i_s/
1,act_sci,ACT SCI,Actuarial Science,https://guide.wisc.edu/courses/act_sci/
2,afroamer,AFROAMER,African American Studies,https://guide.wisc.edu/courses/afroamer/
3,african,AFRICAN,African Cultural Studies,https://guide.wisc.edu/courses/african/
4,a_a_e,A A E,Agricultural and Applied Economics,https://guide.wisc.edu/courses/a_a_e/
...,...,...,...,...
184,surgery,SURGERY,Surgery,https://guide.wisc.edu/courses/surgery/
185,surg_sci,SURG SCI,Surgical Sciences,https://guide.wisc.edu/courses/surg_sci/
186,theatre,THEATRE,Theatre and Drama,https://guide.wisc.edu/courses/theatre/
187,urb_r_pl,URB R PL,Urban and Regional Planning,https://guide.wisc.edu/courses/urb_r_pl/


# Step 2
Define a function that returns all of the courses offered by a specific DEPARTMENT

In [80]:
# Fixed column order for the frame returned by gather_courses, so the schema
# does not depend on which fields happened to parse for the first course.
_COURSE_COLUMNS = ["ABBREV", "NAME", "ID", "CREDITS", "EXTRAS", "DESCRIPTION"]

# "<subject(s)> <3-digit number> — <title>", e.g. "HEBR-BIB 103 — ELEMENTARY BIBLICAL HEBREW, I"
_TITLE_PATTERN = re.compile(r"([A-Za-z&\-\/\s0-9]+)\s*(\d{3})\s*—\s*(.+)")

# Leading credit count or range followed by the word "credit"
_CREDITS_PATTERN = re.compile(r"([0-9\-]+)\scredit")


def _child_text(course, tag, css_class):
    """Return the stripped text of the first matching child tag, or "" if there is none."""
    node = course.find(tag, class_=css_class)
    return node.text.strip() if node else ""


def _collect_extras(course):
    """Join the "label data" pairs of a course's cb-extras section, or return "None"."""
    course_extras = course.find('div', class_='cb-extras')
    if not course_extras:
        return "None"

    extras = []
    for extra in course_extras.find_all('p', class_='courseblockextra noindent clearfix'):
        label = extra.find('span', class_='cbextra-label')
        value = extra.find('span', class_='cbextra-data')
        # Paragraphs missing either span are skipped
        if label and value:
            extras.append(f"{label.text.strip()} {value.text.strip()}")
    return " | ".join(extras)


def _parse_course_block(course, dep_abbrev):
    """Extract the fields of one ``courseblock`` element into a dict.

    Keys whose source text is missing or does not match the expected pattern
    are left out, so they show up as NaN in the final DataFrame.
    """
    course_details = {}

    # TITLE and ID
    # The titles can contain zero-width spaces (U+200B); remove them before matching
    course_title = _child_text(course, 'p', 'courseblocktitle').replace("\u200B", "")
    match = _TITLE_PATTERN.match(course_title)
    if match:
        course_details["ABBREV"] = dep_abbrev + " " + match.group(2)
        course_details["NAME"] = match.group(3)
        course_details["ID"] = int(match.group(2))

    # CREDITS (kept as a string because it may be a range)
    match = _CREDITS_PATTERN.match(_child_text(course, 'p', 'courseblockcredits'))
    if match:
        course_details["CREDITS"] = match.group(1)

    # More info section ("cb-extras" class inside the course block)
    course_details["EXTRAS"] = _collect_extras(course)

    # DESC - empty string when the block has no description paragraph
    course_details["DESCRIPTION"] = _child_text(course, "p", "courseblockdesc noindent")

    return course_details


def gather_courses(department):
    """
    Gathers courses for the given department from its URL.

    The department is looked up by exact name in the notebook-level ``df``
    (column "DEPARTMENT"); its "URL" is downloaded and every element with the
    class "courseblock" is parsed into one row.

    Parameters:
        department (str): Name of the department to scrape.

    Returns:
        DataFrame: One row per course with the columns ABBREV, NAME, ID,
        CREDITS, EXTRAS and DESCRIPTION. Fields that could not be parsed are
        NaN. The frame is empty (same columns) when the department is not in
        ``df`` or its page contains no course blocks.

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
        a timeout or an HTTP error status.
    """
    # First, check if the department name is actually in the dataframe
    info = df.loc[df["DEPARTMENT"] == department]
    if info.empty:
        print("Not in DataFrame")
        return pd.DataFrame(columns=_COURSE_COLUMNS)

    # Get the html loaded from the department URL. The timeout avoids hanging
    # forever, and raise_for_status() avoids parsing an error page as courses.
    url = info["URL"].iloc[0]
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    dep_abbrev = str(info["ABBREV"].iloc[0])
    courses = [
        _parse_course_block(course, dep_abbrev)
        for course in soup.find_all(class_="courseblock")
    ]

    return pd.DataFrame(courses, columns=_COURSE_COLUMNS)
        
# Try the scraper on one department; the argument is compared against the
# values of df["DEPARTMENT"], so it has to be the exact department name.
courses_df = gather_courses("Hebrew-Biblical")

# Last expression of the cell, so the resulting frame is displayed
courses_df

Unnamed: 0,ABBREV,NAME,ID,CREDITS,EXTRAS,DESCRIPTION
0,HEBR-BIB 103,"ELEMENTARY BIBLICAL HEBREW, I",103,4,Requisites: None | Course Designation: Frgn La...,Introduction to the language of the Hebrew Bib...
1,HEBR-BIB 104,"ELEMENTARY BIBLICAL HEBREW, II",104,4,Requisites: HEBR-BIB 103 | Course Designation:...,Introduction to the language of the Hebrew Bib...
2,HEBR-BIB 303,"ELEMENTARY BIBLICAL HEBREW, I",303,3,Requisites: Graduate/professional standing | R...,"Forms and syntax, reading of Classical Hebrew ..."
3,HEBR-BIB 304,"ELEMENTARY BIBLICAL HEBREW, II",304,3,Requisites: HEBR-BIB 303 | Repeatable for Cred...,"Forms and syntax, reading of Classical Hebrew ..."
4,HEBR-BIB 323,"INTERMEDIATE BIBLICAL HEBREW, I",323,4,"Requisites: HEBR-BIB 104, 304, or graduate/pro...","Review of grammar, introduction to reading nar..."
5,HEBR-BIB 324,"INTERMEDIATE BIBLICAL HEBREW, II",324,4,Requisites: HEBR-BIB 323 or graduate/professio...,"Review of grammar, introduction to reading nar..."
6,HEBR-BIB 332,PROPHETS OF THE BIBLE,332,4,Requisites: RELIG ST/​CLASSICS/​JEWISH/​LITTRA...,"An introduction to the thought, literature, an..."
7,HEBR-BIB 391,INTENSIVE ELEMENTARY BIBLICAL HEBREW,391,4,Requisites: Not open to students with credit f...,An intensive introduction to the grammar and v...
8,HEBR-BIB 513,"BIBLICAL TEXTS, POETRY",513,3,Requisites: HEBR-BIB 324 | Course Designation:...,Critical reading of selected texts from the Mi...
9,HEBR-BIB 514,"BIBLICAL TEXTS, POETRY",514,3,Requisites: HEBR-BIB/​JEWISH 513 | Course Des...,Critical reading of selected texts from the La...


# Part 3
Make a CSV of all courses after implementing all of Step 2