In [161]:
# Import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import html


In [None]:
# Goal: collect information about every course offered at UW-Madison

# Project started on Nov. 25, 2024

# Sources
Source 1 : https://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1/3796917

# Step 1 
Gather basic information from https://guide.wisc.edu/courses/

In [168]:
# Scrape the A-Z department index of the course guide into `df`,
# one row per department with columns ID, ABBREV and DEPARTMENT.
url = "https://guide.wisc.edu/courses/"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# page be parsed into an empty table.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Each index entry serializes as, for example:
#   <li><a href="/courses/acct_i_s/">Accounting and Information Systems (ACCT I S)</a></li>
# group 1 = URL slug, group 2 = department name, group 3 = abbreviation.
# Defined once here because it does not change between list items.
pattern = r'<li><a href="/courses/(.*?)/">(.+?) \((.+?)\)</a></li>'

data = []

# The department lists (ul) are inside <div id="atozindex">
atoz_div = soup.find('div', id='atozindex')

# Gathers id, abbreviation and name for every department
if atoz_div:
    # The site is set up from A-Z with each letter having its own ul
    for ul in atoz_div.find_all('ul'):
        for item in ul.find_all('li'):
            match = re.match(pattern, str(item))
            if not match:
                # Skip list items that are not "Name (ABBREV)" department links
                continue

            dep_id = match.group(1)  # URL slug, used for website navigation

            # SOURCE 1: str(item) writes "&" as "&amp;" (e.g. "ANAT&amp;PHY"),
            # so both captured text fields are unescaped, not just the
            # abbreviation; later cells look departments up by name.
            dep_name = html.unescape(match.group(2))    # full department name
            dep_abbrev = html.unescape(match.group(3))  # abbreviation

            data.append({"ID": dep_id, "ABBREV": dep_abbrev, "DEPARTMENT": dep_name})

# Explicit columns so later cells can still reference them when nothing was scraped
df = pd.DataFrame(data, columns=["ID", "ABBREV", "DEPARTMENT"])

In [171]:
# Build the course-listing URL for each department from its ID.
# The string concatenation is applied to the whole column at once, so no
# per-row loop is needed.
df["URL"] = "https://guide.wisc.edu/courses/" + df["ID"] + "/"
df

Unnamed: 0,ID,ABBREV,DEPARTMENT,URL
0,acct_i_s,ACCT I S,Accounting and Information Systems,https://guide.wisc.edu/courses/acct_i_s/
1,act_sci,ACT SCI,Actuarial Science,https://guide.wisc.edu/courses/act_sci/
2,afroamer,AFROAMER,African American Studies,https://guide.wisc.edu/courses/afroamer/
3,african,AFRICAN,African Cultural Studies,https://guide.wisc.edu/courses/african/
4,a_a_e,A A E,Agricultural and Applied Economics,https://guide.wisc.edu/courses/a_a_e/
...,...,...,...,...
184,surgery,SURGERY,Surgery,https://guide.wisc.edu/courses/surgery/
185,surg_sci,SURG SCI,Surgical Sciences,https://guide.wisc.edu/courses/surg_sci/
186,theatre,THEATRE,Theatre and Drama,https://guide.wisc.edu/courses/theatre/
187,urb_r_pl,URB R PL,Urban and Regional Planning,https://guide.wisc.edu/courses/urb_r_pl/


# Step 2
Define a function that returns all the courses offered by a specific DEPARTMENT

In [220]:
def gather_courses(department, timeout=30):
    """Scrape every course listed on one department's course-guide page.

    The department is looked up by exact name in the module-level ``df``
    (columns DEPARTMENT, URL and ABBREV). Its page is downloaded and one
    record is built per element with class ``courseblock``.

    Parameters
    ----------
    department : str
        Department name, compared for equality against ``df["DEPARTMENT"]``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame or None
        One row per course block with columns ABBREV, NAME, ID, CREDITS and
        DESCRIPTION. ABBREV/NAME/ID and CREDITS are left out of a record
        (NaN in the frame) when the corresponding text cannot be parsed;
        DESCRIPTION is None when the description paragraph is absent.
        Returns None, after printing "Not in DataFrame", when the department
        is not found in ``df``.

    Raises
    ------
    requests.HTTPError
        If the department page responds with an HTTP error status.
    """
    # Single lookup: an empty selection means the department is not in the DataFrame
    info = df.loc[df["DEPARTMENT"] == department]
    if info.empty:
        print("Not in DataFrame")
        return None

    # Get the html loaded from the department's url
    url = info["URL"].iloc[0]
    dep_abbrev = str(info["ABBREV"].iloc[0])
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title text looks like "AFRICAN 100 — INTRODUCTION TO ...":
    # group 2 = three-digit course number, group 3 = course name.
    title_pattern = r"([A-Za-z&\/\s0-9]+)\s*(\d{3})\s*—\s*(.+)"
    # Credits text starts with a number or a range, e.g. "3 credits" / "1-9 credits"
    credits_pattern = r"([0-9\-]+)\scredit"

    courses = []
    for course in soup.find_all(class_="courseblock"):
        course_details = {}

        # TITLE and ID
        title_tag = course.find('p', class_='courseblocktitle')
        if title_tag is not None:
            # Zero-width spaces in the title break the regex, so strip them out
            course_title = title_tag.text.strip().replace("\u200B", "")
            match = re.match(title_pattern, course_title)
            if match:
                course_details["ABBREV"] = dep_abbrev + " " + match.group(2)
                course_details["NAME"] = match.group(3)
                course_details["ID"] = int(match.group(2))

        # CREDITS (kept as text because it may be a range)
        credits_tag = course.find('p', class_='courseblockcredits')
        if credits_tag is not None:
            match = re.match(credits_pattern, credits_tag.text.strip())
            if match:
                course_details["CREDITS"] = match.group(1)

        # TODO: parse the 'courseblockextra noindent clearfix' paragraphs
        # (requisites, course designation, repeatable, last taught).

        # DESC - Not sure if I should include in df
        desc_tag = course.find("p", class_="courseblockdesc noindent")
        course_details["DESCRIPTION"] = desc_tag.text.strip() if desc_tag is not None else None

        courses.append(course_details)

    return pd.DataFrame(courses)
        
# Try the scraper on a single department and display the resulting table
courses_df = gather_courses(department="African Cultural Studies")

courses_df

[<p class="courseblockextra noindent clearfix"><span class="cbextra=label"><strong><span class="cbextra-label"><strong>Requisites: </strong></span></strong></span><span class="cbextra-data">None</span></p>, <p class="courseblockextra noindent clearfix"><span class="cbextra=label"><strong><span class="cbextra-label"><strong>Course Designation: </strong></span></strong></span><span class="cbextra-data">Breadth - Humanities<br/> Level - Elementary<br/> L&amp;S Credit - Counts as Liberal Arts and Science credit in L&amp;S</span></p>, <p class="courseblockextra noindent clearfix"><span class="cbextra=label"><strong><span class="cbextra-label"><strong>Repeatable for Credit: </strong></span></strong></span><span class="cbextra-data">No</span></p>, <p class="courseblockextra noindent clearfix"><span class="cbextra=label"><strong><span class="cbextra-label"><strong>Last Taught: </strong></span></strong></span><span class="cbextra-data">Fall 2024</span></p>]
[<p class="courseblockextra noindent 

Unnamed: 0,ABBREV,NAME,ID,CREDITS,DESCRIPTION
0,AFRICAN 100,INTRODUCTION TO AFRICAN CULTURAL EXPRESSION,100,3,An introduction to current research in African...
1,AFRICAN 106,INTRODUCTION TO AFRICAN HISTORY,106,3-4,Introductory exploration of a thematic or chro...
2,AFRICAN 129,AFRICA ON THE GLOBAL STAGE,129,3-4,Explores the interplay between Africa and the ...
3,AFRICAN 201,INTRODUCTION TO AFRICAN LITERATURE,201,3,Survey of African literary traditions and intr...
4,AFRICAN 202,INTRODUCTORY TOPICS IN AFRICAN CULTURAL STUDIES,202,3,Various topics in African cultural studies and...
...,...,...,...,...,...
99,AFRICAN 905,SEMINAR IN AFRICAN CULTURAL STUDIES: TOPICS,905,3,Examines various mediated cultural forms such ...
100,AFRICAN 926,SEMINAR IN RESEARCH METHODS IN AFRICAN CULTURA...,926,3,"Field methods, techniques, and analytical appr..."
101,AFRICAN 983,INTERDEPARTMENTAL SEMINAR IN AFRICAN STUDIES T...,983,3,Interdisciplinary inquiry in African societies...
102,AFRICAN 990,THESIS,990,1-9,"Advanced level mentored reading, writing, and ..."


# Step 3
Create a CSV of all courses once everything in Step 2 is implemented