In [1]:
# Import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import html


In [2]:
# I want to make a project where I find info about all the courses in UW Madison

# Project started on Nov. 25, 2024

# Sources
Source 1 : https://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1/3796917

# Step 1 
Gather basic information from https://guide.wisc.edu/courses/

In [233]:
# Setup for course lists
url = "https://guide.wisc.edu/courses/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Get the lists (ul) from the website
    # The lists are inside <div id="atozindex"> == $0
    
data = [] 
    
atoz_div = soup.find('div', id='atozindex')

# Gathers id, abbreviations, name
if atoz_div:
    ul_elements = atoz_div.find_all('ul')
    # The site is set up from A-Z with each letter having a ul for the courses
    for ul in ul_elements:
        items = ul.find_all('li')
        for item in items:
            # this will grab the info inside the href (that holds name and course abbreviations)
                # <li><a href="/courses/acct_i_s/">Accounting and Information Systems (ACCT I S)</a></li>
            pattern = r'<li><a href="/courses/(.*?)/">(.+?) \(([^)]+)\)</a></li>'
            match = re.match(pattern, str(item))
            #print(item)            
            
            if match:
                dep_id = match.group(1) # used for website nav
                dep_name = match.group(2) # naming 
                dep_abbrev = match.group(3) # full name
                
                # SOURCE 1: Fix issue with amps like "ANAT&amp;PHY"
                dep_abbrev = html.unescape(dep_abbrev)
                dep_name = html.unescape(dep_name)
                
                dep_url = f"https://guide.wisc.edu/courses/{dep_id}/"

                data.append({"ID" : dep_id, "ABBREV": dep_abbrev, "DEPARTMENT": dep_name, "URL" : dep_url})

df = pd.DataFrame(data)

# Gather the link for each course using ID 
for course in df["DEPARTMENT"]:
    df["URL"] = "https://guide.wisc.edu/courses/" + df["ID"] + "/"
# for d in df["DEPARTMENT"]:
#     print(d)

df

Unnamed: 0,ID,ABBREV,DEPARTMENT,URL
0,acct_i_s,ACCT I S,Accounting and Information Systems,https://guide.wisc.edu/courses/acct_i_s/
1,act_sci,ACT SCI,Actuarial Science,https://guide.wisc.edu/courses/act_sci/
2,afroamer,AFROAMER,African American Studies,https://guide.wisc.edu/courses/afroamer/
3,african,AFRICAN,African Cultural Studies,https://guide.wisc.edu/courses/african/
4,a_a_e,A A E,Agricultural and Applied Economics,https://guide.wisc.edu/courses/a_a_e/
...,...,...,...,...
184,surgery,SURGERY,Surgery,https://guide.wisc.edu/courses/surgery/
185,surg_sci,SURG SCI,Surgical Sciences,https://guide.wisc.edu/courses/surg_sci/
186,theatre,THEATRE,Theatre and Drama,https://guide.wisc.edu/courses/theatre/
187,urb_r_pl,URB R PL,Urban and Regional Planning,https://guide.wisc.edu/courses/urb_r_pl/


# Step 2
Make code (def) so you can access all the possible courses from a specific DEPARTMENT

In [260]:
def gather_courses(department):
    """
    Gathers courses for the given department from its URL.

    Parameters:
        department (str): Name of the department to scrape.

    Returns:
        DataFrame: Contains details of courses for the specified department.
    """
    # First, check if the department name is actually in the datafram
    if department in df["DEPARTMENT"].values:
        info = df.loc[df["DEPARTMENT"] == department]
                
        # Get the html loaded from new url
        url = info["URL"].iloc[0]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        courses = []
        course_blocks = soup.find_all(class_="courseblock")
        
        for course in course_blocks:
            course_details = {}
            
            # TITLE and ID 
            course_title = course.find('p', class_='courseblocktitle').text.strip()
            
            # found issue with Zero Width Space, this is how to fix
            course_title = course_title.replace("\u200B", "")    
            
            pattern = r"([A-Za-z&\-\/\s0-9]+)\s*(\d{3})\s*—\s*(.+)"
            match = re.match(pattern, course_title)
            
            if match:
                #course_details["DEP_COURSE"] = match.group(1)
                course_details["ABBREV"] = str(info["ABBREV"].iloc[0]) + " " + match.group(2)
                course_details["NAME"] = match.group(3)
                course_details["ID"] = int(match.group(2))
            
            # CREDITS (range)
            course_credits = course.find('p', class_='courseblockcredits').text.strip()
            pattern = r"([0-9\-]+)\scredit"
            match = re.match(pattern, course_credits)
            if match:
                course_details["CREDITS"] = match.group(1)
                
            # More info section
                # Load "cb-extras" class from course
            course_extras = course.find('div', class_='cb-extras')
            if course_extras:
                extras = []
                for extra in course_extras.find_all('p', class_='courseblockextra noindent clearfix'):
                    label = extra.find('span', class_='cbextra-label')
                    data = extra.find('span', class_='cbextra-data')
                    if label and data:
                        extras.append(f"{label.text.strip()} {data.text.strip()}")
                course_details["EXTRAS"] = " | ".join(extras)
            else:
                course_details["EXTRAS"] = "None"
            
            
                
            # DESC - Not sure if I should include in df
            course_details["DESCRIPTION"] = course.find("p", class_="courseblockdesc noindent").text.strip()
           
            # append course
            courses.append(course_details)
        
        # return courses
        return pd.DataFrame(courses)
        
    else:
        print("Not in DataFrame")
        return pd.DataFrame()
        
courses_df = gather_courses("Mathematics")

courses_df

Unnamed: 0,CREDITS,EXTRAS,DESCRIPTION,ABBREV,NAME,ID
0,3,Requisites: Placement into MATH 96 | Repeatabl...,Covers the necessary mathematical tools needed...,,,
1,3,Requisites: MATH 96 or placement into MATH 112...,"Properties of elementary functions, such as po...",MATH 112,ALGEBRA,112.0
2,3,Requisites: MATH 112 or placement into MATH 11...,"Covers the graphs, properties and geometric si...",MATH 113,TRIGONOMETRY,113.0
3,5,Requisites: MATH 96 or placement into MATH 114...,The two semester sequence MATH 112-MATH 113 co...,MATH 114,ALGEBRA AND TRIGONOMETRY,114.0
4,2,Requisites: Enrolled in the Summer Collegiate ...,A preparation and introductory math course for...,MATH 118,SUMMER COLLEGIATE EXPERIENCE MATHEMATICS COURSE,118.0
...,...,...,...,...,...,...
153,1-3,Requisites: Graduate/professional standing or ...,Selected topics in Algebra.,MATH 941,SEMINAR-ALGEBRA,941.0
154,1-3,Requisites: Graduate/professional standing or ...,Selected topics in Topology.,MATH 951,SEMINAR IN TOPOLOGY,951.0
155,1-3,Requisites: Graduate/professional standing or ...,Selected topics in Number Theory.,MATH 967,SEMINAR IN NUMBER THEORY,967.0
156,1-3,Requisites: Graduate/professional standing or ...,Selected topics in Mathematical Logic.,MATH 975,SEMINAR-THE FOUNDATIONS OF MATHEMATICS,975.0


In [85]:
API_TOKEN = "9766d059e02f47c4a5fda3ccd4b83eca"

import requests
import pandas as pd

def get_gpa_info():
    """
    Fetches GPA data from MadGrades API.

    Returns:
        dict: Contains the GPA information from the API response.
    """
    
    # curl -H "Authorization: Token token=9766d059e02f47c4a5fda3ccd4b83eca" https://api.madgrades.com/v1/courses
    headers = {
        'Authorization': f'Token token={API_TOKEN}'
    }
    url = "https://api.madgrades.com/v1/courses"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        # Extracting GPA information from the JSON response.
        gpa_info = []
        for course in data.get('courses', []):
            if 'gpa' in course:
                gpa_info.append({
                    'course_id': course.get('id'),
                    'gpa': course.get('gpa')
                })
        return gpa_info
    else:
        print(f"Failed to fetch data: {response.status_code}")
        return None

# Call the function and display the results in the Jupyter Notebook
gpa_info = get_gpa_info()

if gpa_info:
    # Convert the list of dictionaries to a Pandas DataFrame for easier viewing
    df_gpa_info = pd.DataFrame(gpa_info)
    print(df_gpa_info)
else:
    print("No GPA information found.")


No GPA information found.


test_url = "https://api.madgrades.com/v1/courses"
headers = {
        'Authorization': 'Token token=9766d059e02f47c4a5fda3ccd4b83eca'
    }
response = requests.get(test_url, headers=headers)

if response.status_code == 200:
    print(response.json())  # Display the first two posts as a sample
else:
    print(response.status_code)

import requests

def fetch_course_data(department_abbr, course_number):
    base_url = "https://api.madgrades.com/v1/courses"
    headers = {
        'Authorization': 'Token token=9766d059e02f47c4a5fda3ccd4b83eca'
    }
    response = requests.get(base_url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        courses = data['results']
        print(courses)
        
        # Search for the course with the matching department abbreviation and course number
        for course in courses:
            for subject in course['subjects']:
                if subject['abbreviation'] == department_abbr and str(subject['code']) == str(course_number):
                    print(f"Course found: {course['name']}")
                    print(f"Details: {subject}")
                    return course  # Return the course details
        print("Course not found.")
    else:
        print(f"Failed to connect to the API. Status code: {response.status_code}")
        return None

# Example usage:
fetch_course_data('COMP SCI', 320)


import requests
import json

def fetch_all_courses(base_url):
    # List to store all course data
    all_courses = []
    
    # Start with the first page of the courses
    url = base_url
    headers = {
        'Authorization': 'Token token=9766d059e02f47c4a5fda3ccd4b83eca'
    }

    while url:
        # Make a request to the current page URL
        #print(f"Requesting: {url}")  
        response = requests.get(url, headers=headers)
        response.raise_for_status()  
        
        # Parse the JSON response
        data = response.json()
        
        # Append the 'results' (list of courses) to the all_courses list
        all_courses.extend(data.get('results', []))
        
        # Update the URL to the next page, if it exists
        url = data.get('nextPageUrl')  # This assumes the API uses 'next' as the key for pagination
        
        # Debug print to check if the next page URL is correct
        #print("Next page URL:", url)
    
    # Save the collected course data to a JSON file
    with open('all_courses.json', 'w', encoding='utf-8') as file:
        json.dump(all_courses, file, ensure_ascii=False, indent=4)
    
    print(f"Fetched {len(all_courses)} courses and saved to 'all_courses.json'")
    return all_courses

base_url = 'https://api.madgrades.com/v1/courses'
all_courses = fetch_all_courses(base_url)

In [113]:
test_url = "https://api.madgrades.com/v1/courses"
headers = {
        'Authorization': 'Token token=9766d059e02f47c4a5fda3ccd4b83eca'
    }
response = requests.get(test_url, headers=headers)

if response.status_code == 200:
    print(response.json())  # Display the first two posts as a sample
else:
    print(response.status_code)

{'currentPage': 1, 'totalPages': 445, 'totalCount': 11124, 'nextPageUrl': 'https://api.madgrades.com/v1/courses?page=2', 'results': [{'uuid': 'a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de', 'number': 1, 'name': 'Cooperative Education Program', 'names': ['Cooperative Education Prog'], 'subjects': [{'name': 'Biomedical Engineering', 'abbreviation': 'B M E', 'code': '207'}, {'name': 'Chemical and Biological Engineering', 'abbreviation': 'CBE', 'code': '220'}, {'name': 'Civil and Environmental Engineering', 'abbreviation': 'CIV ENGR', 'code': '240'}, {'name': 'Electrical and Computer Engineering', 'abbreviation': 'E C E', 'code': '320'}, {'name': 'Engineering Mechanics and Aerospace Engineering', 'abbreviation': 'E M A', 'code': '346'}, {'name': 'Geological Engineering', 'abbreviation': 'G L E', 'code': '418'}, {'name': 'Industrial and Systems Engineering', 'abbreviation': 'I SY E', 'code': '490'}, {'name': 'Mechanical Engineering', 'abbreviation': 'M E', 'code': '612'}, {'name': 'Materials Scienc

In [120]:
import json

with open("all_courses.json", 'r', encoding='utf-8') as file:
    courses = json.load(file)    
courses[1000]

{'uuid': '4128f4c5-6eed-3433-8c19-8513b7f3088e',
 'number': 214,
 'name': 'Literatures of Central Asia in Translation',
 'names': ['Lits of Cent Asia in Translatn'],
 'subjects': [{'name': 'Literature in Translation',
   'abbreviation': 'LITTRANS',
   'code': '551'}],
 'url': 'https://api.madgrades.com/v1/courses/4128f4c5-6eed-3433-8c19-8513b7f3088e'}

In [254]:
def madgrade_course(abbrev, code):
    """
    Searches for courses in the 'courses' list that have a subject with the given abbreviation
    and whose course number matches the given code. Returns course that matches

    Parameters:
    abbrev (str): The abbreviation of the subject to search for.
    code (str or int): The course number to match against the course's number.

    Returns:
    dict: First item of a list of courses where a matching subject with the given abbreviation and code is found.
    """
    found_courses = []
    
    for course in courses:
        number = course.get("number")
        #print(number)
        #print(course)
        for subject in course.get('subjects'):
            if int(number) == int(code) and subject.get("abbreviation") == abbrev:
                found_courses.append(course)
                break  
                
    return found_courses[0]

test = madgrade_course("ACCT I S", "401")
test

IndexError: list index out of range

In [250]:
test["url"]

'https://api.madgrades.com/v1/courses/d8c56964-7cbe-36cb-ab8c-15aa992708b8'

In [251]:
test_url = test["url"] + "/grades"
print(test_url)
headers = {
        'Authorization': 'Token token=9766d059e02f47c4a5fda3ccd4b83eca'
    }
response = requests.get(test_url, headers=headers)

if response.status_code == 200:
    grades = response.json()
    print(response.json()) 
else:
    print(response.status_code)

https://api.madgrades.com/v1/courses/d8c56964-7cbe-36cb-ab8c-15aa992708b8/grades
{'courseUuid': 'd8c56964-7cbe-36cb-ab8c-15aa992708b8', 'cumulative': {'total': 12, 'aCount': 12, 'abCount': 0, 'bCount': 0, 'bcCount': 0, 'cCount': 0, 'dCount': 0, 'fCount': 0, 'sCount': 0, 'uCount': 0, 'crCount': 0, 'nCount': 0, 'pCount': 0, 'iCount': 0, 'nwCount': 0, 'nrCount': 0, 'otherCount': 0}, 'courseOfferings': [{'termCode': 1244, 'cumulative': {'total': 0, 'aCount': 0, 'abCount': 0, 'bCount': 0, 'bcCount': 0, 'cCount': 0, 'dCount': 0, 'fCount': 0, 'sCount': 0, 'uCount': 0, 'crCount': 0, 'nCount': 0, 'pCount': 0, 'iCount': 0, 'nwCount': 0, 'nrCount': 0, 'otherCount': 0}, 'sections': [{'sectionNumber': 130, 'instructors': [{'id': 5443845, 'name': 'DANIEL LYNCH'}], 'total': 0, 'aCount': 0, 'abCount': 0, 'bCount': 0, 'bcCount': 0, 'cCount': 0, 'dCount': 0, 'fCount': 0, 'sCount': 0, 'uCount': 0, 'crCount': 0, 'nCount': 0, 'pCount': 0, 'iCount': 0, 'nwCount': 0, 'nrCount': 0, 'otherCount': 0}, {'section

In [252]:
grades["cumulative"]

grade_percentage = {}

for grade in grades["cumulative"]:
    score = round((grades["cumulative"][grade]/grades["cumulative"]["total"] * 100), 2)
    grade_percentage[grade] = score
    
grade_percentage

{'total': 100.0,
 'aCount': 100.0,
 'abCount': 0.0,
 'bCount': 0.0,
 'bcCount': 0.0,
 'cCount': 0.0,
 'dCount': 0.0,
 'fCount': 0.0,
 'sCount': 0.0,
 'uCount': 0.0,
 'crCount': 0.0,
 'nCount': 0.0,
 'pCount': 0.0,
 'iCount': 0.0,
 'nwCount': 0.0,
 'nrCount': 0.0,
 'otherCount': 0.0}

In [253]:
def filter_grades(grades):
    """
    Barebones implementation of a filtering system to remove unneeded counts from percentages
    """
    filtered = {}
    filtered["aCount"] = grades["aCount"]
    filtered["abCount"] = grades["abCount"]
    filtered["bCount"] = grades["bCount"]
    filtered["bcCount"] = grades["bcCount"]
    filtered["cCount"] = grades["cCount"]
    filtered["dCount"] = grades["dCount"]
    filtered["fCount"] = grades["fCount"]
    
    return filtered

def calculate_gpa(grades):
    """
    Calculate GPA using the GPA scale provided by UW-Madison, using a filtered list ONLY containing the letter
    grades
    """
    # Define the grade points
    grade_points = {
        'aCount': 4.0,
        'abCount': 3.5,
        'bCount': 3.0,
        'bcCount': 2.5,
        'cCount': 2.0,
        'dCount': 1.0,
        'fCount': 0.0
    }

    # Calculate the total weighted points and the total count of grades
    total_points = 0
    total_count = 0
    
    for grade, count in grades.items():
        total_points += count * grade_points[grade]
        total_count += count
    
    # Calculate GPA
    gpa = total_points / total_count if total_count > 0 else 0
    return round(gpa, 2)

filtered = filter_grades(grade_percentage)
gpa = calculate_gpa(filtered)
gpa

4.0