In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import networkx as nx 
import matplotlib.pyplot as plt
import re

In [2]:
CATALOG_URL = 'https://app.testudo.umd.edu/soc/202508/'
COURSE_PATTERN = re.compile(r"\b[A-Z]{4}\d{3}\b") 

In [3]:
def get_prereq(department):
    
    url = f"{CATALOG_URL}{department}"
    
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    
    courses = {}
    
    
    for course_div in soup.find_all("div", class_="course"):
        course_id = course_div.get("id")
        
        if not course_id: 
            continue
        
        # TODO: remove when done
        print(course_id)
        
        prereq_text = ""
        
        prereq_div = course_div.find("div", class_="approved-course-texts-container")
        if prereq_div: 
            prereq_section = prereq_div.find("strong", string="Prerequisite:")
            if prereq_section: 
                prereq_text = prereq_section.next_sibling.strip() if prereq_section.next_sibling else ""
                
        prerequisites = COURSE_PATTERN.findall(prereq_text)
        courses[course_id] = prerequisites
        
    
    return courses
        
    
    
    

In [4]:
# Example: Get prerequisites for INST, CMSC, and MATH
departments = ["INST", "CMSC", "MATH"]
umd_courses = {}

for dept in departments:
    umd_courses.update(get_prereq(dept))

INST101
INST104
INST123
INST126
INST151
INST152
INST153
INST154
INST155
INST156
INST201
INST204
INST204S
INST210
INST227
INST228
INST301
INST309
INST311
INST314
INST320
INST326
INST327
INST335
INST341
INST346
INST347
INST352
INST354
INST362
INST363
INST364
INST366
INST367
INST377
INST380
INST381
INST388E
INST388F
INST402
INST405
INST406
INST407
INST408I
INST408R
INST408V
INST408Z
INST410
INST414
INST441
INST442
INST447
INST451
INST453
INST454
INST455
INST461
INST462
INST464
INST466
INST490
INST600
INST604
INST607
INST608Q
INST610
INST611
INST613
INST615
INST620
INST627
INST630
INST631
INST639N
INST643
INST650
INST651
INST653
INST661
INST680
INST704
INST705
INST709
INST710
INST711
INST728D
INST728F
INST731
INST735
INST737
INST752
INST753
INST754
INST760
INST764
INST771
INST775
INST782
INST784
INST787
INST799
INST800
INST801
INST818
INST878D
INST898
INST899
CMSC100
CMSC106
CMSC122
CMSC125
CMSC131
CMSC132
CMSC133
CMSC216
CMSC250
CMSC298A
CMSC320
CMSC330
CMSC335
CMSC351
CMSC396H
CMSC411
CM

In [5]:
for key, value in umd_courses.items(): 
    print(f'Course: {key}, Prereq: {value} ')

Course: INST101, Prereq: [] 
Course: INST104, Prereq: [] 
Course: INST123, Prereq: [] 
Course: INST126, Prereq: ['STAT100'] 
Course: INST151, Prereq: [] 
Course: INST152, Prereq: [] 
Course: INST153, Prereq: [] 
Course: INST154, Prereq: [] 
Course: INST155, Prereq: [] 
Course: INST156, Prereq: [] 
Course: INST201, Prereq: [] 
Course: INST204, Prereq: [] 
Course: INST204S, Prereq: [] 
Course: INST210, Prereq: [] 
Course: INST227, Prereq: [] 
Course: INST228, Prereq: [] 
Course: INST301, Prereq: [] 
Course: INST309, Prereq: [] 
Course: INST311, Prereq: ['PSYC100', 'SOCY105'] 
Course: INST314, Prereq: ['INST126', 'GEOG276', 'STAT100', 'MATH115'] 
Course: INST320, Prereq: [] 
Course: INST326, Prereq: ['INST126', 'GEOG276'] 
Course: INST327, Prereq: ['INST126', 'GEOG276'] 
Course: INST335, Prereq: ['INST201', 'INST301', 'PSYC100'] 
Course: INST341, Prereq: ['INST311'] 
Course: INST346, Prereq: ['INST201', 'INST301', 'INST326', 'INST327'] 
Course: INST347, Prereq: ['INST346'] 
Course: INST35

In [6]:
df = pd.DataFrame(list(umd_courses.items()), columns=['Course', 'Prerequesites'])

df.to_csv("umd_course_prerequesites.csv", index=False)

df.head()

#TODO: get rid of the lists, make it so that every row has one prerequesitis

Unnamed: 0,Course,Prerequesites
0,INST101,[]
1,INST104,[]
2,INST123,[]
3,INST126,[STAT100]
4,INST151,[]


In [8]:
expanded_data = []

for course,prereqs in umd_courses.items(): 
    for prereq in prereqs:
        expanded_data.append((course,prereq))
        
df_expanded = pd.DataFrame(expanded_data, columns=['Course', 'Prerequisite'])

df_expanded.to_csv('umd_course_prerequisites_expanded.csv', index=False)

In [14]:
g = nx.DiGraph()

for _, row in df_expanded.iterrows():
    course = row["Course"]
    
    prereq = row["Prerequisite"]
    
    if prereq != "None": 
        g.add_edge(prereq, course)

print(g.number_of_nodes())
print(g.number_of_edges())

196
457


In [16]:
from pyvis.network import Network

net = Network(notebook=True, directed=True, height='800px', width='100%', bgcolor='#222222', font_color='white')
for node in g.nodes: 
    net.add_node(node, label=node, title=f"Course: {node}")
    

for edge in g.edges:
    net.add_edge(edge[0], edge[1])
    

net.show("umd_prereq_network.html")




umd_prereq_network.html
