In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import networkx as nx 
import matplotlib.pyplot as plt
import re

In [4]:
CATALOG_URL = 'https://app.testudo.umd.edu/soc/202508/'
COURSE_PATTERN = re.compile(r"\b[A-Z]{4}\d{3}\b") 

In [5]:
def get_prereq(department):
    
    url = f"{CATALOG_URL}{department}"
    
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    
    courses = {}
    
    
    for course_div in soup.find_all("div", class_="course"):
        course_id = course_div.get("id")
        
        if not course_id: 
            continue
        
        # TODO: remove when done
        print(course_id)
        
        prereq_text = ""
        
        prereq_div = course_div.find("div", class_="approved-course-texts-container")
        if prereq_div: 
            prereq_section = prereq_div.find("strong", string="Prerequisite:")
            if prereq_section: 
                prereq_text = prereq_section.next_sibling.strip() if prereq_section.next_sibling else ""
                
        prerequisites = COURSE_PATTERN.findall(prereq_text)
        courses[course_id] = prerequisites
        
    
    return courses
        
    
    
    

In [14]:
# Example: Get prerequisites for STEM Departments UMD offers
departments = [
    "AOSC", "ANSC", "ASTR", "BCHM", "BIOE", "BIOL", "BIPH", "CBMG", "CHBE", "CHEM",
    "CMSC", "ENCE", "ENAE", "ENEE", "ENFP", "ENMA", "ENME", "ENSP", "ENST", "ENTM",
    "GEOG", "GEOL", "MATH", "NEUR", "PHYS", "PLSC", "STAT", "INST", "DATA"
]
umd_courses = {}

for dept in departments:
    umd_courses.update(get_prereq(dept))

AOSC123
AOSC200
AOSC201
AOSC375
AOSC400
AOSC420
AOSC431
AOSC440
AOSC470
AOSC493
AOSC494
AOSC499
AOSC600
AOSC610
AOSC620
AOSC642
AOSC652
AOSC670
AOSC680
AOSC798
AOSC898
AOSC899
ANSC101
ANSC103
ANSC105
ANSC201
ANSC210
ANSC232
ANSC242
ANSC246
ANSC252
ANSC255
ANSC260
ANSC270
ANSC314
ANSC359
ANSC371
ANSC388
ANSC389
ANSC399
ANSC436
ANSC437
ANSC446
ANSC447
ANSC450
ANSC454
ANSC455
ANSC460
ANSC624
ANSC625
ANSC660
ANSC688I
ANSC688O
ANSC699
ANSC799
ANSC898
ANSC899
ASTR100
ASTR101
ASTR120
ASTR230
ASTR288
ASTR288B
ASTR288I
ASTR310
ASTR330
ASTR340
ASTR350
ASTR380
ASTR399
ASTR421
ASTR422
ASTR498
ASTR601
ASTR610
ASTR622
ASTR688B
ASTR695
ASTR699
ASTR799
ASTR898
ASTR899
BCHM461
BCHM462
BCHM463
BCHM464
BCHM465
BCHM477
BCHM661
BCHM662
BCHM671
BCHM698
BCHM699
BCHM799
BCHM889A
BCHM889C
BCHM889D
BCHM889F
BCHM889J
BCHM889M
BCHM889P
BCHM898
BCHM899
BIOE120
BIOE121
BIOE221
BIOE232
BIOE241
BIOE246
BIOE331
BIOE340
BIOE371
BIOE372
BIOE389M
BIOE399
BIOE399H
BIOE411
BIOE413
BIOE420
BIOE437
BIOE442
BIOE453
BIOE457
BI

In [15]:
for key, value in umd_courses.items(): 
    print(f'Course: {key}, Prereq: {value} ')

Course: AOSC123, Prereq: [] 
Course: AOSC200, Prereq: ['MATH107', 'MATH110', 'MATH115'] 
Course: AOSC201, Prereq: [] 
Course: AOSC375, Prereq: ['MATH120'] 
Course: AOSC400, Prereq: ['PHYS171', 'PHYS161', 'MATH141'] 
Course: AOSC420, Prereq: ['MATH141', 'PHYS141'] 
Course: AOSC431, Prereq: ['PHYS171', 'PHYS161', 'MATH141'] 
Course: AOSC440, Prereq: ['PHYS171', 'PHYS161', 'AOSC401', 'GEOG301'] 
Course: AOSC470, Prereq: ['AOSC431', 'AOSC432'] 
Course: AOSC493, Prereq: [] 
Course: AOSC494, Prereq: ['AOSC431', 'AOSC432'] 
Course: AOSC499, Prereq: [] 
Course: AOSC600, Prereq: ['AOSC610'] 
Course: AOSC610, Prereq: ['MATH462'] 
Course: AOSC620, Prereq: ['MATH461'] 
Course: AOSC642, Prereq: ['PHYS171', 'PHYS161', 'AOSC401', 'GEOG301'] 
Course: AOSC652, Prereq: ['PHYS141', 'MATH241'] 
Course: AOSC670, Prereq: [] 
Course: AOSC680, Prereq: [] 
Course: AOSC798, Prereq: [] 
Course: AOSC898, Prereq: [] 
Course: AOSC899, Prereq: [] 
Course: ANSC101, Prereq: [] 
Course: ANSC103, Prereq: ['ANSC101'] 
Co

In [None]:
df = pd.DataFrame(list(umd_courses.items()), columns=['Course', 'Prerequesites'])

df.to_csv("umd_course_prerequesites.csv", index=False)

df.head()


Unnamed: 0,Course,Prerequesites
0,AOSC123,[]
1,AOSC200,"[MATH107, MATH110, MATH115]"
2,AOSC201,[]
3,AOSC375,[MATH120]
4,AOSC400,"[PHYS171, PHYS161, MATH141]"


In [17]:
expanded_data = []

for course,prereqs in umd_courses.items(): 
    for prereq in prereqs:
        expanded_data.append((course,prereq))
        
df_expanded = pd.DataFrame(expanded_data, columns=['Course', 'Prerequisite'])

df_expanded.to_csv('umd_course_prerequisites_expanded.csv', index=False)

In [18]:
g = nx.DiGraph()

for _, row in df_expanded.iterrows():
    course = row["Course"]
    
    prereq = row["Prerequisite"]
    
    if prereq != "None": 
        g.add_edge(prereq, course)

print(g.number_of_nodes())
print(g.number_of_edges())

665
1407


In [22]:
# Centrality Measures
in_degree = nx.in_degree_centrality(g)
betweenness = nx.betweenness_centrality(g)
closeness = nx.closeness_centrality(g)

# Create DataFrame
centrality_df = pd.DataFrame({
    "Course": list(in_degree.keys()),
    "In-Degree Centrality (Importance)" : list(in_degree.values()),
    "Betweenness Centrality (Chokepoint)" : list(betweenness.values()),
    "Closeness Centrality (Efficiency)" : list(closeness.values())
})

# Sort the dataframe by chokepoint and importance Top 20
bottleneck_courses = centrality_df.sort_values(by=['Betweenness Centrality (Chokepoint)','In-Degree Centrality (Importance)'], ascending=False).head(20)

bottleneck_courses.to_csv("umd_stem_bottleneck_courses.csv", index=False)

bottleneck_courses


Unnamed: 0,Course,In-Degree Centrality (Importance),Betweenness Centrality (Chokepoint),Closeness Centrality (Efficiency)
55,MATH140,0.001506,0.00189,0.002711
9,MATH141,0.001506,0.001518,0.002677
3,MATH115,0.004518,0.001509,0.003012
103,MATH246,0.001506,0.000414,0.002689
4,MATH120,0.004518,0.000347,0.003389
44,CHEM231,0.010542,0.000346,0.010327
205,MATH340,0.003012,0.000335,0.003765
149,CHEM241,0.004518,0.000321,0.009413
151,CHEM242,0.004518,0.000302,0.009413
564,STAT100,0.00753,0.000297,0.006275


In [26]:
from pyvis.network import Network

net = Network(notebook=True, directed=True, height='800px', width='100%', bgcolor='#222222', font_color='white')
net.toggle_physics(True)
for node in g.nodes:
    
    # conditions to color nodes that are in top 20 bottleneck courses
    if node in bottleneck_courses['Course'].values:
        net.add_node(node, label=node, title=f"BOTTLENECK: {node}", color = 'red', font={"size": 16})
    else:
        
        net.add_node(node, label=node, title=f"{node} (STEM Course)", color = 'blue',font={"size": 14})
        
    
for edge in g.edges:
    net.add_edge(edge[0], edge[1])
    

net.show("umd_prereq_network.html")


 

umd_prereq_network.html
