In [27]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import networkx as nx 
import matplotlib.pyplot as plt
import re

In [28]:
# Global Variables
CATALOG_URL = 'https://app.testudo.umd.edu/soc/202501/' # Catalog for SPRING 2025 Classes
COURSE_PATTERN = re.compile(r"\b[A-Z]{4}\d{3}\b") 

In [None]:
def get_prereq(department):
    
    url = f"{CATALOG_URL}{department}" # host url 
    
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    
    # Courses dictionary to turn into a pandas dataframe
    courses = {}
    
    # Extracts each div containing relevant metadata and takes the pattern from the prerequesites metadata as a course
    for course_div in soup.find_all("div", class_="course"):
        course_id = course_div.get("id")
        
        if not course_id: 
            continue
        
        # TODO: remove when done
        print(course_id)
        
        prereq_text = ""
        
        prereq_div = course_div.find("div", class_="approved-course-texts-container")
        if prereq_div: 
            prereq_section = prereq_div.find("strong", string="Prerequisite:")
            if prereq_section: 
                prereq_text = prereq_section.next_sibling.strip() if prereq_section.next_sibling else ""
                
        prerequisites = COURSE_PATTERN.findall(prereq_text)
        courses[course_id] = prerequisites
        
    
    return courses
        
    
    
    

In [30]:
# Example: Get prerequisites for STEM Departments UMD offers
departments = [
    "AOSC", "ANSC", "ASTR", "BCHM", "BIOE", "BIOL", "BIPH", "CBMG", "CHBE", "CHEM",
    "CMSC", "ENCE", "ENAE", "ENEE", "ENFP", "ENMA", "ENME", "ENSP", "ENST", "ENTM",
    "GEOG", "GEOL", "MATH", "NEUR", "PHYS", "PLSC", "STAT", "INST", "DATA"
]
umd_courses = {}

for dept in departments:
    umd_courses.update(get_prereq(dept))

AOSC123
AOSC200
AOSC201
AOSC247
AOSC360
AOSC399
AOSC401
AOSC424
AOSC432
AOSC433
AOSC447
AOSC472
AOSC494
AOSC499
AOSC602
AOSC611
AOSC615
AOSC617
AOSC621
AOSC625
AOSC633
AOSC647
AOSC650
AOSC658Z
AOSC798
AOSC898
AOSC899
ANSC101
ANSC103
ANSC105
ANSC115
ANSC210
ANSC212
ANSC214
ANSC227
ANSC237
ANSC245
ANSC247
ANSC250
ANSC282
ANSC315
ANSC327
ANSC330
ANSC340
ANSC359
ANSC379
ANSC388
ANSC389
ANSC399
ANSC435
ANSC440
ANSC445
ANSC452
ANSC453
ANSC624
ANSC627
ANSC630
ANSC660
ANSC688J
ANSC688W
ANSC699
ANSC799
ANSC898
ANSC899
ASTR100
ASTR101
ASTR121
ASTR220
ASTR230
ASTR288
ASTR288B
ASTR288I
ASTR288T
ASTR300
ASTR315
ASTR320
ASTR330
ASTR340
ASTR380
ASTR399
ASTR406
ASTR435
ASTR498
ASTR615
ASTR670
ASTR688C
ASTR699
ASTR799
ASTR898
ASTR899
BCHM461
BCHM462
BCHM463
BCHM464
BCHM465
BCHM477
BCHM485
BCHM669C
BCHM669D
BCHM675
BCHM676
BCHM698
BCHM699
BCHM799
BCHM889A
BCHM889B
BCHM889D
BCHM889E
BCHM889M
BCHM898
BCHM899
BIOE120
BIOE121
BIOE221
BIOE232
BIOE241
BIOE331
BIOE340
BIOE371
BIOE372
BIOE386
BIOE389C
BIOE389F


In [None]:
for key, value in umd_courses.items(): 
    print(f'Course: {key}, Prereq: {value} ')
    
    # ISSUE: each row has multiple cell values

Course: AOSC123, Prereq: [] 
Course: AOSC200, Prereq: ['MATH107', 'MATH110', 'MATH115'] 
Course: AOSC201, Prereq: [] 
Course: AOSC247, Prereq: ['MATH140'] 
Course: AOSC360, Prereq: ['MATH107', 'MATH113', 'MATH115'] 
Course: AOSC399, Prereq: [] 
Course: AOSC401, Prereq: ['AOSC400', 'AOSC200', 'MATH141', 'PHYS161', 'PHYS171'] 
Course: AOSC424, Prereq: ['PHYS171', 'PHYS161', 'MATH141'] 
Course: AOSC432, Prereq: ['AOSC431'] 
Course: AOSC433, Prereq: ['CHEM131', 'CHEM135', 'CHEM146', 'MATH241'] 
Course: AOSC447, Prereq: ['MATH140'] 
Course: AOSC472, Prereq: ['AOSC432', 'AOSC600', 'AOSC610', 'AOSC470'] 
Course: AOSC494, Prereq: ['AOSC431', 'AOSC432'] 
Course: AOSC499, Prereq: [] 
Course: AOSC602, Prereq: [] 
Course: AOSC611, Prereq: [] 
Course: AOSC615, Prereq: [] 
Course: AOSC617, Prereq: ['AOSC610'] 
Course: AOSC621, Prereq: ['MATH462'] 
Course: AOSC625, Prereq: [] 
Course: AOSC633, Prereq: ['CHEM131', 'CHEM135', 'CHEM146'] 
Course: AOSC647, Prereq: ['MATH140'] 
Course: AOSC650, Prereq: []

In [32]:
df = pd.DataFrame(list(umd_courses.items()), columns=['Course', 'Prerequesites'])

df.to_csv("umd_course_prerequesites.csv", index=False)

df.head()


Unnamed: 0,Course,Prerequesites
0,AOSC123,[]
1,AOSC200,"[MATH107, MATH110, MATH115]"
2,AOSC201,[]
3,AOSC247,[MATH140]
4,AOSC360,"[MATH107, MATH113, MATH115]"


In [38]:
expanded_data = []

for course,prereqs in umd_courses.items(): 
    for prereq in prereqs:
        expanded_data.append((course,prereq))
        
df_expanded = pd.DataFrame(expanded_data, columns=['Course', 'Prerequisite'])

df_expanded.to_csv('umd_course_prerequisites_expanded.csv', index=False)

df_expanded.head()

Unnamed: 0,Course,Prerequisite
0,AOSC200,MATH107
1,AOSC200,MATH110
2,AOSC200,MATH115
3,AOSC247,MATH140
4,AOSC360,MATH107


In [34]:
g = nx.DiGraph()

for _, row in df_expanded.iterrows():
    course = row["Course"]
    
    prereq = row["Prerequisite"]
    
    if prereq != "None": 
        g.add_edge(prereq, course)

print(g.number_of_nodes())
print(g.number_of_edges())

724
1454


In [35]:
# Centrality Measures
in_degree = nx.in_degree_centrality(g)
betweenness = nx.betweenness_centrality(g)
closeness = nx.closeness_centrality(g)

# Create DataFrame
centrality_df = pd.DataFrame({
    "Course": list(in_degree.keys()),
    "In-Degree Centrality (Importance)" : list(in_degree.values()),
    "Betweenness Centrality (Chokepoint)" : list(betweenness.values()),
    "Closeness Centrality (Efficiency)" : list(closeness.values())
})

# Sort the dataframe by chokepoint and importance Top 20
bottleneck_courses = centrality_df.sort_values(by=['Betweenness Centrality (Chokepoint)','In-Degree Centrality (Importance)'], ascending=False).head(20)

bottleneck_courses.to_csv("umd_stem_bottleneck_courses.csv", index=False)

bottleneck_courses


Unnamed: 0,Course,In-Degree Centrality (Importance),Betweenness Centrality (Chokepoint),Closeness Centrality (Efficiency)
4,MATH140,0.001383,0.001411,0.00249
10,MATH141,0.001383,0.001324,0.002459
3,MATH115,0.004149,0.001137,0.002766
153,MATH120,0.004149,0.000302,0.003112
20,MATH241,0.001383,0.000293,0.00247
107,MATH246,0.001383,0.000292,0.00247
182,CMSC216,0.002766,0.00028,0.003987
187,CMSC351,0.002766,0.000272,0.004481
161,CHEM241,0.004149,0.00027,0.008215
185,CMSC330,0.002766,0.000256,0.004481


In [None]:
from pyvis.network import Network

net = Network(notebook=True, directed=True, height='800px', width='100%', bgcolor='white', font_color='black')
net.toggle_physics(True)
for node in g.nodes:
    
    # conditions to color nodes that are in top 20 bottleneck courses
    if node in bottleneck_courses['Course'].values:
        net.add_node(node, label=node, title=f"BOTTLENECK: {node}", color = 'red', font={"size": 16})
    else:
        
        net.add_node(node, label=node, title=f"{node} (STEM Course)", color = 'blue',font={"size": 14})
        
    
for edge in g.edges:
    net.add_edge(edge[0], edge[1])
    

net.show("umd_prereq_network.html")