In [22]:
import pandas as pd

In [23]:
# Read both semester listings in one pass; the file names are fixed inputs.
fall_data, spring_data = (
    pd.read_json(listing_path)
    for listing_path in ("CS_fall_2025.json", "CS_spring_2026.json")
)

In [24]:
# Wrap the loaded fall data in a DataFrame under the working name `fall`.
fall = pd.DataFrame(data=fall_data)

In [25]:
# Wrap the loaded spring data in a DataFrame under the working name `spring`.
spring = pd.DataFrame(data=spring_data)

### Removing Duplicate Courses

**Note that some courses have the same codes** *(independent studies, grad level courses, etc)*
#### Keep only unique course titles rather than just codes

In [26]:
# Within a semester, a course is identified by its (code, title) pair, because
# several distinct courses share one code. Extra rows for the same pair are dropped.
course_key = ["course_code", "title"]
fall = fall.drop_duplicates(subset=course_key)
spring = spring.drop_duplicates(subset=course_key)

Merge the two frames. Expect 51 rows from fall and 59 from spring (110 total before the cross-semester dedupe).

In [27]:
# Stack fall on top of spring, then order by course code.
# The sort must be stable: rows sharing a course_code keep their concat order
# (fall before spring), so a later keep-first de-duplication retains the same
# semester's row every time instead of whichever one an unstable sort put first.
all_courses = (
    pd.concat([fall, spring], ignore_index=True)
    .sort_values(by="course_code", kind="mergesort")
    .reset_index(drop=True)
)

### Now if 2 courses match by code and title, this means they are offered in both semesters. 
#### Update the 'semester' of one to Fall & Spring, delete the other. Note that 60 rows are duplicated across the two semesters (30 pairs); one row of each pair is kept, so the new total should be 80.

In [28]:
# Rows that share both title and course code form one group. A group with more
# than one row gets the combined label on every row; a single-row group keeps
# its own semester value.
# NOTE(review): the inputs are the fall 2025 and spring 2026 files, but the
# combined label only mentions 2026 - confirm the intended wording.
def _semester_label(group_semesters):
    if len(group_semesters) > 1:
        return "Fall and Spring 2026"
    return group_semesters.iloc[0]


semesters_by_course = all_courses.groupby(["title", "course_code"])["semester"]
all_courses["semester"] = semesters_by_course.transform(_semester_label)

In [29]:
# Collapse each (code, title) pair to a single row, keeping the first occurrence.
all_courses = all_courses.drop_duplicates(
    subset=["course_code", "title"],
    keep="first",
)

### NLP

#### adding 3 new fields to store course requirement information for graph building
prerequisites: holds logical structure for storing required courses  
other_requirements: holds extra relevant qualitative information for given course  
uncertain: whether the model is unsure of its output for the first 2 fields

In [52]:
from dotenv import load_dotenv
import os
import openai

# Pull settings in via python-dotenv, then hand the key (None when the variable
# is not set) to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [31]:
# Create the three result columns up front, empty, so the extraction loop can
# fill them in row by row.
for new_column in ("prerequisites", "other_requirements", "uncertain"):
    all_courses[new_column] = None

In [32]:
def extract_info(title, course_info):
    """Ask the chat model for structured prerequisite data for one course.

    The course text is embedded in a fixed instruction prompt and sent as a
    single user message with JSON-object response formatting and temperature 0.
    The reply body is parsed with ``json.loads`` and returned.

    Relies on the module-level names ``client`` (the object whose
    ``chat.completions.create`` is called) and ``json``; both must be defined
    before this function is called.

    Args:
        title: Course title; used only in the progress line printed after the
            request completes.
        course_info: Free-text course description that the model should
            analyse. Its first 15 characters are echoed in the progress line.

    Returns:
        The parsed JSON reply. The prompt asks for the keys
        ``"prerequisites"``, ``"other_requirements"`` and ``"uncertain"``,
        but the reply is not validated here.

    Raises:
        Whatever ``client.chat.completions.create`` or ``json.loads`` raise;
        nothing is caught.
    """
    # Literal braces in the schema examples are doubled because this is an
    # f-string; only {course_info} is interpolated.
    prompt = f"""
You are extracting structured prerequisite information from university course descriptions.

Rules for prerequisites:
- Represent prerequisites in JSON format using a nested AND/OR structure.
- If no prerequisites are listed, return null.
- Course codes must remain as strings, exactly as written (e.g., "MATH 2248").
- Only list valid course codes (e.g., "CS 1010", "MATH 2248") in the "prerequisites" field.
- Do NOT include any words, phrases, or text that are not course codes (e.g., "equivalent experience", "Junior standing").
- If a prerequisite is not a course code, move it to "other_requirements".
- Use this schema:
  - Single prerequisite: "CS 1010"
  - OR relationship: {{"OR": ["MATH 1010", "STAT 2010"]}}
  - AND relationship: {{"AND": ["MATH 1010", "CS 1210"]}}
  - Mixed logic: nest them, e.g.:
    {{"AND": ["MATH 2248", {{"OR": ["MATH 2522", "MATH 2544", "MATH 3201"]}}, "CS 1210"]}}
- "Recommended" courses are not prerequisites
- If the course information contains conflicting or ambiguous prerequisite information, 
still return the JSON, but also include a field "uncertain": true. 
Otherwise, set "uncertain": false.


Also extract "other_requirements" (like "Junior standing", "Department consent", "Open to degree and PACE students", "CS Majors Only") as a list of strings. If none, return [].

Return only valid JSON with three fields:
{{
  "prerequisites": ...,
  "other_requirements": ...,
  "uncertain": ...
}}

Now process this course description:

{course_info}
"""
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap and fast model
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # For consistency
        response_format={"type": "json_object"}
    )
    print("Completed: ", course_info[:15],"-", title)
    return json.loads(completion.choices[0].message.content)

In [53]:
from openai import OpenAI
# Module-level client that extract_info calls via client.chat.completions.create.
# NOTE(review): no key is passed here; presumably OpenAI() picks up
# OPENAI_API_KEY from the environment - confirm.
client = OpenAI()
import json

In [34]:
# One model request per course: assemble the free-text fields into a single
# description, then store the three extracted values back on the same row.
for idx, row in all_courses.iterrows():
    title = row['title']
    course_description = (
        f"Course: {row['course_code']} - {title} "
        f"Description: {row['description']} "
        f"More Description: {row['section_description']} "
        f"Section Expectations: {row['section_expectations']} "
        f"Evaluation: {row['evaluation']} "
        f"SOC Comments: {row['soc_comments']}"
    )
    result = extract_info(title, course_description)
    # "uncertain" falls back to False when the reply omits it; the other two
    # fall back to None.
    for field, fallback in (
        ("prerequisites", None),
        ("other_requirements", None),
        ("uncertain", False),
    ):
        all_courses.at[idx, field] = result.get(field, fallback)

Completed:  Course: CS 1080 - Intro to Web Site Dev
Completed:  Course: CS 1210 - Comp Prog I: for CS majors
Completed:  Course: CS 1210 - Computer Programming I
Completed:  Course: CS 1320 - Puzzles, Games & Algorithms
Completed:  Course: CS 1500 - Seminar for New CS Majors
Completed:  Course: CS 1640 - Discrete Structures
Completed:  Course: CS 1870 - Intro to Data Science
Completed:  Course: CS 1994 - Teaching Assistantship
Completed:  Course: CS 2100 - Intermediate Programming
Completed:  Course: CS 2210 - Computer Organization
Completed:  Course: CS 2240 - Data Struc & Algorithms
Completed:  Course: CS 2250 - Computability& Complexity
Completed:  Course: CS 2300 - Advanced Programming
Completed:  Course: CS 2500 - Intro to Database Systems
Completed:  Course: CS 2510 - Intro Artificial Intelligence
Completed:  Course: CS 2660 - Cybersecurity Principles
Completed:  Course: CS 2870 - Basics of Data Science
Completed:  Course: CS 2920 - SL:Girls Who Code
Completed:  Course: CS 2993 -

In [45]:
# row = all_courses.iloc[0]
# course_description = f"""Course: {row['course_code']} - {row['title']} Description: {row['description']} More Description: {row['section_description']} Section Expectations: {row['section_expectations']} Evaluation: {row['evaluation']} SOC Comments: {row['soc_comments']}"""
# title = row['title']
# result = extract_info(title, course_description)

In [43]:
# Print one summary line per course to eyeball the extracted prerequisites.
# The outer quotes are double so the single-quoted column keys inside the
# replacement fields also parse on Python versions before 3.12.
for idx, row in all_courses.iterrows():
    print(f"{row['course_code']} - {row['title']} | Prereqs: {row['prerequisites']}")

CS 1080 - Intro to Web Site Dev | Prereqs: None
CS 1210 - Comp Prog I: for CS majors | Prereqs: None
CS 1210 - Computer Programming I | Prereqs: None
CS 1320 - Puzzles, Games & Algorithms | Prereqs: None
CS 1500 - Seminar for New CS Majors | Prereqs: None
CS 1640 - Discrete Structures | Prereqs: {'AND': [{'OR': ['CS 1210', 'CS 2100']}, {'OR': ['MATH 1234', 'MATH 1242']}]}
CS 1870 - Intro to Data Science | Prereqs: None
CS 1994 - Teaching Assistantship | Prereqs: None
CS 2100 - Intermediate Programming | Prereqs: CS 1210
CS 2210 - Computer Organization | Prereqs: CS 2100
CS 2240 - Data Struc & Algorithms | Prereqs: {'AND': ['CS 2100']}
CS 2250 - Computability& Complexity | Prereqs: {'OR': ['CS 1640', 'MATH 2055']}
CS 2300 - Advanced Programming | Prereqs: CS 2240
CS 2500 - Intro to Database Systems | Prereqs: {'AND': ['CS 1210']}
CS 2510 - Intro Artificial Intelligence | Prereqs: CS 2240
CS 2660 - Cybersecurity Principles | Prereqs: CS 2210
CS 2870 - Basics of Data Science | Prereqs: {'

In [72]:
# all_courses[0:50][['title', 'prerequisites', 'other_requirements']]
# all_courses[0:50][['title', 'prerequisites', 'other_requirements']].to_csv("preview.csv", index=False)

In [36]:
# Show the first 50 rows.
all_courses.head(50)

Unnamed: 0,course_code,section,title,credit_hours,meeting_info,instructor,description,section_description,section_expectations,evaluation,soc_comments,semester,prerequisites,other_requirements,uncertain
0,CS 1080,"Section A, CRN 91695",Intro to Web Site Dev,3,TTh 4:25pm-5:40pm in JAMES M JEFFORDS HALL 127...,Murat Gungor,"Provides a strong foundation in HTML, CSS, ima...","Lays a foundation in website coding with HTML,...",Assumes no prior experience with programming.,"Quizzes, Active Learning Exercises, Lab Exerci...",Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
2,CS 1210,"Section F, CRN 92166",Comp Prog I: for CS majors,3,TTh 2:50pm-4:05pm in MARSH LIFE SCI 105 (8/25 ...,Sami Connolly,Introduction to algorithmic problem solving an...,Introduction to programming and computer scien...,Assumes no prior experience with programming. ...,,"Open to CS, CSIS, and DS majors only",Fall 2025,,"[Open to CS, CSIS, and DS majors only]",False
3,CS 1210,"Section B, CRN 90147",Computer Programming I,3,TTh 8:30am-9:45am in VOTEY BLDG 207 (8/25 to 1...,Sami Connolly,Introduction to algorithmic problem solving an...,Introduction to programming with Python.,Assumes no prior experience with programming.,"Weekly homework, lab and active learning exerc...",Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
5,CS 1320,"Section A, CRN 94171","Puzzles, Games & Algorithms",3,MWF 10:50am-11:40am in VOTEY BLDG 303 (8/25 to...,Lisa Dion,Introductory computer science through explorat...,"In this course, you will explore computational...",This course is taught in an active learning st...,This course is taught using a points-based sys...,Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
7,CS 1500,"Section A, CRN 90913",Seminar for New CS Majors,1,F 2:20pm-3:10pm in VOTEY BLDG 105 (8/25 to 12/5),Sami Connolly,A fun and accessible breadth-first introductio...,This course is intended to introduce new Compu...,You will be expected to attend each lecture pe...,Your final grade will be based on pre-class wr...,"CS, CSIS, or Data Science majors only",Fall 2025,,[Computer Science or Computer Science & Inform...,False
8,CS 1640,"Section A, CRN 90230",Discrete Structures,3,MWF 1:10pm-2pm in PERKINS BUILDING 003 (8/25 t...,Jeffrey Jewell,Introduction to analytic and formal methods of...,,,,Prereqs enforced by the system: (CS 1210 or 21...,Fall 2025,"{'AND': [{'OR': ['CS 1210', 'CS 2100']}, {'OR'...",[Open to Degree and PACE students],False
9,CS 1870,"Section A, CRN 92324",Intro to Data Science,3,MWF 1:10pm-2pm in VOTEY BLDG 209 (8/25 to 12/5),Roger Tenore,Basic techniques of data harvesting and cleani...,,,,Cross-listed with STAT 1870 A;\nTotal combined...,Fall 2025,,[Open to Degree and PACE students],False
10,CS 1994,,Teaching Assistantship,Varies by section,,,Assist in instruction of undergraduate compute...,,,,Varies by section,Fall and Spring 2025,,[Instructor permission required],False
12,CS 2100,,Intermediate Programming,Varies by section,,Jackie Horton,Intermediate programming concepts including co...,,,,Varies by section,Fall and Spring 2025,CS 1210,[],False
14,CS 2210,,Computer Organization,3,,Clayton Cafiero,Introduction to computer system organization i...,,,,Varies by section,Fall and Spring 2025,CS 2100,[],False


In [50]:
# Move independent studies out of the main table.
# A title counts as an independent study when it contains any of the literal
# markers below (case-sensitive; missing titles never match).
import re

remove_list = ['IS:', 'IS-', 'Ind Stdy', 'Independent Study']
pattern = '|'.join(map(re.escape, remove_list))

# Build the row mask once and reuse it for both the export and the drop.
is_independent_study = all_courses['title'].str.contains(pattern, case=True, na=False)

# Matching rows go to their own CSV ...
independent_studies = all_courses[is_independent_study]
independent_studies.to_csv('independent_studies.csv', index=False)

# ... and everything else stays in all_courses.
all_courses = all_courses[~is_independent_study]

In [49]:
# Show the first 50 rows.
all_courses.head(50)

Unnamed: 0,course_code,section,title,credit_hours,meeting_info,instructor,description,section_description,section_expectations,evaluation,soc_comments,semester,prerequisites,other_requirements,uncertain
0,CS 1080,"Section A, CRN 91695",Intro to Web Site Dev,3,TTh 4:25pm-5:40pm in JAMES M JEFFORDS HALL 127...,Murat Gungor,"Provides a strong foundation in HTML, CSS, ima...","Lays a foundation in website coding with HTML,...",Assumes no prior experience with programming.,"Quizzes, Active Learning Exercises, Lab Exerci...",Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
2,CS 1210,"Section F, CRN 92166",Comp Prog I: for CS majors,3,TTh 2:50pm-4:05pm in MARSH LIFE SCI 105 (8/25 ...,Sami Connolly,Introduction to algorithmic problem solving an...,Introduction to programming and computer scien...,Assumes no prior experience with programming. ...,,"Open to CS, CSIS, and DS majors only",Fall 2025,,"[Open to CS, CSIS, and DS majors only]",False
3,CS 1210,"Section B, CRN 90147",Computer Programming I,3,TTh 8:30am-9:45am in VOTEY BLDG 207 (8/25 to 1...,Sami Connolly,Introduction to algorithmic problem solving an...,Introduction to programming with Python.,Assumes no prior experience with programming.,"Weekly homework, lab and active learning exerc...",Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
5,CS 1320,"Section A, CRN 94171","Puzzles, Games & Algorithms",3,MWF 10:50am-11:40am in VOTEY BLDG 303 (8/25 to...,Lisa Dion,Introductory computer science through explorat...,"In this course, you will explore computational...",This course is taught in an active learning st...,This course is taught using a points-based sys...,Open to Degree and PACE students,Fall and Spring 2025,,[Open to Degree and PACE students],False
7,CS 1500,"Section A, CRN 90913",Seminar for New CS Majors,1,F 2:20pm-3:10pm in VOTEY BLDG 105 (8/25 to 12/5),Sami Connolly,A fun and accessible breadth-first introductio...,This course is intended to introduce new Compu...,You will be expected to attend each lecture pe...,Your final grade will be based on pre-class wr...,"CS, CSIS, or Data Science majors only",Fall 2025,,[Computer Science or Computer Science & Inform...,False
8,CS 1640,"Section A, CRN 90230",Discrete Structures,3,MWF 1:10pm-2pm in PERKINS BUILDING 003 (8/25 t...,Jeffrey Jewell,Introduction to analytic and formal methods of...,,,,Prereqs enforced by the system: (CS 1210 or 21...,Fall 2025,"{'AND': [{'OR': ['CS 1210', 'CS 2100']}, {'OR'...",[Open to Degree and PACE students],False
9,CS 1870,"Section A, CRN 92324",Intro to Data Science,3,MWF 1:10pm-2pm in VOTEY BLDG 209 (8/25 to 12/5),Roger Tenore,Basic techniques of data harvesting and cleani...,,,,Cross-listed with STAT 1870 A;\nTotal combined...,Fall 2025,,[Open to Degree and PACE students],False
10,CS 1994,,Teaching Assistantship,Varies by section,,,Assist in instruction of undergraduate compute...,,,,Varies by section,Fall and Spring 2025,,[Instructor permission required],False
12,CS 2100,,Intermediate Programming,Varies by section,,Jackie Horton,Intermediate programming concepts including co...,,,,Varies by section,Fall and Spring 2025,CS 1210,[],False
14,CS 2210,,Computer Organization,3,,Clayton Cafiero,Introduction to computer system organization i...,,,,Varies by section,Fall and Spring 2025,CS 2100,[],False


In [37]:
# Write the course table to disk without the index column.
# NOTE(review): prerequisites / other_requirements can hold dicts and lists;
# confirm that whatever reads this CSV parses their text form as intended.
all_courses.to_csv(path_or_buf='all_courses.csv', index=False)