In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

#### 1. Read the 2022 course schedule into a DataFrame

In [88]:
# Parse the saved schedule page into a BeautifulSoup tree.
with open('Spring 2022 CS course schedule.html') as schedule_file:
    dom = BeautifulSoup(schedule_file, 'html.parser')

# Cells matching this selector are sorted into number-sections and course
# names; any other cell is skipped (see the checks inside the loop).
schedule1 = dom.select('.col-xs-12.col-sm-3')
number_section = []
course_name = []
for cell in schedule1:
    # Drop zero-width spaces and collapse whitespace runs to single spaces.
    text = ' '.join(cell.text.replace('\u200b', '').split())
    if '-UA' in text or '-GA' in text:
        # Keep only the part that precedes the first "(".
        number_section.append(text.split('(')[0])
        continue
    # A cell whose interior (first and last characters removed) is purely
    # numeric is ignored; everything else is recorded as a course name.
    if not text[1:-1].isnumeric():
        course_name.append(text)

# Course number: the first 11 characters of a 16-character entry,
# otherwise the first 12 characters.
number_schedule = [
    entry[:11] if len(entry) == 16 else entry[:12]
    for entry in number_section
]

# Find Instructor and Time
# The "col-sm-2" cells are consumed in groups of three: the first cell of each
# group is taken as the instructor, the second as the meeting time, and the
# third is not used.
schedule2 = dom.select('.col-xs-12.col-sm-2')

# Both lists are built from `schedule2` itself so the cell runs on a fresh
# kernel instead of relying on a leftover variable named `schedule`.
# getText("|", strip=True) joins the stripped text fragments of a cell with "|".
instructor = [cell.getText("|", strip=True) for cell in schedule2[0::3]]
time = [cell.getText("|", strip=True) for cell in schedule2[1::3]]
        
# Put Number-Section, Name, Instructor, Time and Number into a dict and convert it into a dataframe
di_schedule = {
    'Number-Section': number_section,
    'Name': course_name,
    'Instructor': instructor,
    'Time': time,
    'Number': number_schedule,
}
# The column order is taken from `di_schedule` (the dict defined just above),
# not from an unrelated `di` variable that does not exist on a fresh kernel.
# pandas raises ValueError here if the five lists differ in length.
schedule_df = pd.DataFrame(di_schedule, columns=list(di_schedule))
display(schedule_df)

Unnamed: 0,Number-Section,Name,Instructor,Time,Number
0,CSCI-GA.1144-001,PAC II*,Hasan Aljabbouli,T 6:00-8:30PM,CSCI-GA.1144
1,CSCI-GA.1144-002,PAC II Recitation,Anish Mahishi,R 7:10-8:00PM,CSCI-GA.1144
2,CSCI-GA.1170-001 DS-GA.1170,Fundamental Algorithms,Chee Yap,T 7:10-9:10PM,CSCI-GA.1170
3,CSCI-GA.1170-002 DS-GA.1170,Fundamental Algorithms Recitation,Eli Goldin,R 8:10-9:00PM,CSCI-GA.1170
4,CSCI-GA.1170-003 DS-GA.1170,Fundamental Algorithms Recitation,Zihan Feng,R 8:10-9:00PM,CSCI-GA.1170
...,...,...,...,...,...
154,CSCI-UA.0480-051,Special Topics: Parallel Computing,Mohamed Zahran,TR 2:00-3:15PM,CSCI-UA.0480
155,CSCI-UA.0480-057,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM,CSCI-UA.0480
156,CSCI-UA.0480-061,Special Topics: Open Source Software Development,Joanna Klukowska,TR 12:30-1:45PM,CSCI-UA.0480
157,CSCI-UA.0480-069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM,CSCI-UA.0480


#### 2. Read the 2022 course catalog into a DataFrame

In [91]:
# Parse the saved catalog page into a BeautifulSoup tree.
with open('course_catalog.html') as catalog_file:
    dom = BeautifulSoup(catalog_file, 'html.parser')

# Each selected <li> is read through its first three <p> elements:
#   1st -> course number (first whitespace-separated token, zero-width spaces removed)
#   2nd -> points (the text before the first ".")
#   3rd -> prerequisites (with the "Prerequisites:" label removed)
catalog = dom.select('.courses .courses-listing li')
number_catolog = []
prereqs_catolog = []
points_catolog = []
for entry in catalog:
    paras = entry.select('p')
    number_catolog.append(paras[0].text.strip().replace('\u200b', '').split()[0])
    points_catolog.append(paras[1].text.strip().split('.')[0])
    prereqs_catolog.append(paras[2].text.strip().replace('Prerequisites:', ''))

# Assemble the parsed catalog lists into a DataFrame with columns in
# Number / Prereqs / Points order.
di_catolog = dict(
    Number=number_catolog,
    Prereqs=prereqs_catolog,
    Points=points_catolog,
)
catolog_df = pd.DataFrame(di_catolog, columns=list(di_catolog))
display(catolog_df)

Unnamed: 0,Number,Prereqs,Points
0,CSCI-GA.1133,,4 Points
1,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4 Points
2,CSCI-GA.1170,At least one year of experience with a high-l...,3 Points
3,CSCI-GA.1180,,3 Points
4,CSCI-GA.2110,Students taking this class should already hav...,3 Points
...,...,...,...
94,CSCI-UA.0897,Restricted to declared computer science major...,1 - 4 Points
95,CSCI-UA.0898,Restricted to declared computer science major...,1 - 4 Points
96,CSCI-UA.0997,Permission of the department. Does not satisf...,1 - 4 Points
97,CSCI-UA.0998,Permission of the department. Does not satisf...,1 - 4 Points


#### 3. Put together both DataFrames

In [96]:
# Left join on "Number": every schedule row is kept, and the catalog's
# Prereqs/Points are attached wherever the course number matches.
merged_left = schedule_df.merge(catolog_df, how='left', on='Number')
display(merged_left)

Unnamed: 0,Number-Section,Name,Instructor,Time,Number,Prereqs,Points
0,CSCI-GA.1144-001,PAC II*,Hasan Aljabbouli,T 6:00-8:30PM,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4 Points
1,CSCI-GA.1144-002,PAC II Recitation,Anish Mahishi,R 7:10-8:00PM,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4 Points
2,CSCI-GA.1170-001 DS-GA.1170,Fundamental Algorithms,Chee Yap,T 7:10-9:10PM,CSCI-GA.1170,At least one year of experience with a high-l...,3 Points
3,CSCI-GA.1170-002 DS-GA.1170,Fundamental Algorithms Recitation,Eli Goldin,R 8:10-9:00PM,CSCI-GA.1170,At least one year of experience with a high-l...,3 Points
4,CSCI-GA.1170-003 DS-GA.1170,Fundamental Algorithms Recitation,Zihan Feng,R 8:10-9:00PM,CSCI-GA.1170,At least one year of experience with a high-l...,3 Points
...,...,...,...,...,...,...,...
154,CSCI-UA.0480-051,Special Topics: Parallel Computing,Mohamed Zahran,TR 2:00-3:15PM,CSCI-UA.0480,Topics determine prerequisites.,4 Points
155,CSCI-UA.0480-057,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM,CSCI-UA.0480,Topics determine prerequisites.,4 Points
156,CSCI-UA.0480-061,Special Topics: Open Source Software Development,Joanna Klukowska,TR 12:30-1:45PM,CSCI-UA.0480,Topics determine prerequisites.,4 Points
157,CSCI-UA.0480-069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM,CSCI-UA.0480,Topics determine prerequisites.,4 Points


#### 4. Conclusion

The schedule does not include every course in the catalog, so we left-merge the two tables on `Number`: every scheduled section is kept, and the matching catalog prerequisites and points are attached to it.