# Part 1 - Combine Sp21 Course Info with Requirements

In [1]:
import pandas as pd
from bs4 import BeautifulSoup

## 1. Read the 2021 course schedule into a DataFrame

In [2]:
with open('./schedule.html') as p:
    dom = BeautifulSoup(p, "html.parser")

#### Anomaly handling:
1. For courses listed with two course numbers, only the first course number is kept.
2. For courses with multiple instructors, all instructors are stored together in a single field.

In [3]:
# Build one record per schedule listing from its first four <span> elements:
# number-section, name, instructor, and meeting time.
divs = dom.select('.schedule-listing li div')
table = []
for div in divs:
    spans = div.select('span')
    # Listings that do not provide all four fields are skipped.
    if len(spans) < 4:
        continue
    number_text, name_text, instructor_text, time_text = (
        span.get_text().strip() for span in spans[:4]
    )
    row = {
        # Drop zero-width spaces, keep the first 17 characters, then remove newlines.
        'Number-Section': number_text.replace('\u200b','')[:17].replace('\n', ''),
        'Name': name_text.replace('\n', ' ').replace('*', ''),
        'Instructor': instructor_text.replace('Office Hours', '').replace('\n', ' '),
        'Time': time_text,
    }
    table.append(row)
schedule = pd.DataFrame(table)
schedule

Unnamed: 0,Number-Section,Name,Instructor,Time
0,CSCI-GA.1144-001,PAC II,Mohamed Zahran,T 6:00-8:30PM
1,CSCI-GA.1144-002,PAC II Recitation,Gurkirat Singh Bajwa,R 7:10-8:00PM
2,CSCI-GA.1170-001,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:00PM
3,CSCI-GA.1170-002,Fundamental Algorithms Recitation,Alex Bienstock ...,R 8:10-9:00PM
4,CSCI-GA.1170-003,Fundamental Algorithms Recitation,Harish Karthikeyan,R 8:10-9:00PM
...,...,...,...,...
147,CSCI-UA.0480-057,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM
148,CSCI-UA.0480-069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 3:30-4:45PM
149,CSCI-UA.0480-521,Special Topics: Algorithmic Problem Solving,Joanna Klukowska,MW 11:00-12:15PM
150,CSCI-UA.0480-522,Special Topics: Algorithmic Problem Solving,Kunal Khatri,W 3:30-4:45PM


### Break apart the `Number-Section` column into two separate columns: `Number` and `Section`

In [4]:
# Split values such as "CSCI-GA.1144-001" into the course number and the section.
# The dot is escaped so it only matches a literal "." rather than any character.
# The guard lets the cell be re-run safely: once the combined column has been
# dropped, the split is skipped instead of raising a KeyError.
if 'Number-Section' in schedule.columns:
    schedule[['Number', 'Section']] = schedule['Number-Section'].str.extract(r'(\w+-\w+\.\d+)-(\d+)')
    schedule = schedule.drop(columns='Number-Section')
# Put the two new columns first.
schedule = schedule.reindex(columns=['Number', 'Section', 'Name', 'Instructor', 'Time'])

In [5]:
# Summarize the schedule's columns, dtypes, and non-null counts.
schedule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Number      152 non-null    object
 1   Section     152 non-null    object
 2   Name        152 non-null    object
 3   Instructor  152 non-null    object
 4   Time        152 non-null    object
dtypes: object(5)
memory usage: 6.1+ KB


In [6]:
# Preview the first rows of the schedule.
schedule.head()

Unnamed: 0,Number,Section,Name,Instructor,Time
0,CSCI-GA.1144,1,PAC II,Mohamed Zahran,T 6:00-8:30PM
1,CSCI-GA.1144,2,PAC II Recitation,Gurkirat Singh Bajwa,R 7:10-8:00PM
2,CSCI-GA.1170,1,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:00PM
3,CSCI-GA.1170,2,Fundamental Algorithms Recitation,Alex Bienstock ...,R 8:10-9:00PM
4,CSCI-GA.1170,3,Fundamental Algorithms Recitation,Harish Karthikeyan,R 8:10-9:00PM


In [7]:
# Preview the last rows of the schedule.
schedule.tail()

Unnamed: 0,Number,Section,Name,Instructor,Time
147,CSCI-UA.0480,57,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM
148,CSCI-UA.0480,69,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 3:30-4:45PM
149,CSCI-UA.0480,521,Special Topics: Algorithmic Problem Solving,Joanna Klukowska,MW 11:00-12:15PM
150,CSCI-UA.0480,522,Special Topics: Algorithmic Problem Solving,Kunal Khatri,W 3:30-4:45PM
151,CSCI-UA.0480,523,Special Topics: Algorithmic Problem Solving,Samasth Ananda,W 3:30-4:45PM


In [8]:
# Spot-check five random rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
schedule.sample(5, random_state=0)

Unnamed: 0,Number,Section,Name,Instructor,Time
143,CSCI-UA.0479,1,Data Management and Analysis,Joseph Versoza,MW 11:00-12:15PM
69,CSCI-UA.0002,1,Intro To Computer Programming (No Prior Experi...,Shaheer Mohammed Haroon,MW 8:00-9:15AM
124,CSCI-UA.0201,61,Computer Systems Organization - Recitation,Aishwarya Rajan,T 11:00-12:15PM
137,CSCI-UA.0380,1,Topics of General Interest: Drawing on the Web,Joshua Clayton,TR 9:30-10:45AM
18,CSCI-GA.2433,11,Database Systems,Xiaoyang Sean Wang,M 3:45-5:35PM


## 2. Read the 2021 course catalog into a DataFrame

In [9]:
with open('./catalog.html') as p2:
    dom2 = BeautifulSoup(p2, "html.parser")

In [10]:
# Build one record per catalog entry from its first three <p> elements:
# course number, points, and prerequisites.
lis = dom2.select('.courses-listing li')
table2 = []
for li in lis:
    paragraphs = li.select('p')
    row = {
        'Number': paragraphs[0].get_text().replace('\n','').strip(),
        # Remove the leading label so only the prerequisite text remains.
        'Prereqs': paragraphs[2].get_text().replace('Prerequisites: ', '').strip(),
        'Points': paragraphs[1].get_text().strip(),
    }
    table2.append(row)
catalog = pd.DataFrame(table2)

In [11]:
# Keep only the numeric points value or range (e.g. "4", "1-3", "1 - 4").
# Whole numbers are matched on both sides of the dash and nothing after the
# final digit is captured, so multi-digit bounds are not truncated and the
# values carry no trailing whitespace. expand=False returns a Series.
catalog['Points'] = catalog['Points'].str.extract(r'(\d+(?:\s?-\s?\d+)?)', expand=False)
# Keep only the course number; the dot is escaped so it matches a literal ".".
catalog['Number'] = catalog['Number'].str.extract(r'(\w+-\w+\.\d+)', expand=False)

#### Anomaly handling:
Manually set the points of `CSCI-GA.3813` to `1-3 (MS), 1-12 (PhD)`, because the single-range pattern used above cannot represent both ranges.

In [12]:
# Locate the first catalog row for CSCI-GA.3813 and overwrite its points
# with the combined MS/PhD ranges.
matches_course = catalog['Number'] == 'CSCI-GA.3813'
row_index = catalog.index[matches_course][0]
catalog.loc[row_index, 'Points'] = '1-3 (MS), 1-12 (PhD)'

In [13]:
# Summarize the catalog's columns, dtypes, and non-null counts.
catalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Number   100 non-null    object
 1   Prereqs  100 non-null    object
 2   Points   99 non-null     object
dtypes: object(3)
memory usage: 2.5+ KB


In [14]:
# Preview the first rows of the catalog.
catalog.head()

Unnamed: 0,Number,Prereqs,Points
0,CSCI-GA.1133,,4
1,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,At least one year of experience with a high-le...,3
3,CSCI-GA.1180,,3
4,CSCI-GA.2110,Students taking this class should already have...,3


In [15]:
# Preview the last rows of the catalog.
catalog.tail()

Unnamed: 0,Number,Prereqs,Points
95,CSCI-UA.0897,Restricted to declared computer science majors...,1 - 4
96,CSCI-UA.0898,Restricted to declared computer science majors...,1 - 4
97,CSCI-UA.0997,Permission of the department. Does not satisfy...,1 - 4
98,CSCI-UA.0998,Permission of the department. Does not satisfy...,1 - 4
99,FRSEM-UA.0597,"Some programming experience in Python, Java, J...",4


In [16]:
# Spot-check five random rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
catalog.sample(5, random_state=0)

Unnamed: 0,Number,Prereqs,Points
21,CSCI-GA.2437,"Prerequisites include experience with Hadoop, ...",3
34,CSCI-GA.2620,"CSCI-GA 1170, CSCI-GA 2110, and CSCI-GA 2250.",3
47,CSCI-GA.3110,Permission of the instructor for master’s stud...,4
61,CSCI-GA.3870,Permission of Director of Graduate Studies.,1-3
31,CSCI-GA.2585,"Familiarity with basics in linear algebra, pro...",3


## 3. Put together both DataFrames

In [17]:
# Attach catalog details (prerequisites, points) to every scheduled class.
# The left join keeps every schedule row, including those with no catalog match.
merged = pd.merge(schedule, catalog, on='Number', how='left')
merged = merged.drop(columns='Section')
merged.head()

Unnamed: 0,Number,Name,Instructor,Time,Prereqs,Points
0,CSCI-GA.1144,PAC II,Mohamed Zahran,T 6:00-8:30PM,CSCI-GA 1133 or departmental permission.,4
1,CSCI-GA.1144,PAC II Recitation,Gurkirat Singh Bajwa,R 7:10-8:00PM,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:00PM,At least one year of experience with a high-le...,3
3,CSCI-GA.1170,Fundamental Algorithms Recitation,Alex Bienstock ...,R 8:10-9:00PM,At least one year of experience with a high-le...,3
4,CSCI-GA.1170,Fundamental Algorithms Recitation,Harish Karthikeyan,R 8:10-9:00PM,At least one year of experience with a high-le...,3
