In [1]:
from collections import defaultdict
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests
import re

BASE_URL = 'http://www.columbia.edu'

In [2]:
def subjUrl(letter):
    return BASE_URL + '/cu/bulletin/uwb/sel/subj-{}.html'.format(letter)

def getSubjects(letter):
    r = requests.get(subjUrl(letter))
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.table
    rows = table.find_all('tr')[3:-2]
    return rows

In [3]:
paths = []
for c in range(65, 91):
    rows = getSubjects(chr(c))
    for row in rows:
        href = row.a['href']
        if 'subj/AU' in href: # auditing
            break
        if '__' in href: # seemingly invalid subjects
            continue
        paths.append(href)

In [53]:
def parseForSection(contents):   
    components = contents[2].contents 
    section = {
         'number': contents[0].a.string.replace('Section ', ''),
    }
    header = None
    for component in components:
        if header is not None:
            text = component.strip()
            try:
                text = int(text)
            except ValueError:
                pass # This is expected since not all values are integers (e.g. instructor names)
            section[header] = text
            header = None
        if isinstance(component, Tag) and component.name == 'b':
            header = component.string.replace(':', '').strip()
    return section

def getCoursesFromPath(p):
    r = requests.get(BASE_URL + p)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    course = None
    courses = []
    for tr in soup.table.find_all('tr')[2:-1]:
        if tr.td.get('colspan') is not None:        
            course = tr.td.b.contents[-1]
            continue
        section = parseForSection(tr.contents)
        section['course'] = course
        courses.append(section)
    return courses

In [54]:
getCoursesFromPath(paths[0])

[{'number': '200',
  'Call Number': 19291,
  'Points': 3,
  'Enrollment': '41 students as of March  2, 2019',
  'Instructors': 'Amir Ziv and Felicia C Goodman',
  'course': 'Accounting I: Financial Accoun'},
 {'number': '001',
  'Call Number': 62180,
  'Points': 3,
  'Day/Time': 'MW 2:15pm-3:45pm',
  'Location': '141 Uris Hall',
  'Enrollment': '62 students as of March  2, 2019',
  'Instructor': 'Urooj Khan',
  'course': 'Accounting I: Financial Accoun'},
 {'number': '002',
  'Call Number': 72707,
  'Points': 3,
  'Day/Time': 'MW 10:45am-12:15pm',
  'Location': '141 Uris Hall',
  'Enrollment': '63 students as of March  2, 2019',
  'course': 'Accounting I: Financial Accoun'},
 {'number': '003',
  'Call Number': 63796,
  'Points': 3,
  'Day/Time': 'MW 9:00am-10:30am',
  'Location': '141 Uris Hall',
  'Enrollment': '67 students as of March  2, 2019',
  'course': 'Accounting I: Financial Accoun'},
 {'number': '001',
  'Call Number': 11109,
  'Points': 3,
  'Enrollment': '14 students as of 

In [22]:
subjPattern = re.compile('subj\/(\w{4})')
def getSubjectFromPath(p):
    matches = subjPattern.findall(p)
    if len(matches) == 0:
        return None
    return matches[0]    

In [27]:
courses = {}
for p in paths:    
    courses = {**courses, **getCoursesFromPath(p)}
    print(f'Scanned courses for subject {getSubjectFromPath(p)}.')

Scanned courses for subject ACCT.
Scanned courses for subject ACTU.
Scanned courses for subject ADDN.
Scanned courses for subject AFCV.
Scanned courses for subject AFAS.
Scanned courses for subject AFRS.
Scanned courses for subject AKAD.
Scanned courses for subject AMST.
Scanned courses for subject ANCS.
Scanned courses for subject ANES.
Scanned courses for subject ANTH.
Scanned courses for subject APAN.
Scanned courses for subject APMA.
Scanned courses for subject APPH.
Scanned courses for subject APAM.
Scanned courses for subject ARCH.
Scanned courses for subject AHIS.
Scanned courses for subject ASCE.
Scanned courses for subject ASCM.
Scanned courses for subject AHUM.
Scanned courses for subject AHMM.
Scanned courses for subject ASST.
Scanned courses for subject ASTR.
Scanned courses for subject ASPH.
Scanned courses for subject ACLS.
Scanned courses for subject BHSC.
Scanned courses for subject BENG.
Scanned courses for subject BERL.
Scanned courses for subject BCHM.
Scanned course

Scanned courses for subject SWHL.
Scanned courses for subject TMGT.
Scanned courses for subject THTR.
Scanned courses for subject THEA.
Scanned courses for subject TIBT.
Scanned courses for subject UKRN.
Scanned courses for subject URBS.
Scanned courses for subject UROL.
Scanned courses for subject UTCE.
Scanned courses for subject VIET.
Scanned courses for subject WLOF.
Scanned courses for subject WMST.
Scanned courses for subject YIDD.


In [51]:
locations = defaultdict(int)
ignored = {
    'ONLINE ONLY': [],
    'To be announced': [],
    'OTHR OTHER': [],
}

for sections in courses.values():
    for s in sections:
        if 'Location' in s:
            loc = s['Location']
            if loc in ignored:
                ignored[loc].append(s)
            else:
                locations[s['Location']] += 1
                
ignoredCourses = sum(ignored.values(), []) # sums values into an initially empty list
print(f'Found {len(locations)} locations, but forced to ignore {len(ignoredCourses)}.')

Found 518 locations, but forced to ignore 769.


In [43]:
sortedLocs = sorted(locations.items(), key=lambda kv: kv[1], reverse=True)
sortedLocs

[('5TH FLR Pupin Laboratories', 36),
 ('317 Hamilton Hall', 26),
 ('311 Fayerweather', 25),
 ('467 EXT Schermerhorn Hall [SCH]', 24),
 ('302 Fayerweather', 24),
 ('402B International Affairs Building', 24),
 ('801 International Affairs Building', 24),
 ('401 Hamilton Hall', 23),
 ('501 International Affairs Building', 23),
 ('320 Havemeyer Hall', 23),
 ('405A International Affairs Building', 23),
 ('1127 Seeley W. Mudd Building', 22),
 ('402 International Affairs Building', 22),
 ('207 Mathematics Building', 21),
 ('313 Fayerweather', 21),
 ('304 Hamilton Hall', 21),
 ('501A International Affairs Building', 21),
 ('301M Fayerweather', 21),
 ('303 Seeley W. Mudd Building', 21),
 ('823 International Affairs Building', 21),
 ('200 S Fayerweather', 20),
 ('501 Schermerhorn Hall [SCH]', 20),
 ('545 Seeley W. Mudd Building', 20),
 ('209 Havemeyer Hall', 20),
 ('407 International Affairs Building', 20),
 ('301 School of Social Work', 20),
 ('304 School of Social Work', 20),
 ('627 Seeley W. M

Why are there so many ignored courses?

In [50]:
ignored['To be announced']

[{'number': '001',
  'Call Number': 87280,
  'Points': 3,
  'Day/Time': 'MW 11:40am-12:55pm',
  'Location': 'To be announced',
  'Enrollment': '45 students (65 max) as of March  2, 2019',
  'Notes': 'PRIORITY TO ACTU STUDENTS ONLY; OPEN TO UNIVERSITY 1/16/19',
  'Instructors': 'Abraham Weishaus and Lina Xu'},
 {'number': '001',
  'Call Number': 64618,
  'Points': 3,
  'Day/Time': 'M 11:00am-1:45pm',
  'Location': 'To be announced',
  'Enrollment': '0 students (12 max) as of March  2, 2019',
  'Notes': 'COURSE MEETS @ NYU. CONTACT RT2655@COLUMBIA.EDU FOR INFO'},
 {'number': '002',
  'Call Number': 23153,
  'Points': 0,
  'Day/Time': 'R 12:10pm-1:00pm',
  'Location': 'To be announced',
  'Enrollment': '14 students (30 max) as of March  2, 2019',
  'Instructor': 'Clare R Casey'},
 {'number': '002',
  'Call Number': 9780,
  'Points': 4,
  'Day/Time': 'MW 10:00am-12:50pm',
  'Location': 'To be announced',
  'Enrollment': '10 students as of March  2, 2019',
  'Notes': 'Attend first class for