In [1]:
from collections import defaultdict
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests
import re

BASE_URL = 'http://www.columbia.edu'

In [7]:
def subjUrl(letter):
    """Return the absolute URL of the subject-index page for `letter`.

    The page path is built by substituting `letter` into the
    'subj-{}.html' template and prefixing it with BASE_URL.
    """
    indexPath = '/cu/bulletin/uwb/sel/subj-{}.html'.format(letter)
    return BASE_URL + indexPath

def getSubjects(letter, timeout=30):
    """Download the subject-index page for `letter` and return its subject rows.

    Parameters:
        letter: value substituted into the index URL by subjUrl().
        timeout: seconds passed to requests.get(); without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list of <tr> elements from the first <table> on the page, with
        the first three and last two rows removed.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.exceptions.Timeout: if the server does not answer in time.
        AttributeError: if the page contains no <table> at all.
    """
    r = requests.get(subjUrl(letter), timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # soup.table is the first <table> element in the document.
    allRows = soup.table.find_all('tr')
    # Presumably the skipped rows are page header/footer rows rather than
    # subjects -- TODO confirm against the live page layout.
    return allRows[3:-2]

In [12]:
# Only links whose href mentions this term are collected.
TERM = 'Fall2019'

# Walk the A-Z subject-index pages and collect the per-subject course-list
# paths for TERM.
paths = []
for letter in map(chr, range(ord('A'), ord('Z') + 1)):
    for row in getSubjects(letter):
        # href=True restricts the search to anchors that actually carry an
        # href attribute, so a['href'] below cannot raise KeyError.
        for a in row.find_all('a', href=True):
            href = a['href']
            if 'subj/AU' in href: # auditing
                # Stop scanning this row entirely; remaining anchors in the
                # row are skipped as well.
                break
            if '__' in href: # seemingly invalid subjects
                continue
            if TERM not in href:
                continue
            paths.append(href)

In [14]:
# Captures the four-character subject code that follows 'subj/' in a path.
# Written as a raw string: in a plain string literal '\/' and '\w' are
# invalid escape sequences, which newer Python versions warn about.
subjPattern = re.compile(r'subj/(\w{4})')

def getSubjectFromPath(p):
    """Return the subject code embedded in path `p`, or None if there is none.

    The code is the first run of four word characters (letters, digits or
    underscore) that directly follows 'subj/'. When the path contains
    several such segments, the first one wins.
    """
    match = subjPattern.search(p)
    if match is None:
        return None
    return match.group(1)

In [15]:
def parseForSection(contents):
    """Turn the child nodes of one section row into a dict.

    Parameters:
        contents: child nodes of a row. contents[0] must contain an <a>
            whose text holds the section number; contents[2] holds the
            details as a sequence of <b>Label:</b> nodes, each followed
            by the node carrying that label's value.

    Returns:
        A dict with key 'number' (the link text with 'Section ' removed)
        plus one key per <b> label (colons removed, whitespace stripped).
        Values that parse as integers are stored as int, everything else
        as a stripped string.
    """
    section = {
         'number': contents[0].a.string.replace('Section ', ''),
    }
    header = None
    for component in contents[2].contents:
        isTag = isinstance(component, Tag)
        isLabel = isTag and component.name == 'b'
        if header is not None:
            if isLabel:
                # Two labels back to back: the pending label has no value.
                raw = ''
            elif isTag:
                # A Tag has no usable .strip(); take its text content.
                raw = component.get_text()
            else:
                raw = component
            text = raw.strip()
            try:
                text = int(text)
            except ValueError:
                pass # This is expected since not all values are integers (e.g. instructor names)
            section[header] = text
            header = None
        if isLabel:
            # .string is None when the <b> wraps more than one child node;
            # fall back to the concatenated text in that case.
            label = component.string
            if label is None:
                label = component.get_text()
            header = label.replace(':', '').strip()
    return section

def getCoursesFromPath(p, timeout=30):
    """Download one subject's course-list page and return its sections.

    Parameters:
        p: site-relative path of the subject page; appended to BASE_URL.
        timeout: seconds passed to requests.get(); without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list of section dicts as produced by parseForSection(), each
        extended with 'course' (the title from the most recent heading
        row, or None if no heading row came first) and 'subject' (the
        code extracted from `p` by getSubjectFromPath()).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.exceptions.Timeout: if the server does not answer in time.
        AttributeError: if the page contains no <table> at all.
    """
    r = requests.get(BASE_URL + p, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    subject = getSubjectFromPath(p)
    course = None
    courses = []
    # Presumably the first two rows and the last row are page chrome rather
    # than course data -- TODO confirm against the live page layout.
    for tr in soup.table.find_all('tr')[2:-1]:
        cell = tr.td
        if cell is None:
            # Row without any <td>: nothing to parse.
            continue
        if cell.get('colspan') is not None:
            # A cell with a colspan attribute is a course heading row; the
            # title is the last child of its <b> element.
            title = cell.b.contents[-1]
            # Store a plain str: a NavigableString keeps a reference to the
            # whole parse tree, which would keep every page alive in memory.
            course = title.get_text() if isinstance(title, Tag) else str(title)
            continue
        section = parseForSection(tr.contents)
        section['course'] = course
        section['subject'] = subject
        courses.append(section)
    return courses

In [18]:
# Scrape every collected subject page, accumulating all sections into one
# flat list and reporting progress per subject.
courses = []
for subjectPath in paths:
    courses.extend(getCoursesFromPath(subjectPath))
    subjectCode = getSubjectFromPath(subjectPath)
    print(f'Scanned courses for subject {subjectCode}.')

Scanned courses for subject ACCT.
Scanned courses for subject ACTU.
Scanned courses for subject ADDN.
Scanned courses for subject AFCV.
Scanned courses for subject AFAS.
Scanned courses for subject AFRS.
Scanned courses for subject AFEN.
Scanned courses for subject AKAD.
Scanned courses for subject AMST.
Scanned courses for subject ANAT.
Scanned courses for subject ANCS.
Scanned courses for subject ANES.
Scanned courses for subject ANTH.
Scanned courses for subject ANHS.
Scanned courses for subject ANME.
Scanned courses for subject APAN.
Scanned courses for subject APCH.
Scanned courses for subject APMA.
Scanned courses for subject APPH.
Scanned courses for subject APAM.
Scanned courses for subject APBM.
Scanned courses for subject ARAM.
Scanned courses for subject ARCH.
Scanned courses for subject AHIS.
Scanned courses for subject ASCE.
Scanned courses for subject ASCM.
Scanned courses for subject AHUM.
Scanned courses for subject AHMM.
Scanned courses for subject ASST.
Scanned course

Scanned courses for subject SPAN.
Scanned courses for subject SPJS.
Scanned courses for subject SPRT.
Scanned courses for subject STAT.
Scanned courses for subject STOM.
Scanned courses for subject SURG.
Scanned courses for subject SUMA.
Scanned courses for subject SUSC.
Scanned courses for subject SDEV.
Scanned courses for subject SWHL.
Scanned courses for subject TMGT.
Scanned courses for subject THTR.
Scanned courses for subject THEA.
Scanned courses for subject TIBT.
Scanned courses for subject UKRN.
Scanned courses for subject URBS.
Scanned courses for subject UROL.
Scanned courses for subject VIET.
Scanned courses for subject VIAR.
Scanned courses for subject WLOF.
Scanned courses for subject WMST.
Scanned courses for subject YIDD.


In [19]:
# Count how many sections meet in each location. Sections whose location
# is one of the placeholder values below (or missing altogether, recorded
# as 'undefined') are set aside in `ignored`, grouped by placeholder.
locations = defaultdict(int)
ignored = {
    'ONLINE ONLY': [],
    'To be announced': [],
    'undefined': [],
    'OTHR OTHER': [],
}

for s in courses:
    loc = s.get('Location', 'undefined')
    if loc in ignored:
        ignored[loc].append(s)
    else:
        locations[loc] += 1

# Flatten the per-placeholder lists in a single pass; sum(lists, []) would
# copy the growing result once per list.
ignoredCourses = [section for group in ignored.values() for section in group]
print(f'Found {len(locations)} locations, but forced to ignore {len(ignoredCourses)}.')

Found 501 locations, but forced to ignore 3784.


In [20]:
# Rank locations by number of sections, busiest first; ties keep their
# original relative order.
sortedLocs = list(locations.items())
sortedLocs.sort(key=lambda pair: pair[1], reverse=True)
sortedLocs

[('5TH FLR Pupin Laboratories', 32),
 ('320 Havemeyer Hall', 31),
 ('405 International Affairs Building', 28),
 ('404 International Affairs Building', 26),
 ('302 Fayerweather', 25),
 ('317 Hamilton Hall', 23),
 ('903 School of Social Work', 23),
 ('309 Havemeyer Hall', 23),
 ('467 EXT Schermerhorn Hall [SCH]', 22),
 ('207 Mathematics Building', 22),
 ('409 International Affairs Building', 22),
 ('311 Fayerweather', 22),
 ('312 Mathematics Building', 21),
 ('407 International Affairs Building', 21),
 ('407 Mathematics Building', 20),
 ('963 EXT Schermerhorn Hall [SCH]', 20),
 ('716 Philosophy Hall', 20),
 ('627 Seeley W. Mudd Building', 20),
 ('633 Seeley W. Mudd Building', 20),
 ('1102 International Affairs Building', 20),
 ('402 Chandler', 20),
 ('801 International Affairs Building', 20),
 ('508 Dodge Hall', 20),
 ('711 International Affairs Building', 20),
 ('301M Fayerweather', 19),
 ('420 Pupin Laboratories', 19),
 ('402B International Affairs Building', 19),
 ('507 Dodge Hall', 1

Why are there so many ignored courses? Let's start with the sections whose location is "To be announced" and count them per course.

In [21]:
# Per-course tally of the sections whose location is 'To be announced',
# listed from most to fewest sections.
# NOTE: this rebinds `ignoredCourses`, which an earlier cell used for a list.
ignoredCourses = defaultdict(int)
for section in ignored['To be announced']:
    courseTitle = section['course']
    ignoredCourses[courseTitle] += 1
sortedCourses = list(ignoredCourses.items())
sortedCourses.sort(key=lambda pair: pair[1], reverse=True)
sortedCourses

[('PHYSICAL EDUCATION ACTIVITIES', 59),
 ('ORGANIC CHEMISTRY - REC', 23),
 ('GENERAL PHYSICS I - REC', 17),
 ('INTRO BIO I: BIOCHEM,GEN,MOLEC', 14),
 ('INTERNATIONAL POLITICS-DISC', 14),
 ('INTRO TO MECH & THERMO - REC', 12),
 ('GENERAL CHEMISTRY I - REC', 9),
 ('GLOBAL URBANISM-DISCUSSION', 8),
 ('GRADUATE LECTURE', 7),
 ('CONTRACTS', 7),
 ('FOUNDATIONS OF SW PRACTICE', 7),
 ('INTRO-E ASIAN CIV:CHINA-DISC', 6),
 ('INTRO EAST ASIAN CIV:JAPAN-DIS', 6),
 ('INTRO EAST ASIAN CIV:TIBET-DIS', 6),
 ('2ND TERM GEN CHEM(INTENS)-REC', 6),
 ('CIVIL PROCEDURE', 6),
 ('SEM-NEGOTIATION WORKSHOP', 6),
 ('PHYSICS I:MECHANICS/RELATIVITY-REC', 6),
 ('THE SOCIAL WORLD - DISC', 6),
 ('Applied Value Investing', 5),
 ('GRADUATE SEMINAR', 5),
 ('S DEALS WORKSHOP', 5),
 ('THEORY AND CULTURE-DISC', 5),
 ('INTRO TO STATISTICS', 5),
 ('INTRO EAST ASIAN CIV-VIETNAM DISC', 4),
 ('INTRO TO ISLAMIC CIV-REC', 4),
 ('Pivot_Professional Development', 4),
 ('LEGAL METHODS', 4),
 ('LAWYER LEADERSHIP', 4),
 ('HUMAN BEHAV/

Physical education topping the list is no surprise, but the rest of the per-course distribution doesn't tell us much. Let's group by subject instead.

In [22]:
# Per-subject tally of the sections whose location is 'To be announced',
# listed from most to fewest sections.
ignoredSubjects = defaultdict(int)
for section in ignored['To be announced']:
    subjectCode = section['subject']
    ignoredSubjects[subjectCode] += 1
sortedSubjects = list(ignoredSubjects.items())
sortedSubjects.sort(key=lambda pair: pair[1], reverse=True)
sortedSubjects

[('LAW_', 194),
 ('PHED', 59),
 ('SOCW', 50),
 ('HPMN', 42),
 ('PHYS', 42),
 ('CHEM', 40),
 ('FINC', 35),
 ('BIST', 25),
 ('EPID', 24),
 ('SOCI', 23),
 ('ASCE', 22),
 ('HIST', 19),
 ('POLS', 17),
 ('SOSC', 17),
 ('MRKT', 16),
 ('MDES', 16),
 ('BIOL', 15),
 ('ENGL', 15),
 ('THEA', 14),
 ('ECON', 13),
 ('EHSC', 13),
 ('PSCA', 12),
 ('MGMT', 11),
 ('NURS', 11),
 ('POPF', 11),
 ('STAT', 10),
 ('EESC', 9),
 ('FILM', 9),
 ('GENE', 9),
 ('DROM', 8),
 ('ENGI', 8),
 ('APAN', 7),
 ('BUSI', 7),
 ('BUEC', 7),
 ('IEOR', 6),
 ('PHIL', 6),
 ('ASTR', 5),
 ('JPNS', 5),
 ('JOUR', 5),
 ('PLAN', 5),
 ('ACCT', 4),
 ('ARCH', 4),
 ('ASCM', 4),
 ('CHEN', 4),
 ('CHNS', 4),
 ('ELEN', 4),
 ('NBHV', 4),
 ('SOAR', 4),
 ('SPAN', 4),
 ('BINF', 3),
 ('CANT', 3),
 ('CMBS', 3),
 ('EEEB', 3),
 ('FREN', 3),
 ('GEND', 3),
 ('IRSH', 3),
 ('MICR', 3),
 ('PATH', 3),
 ('QMSS', 3),
 ('ASST', 2),
 ('BMEN', 2),
 ('CLCV', 2),
 ('COMS', 2),
 ('COCI', 2),
 ('DNCE', 2),
 ('EAEE', 2),
 ('ECBM', 2),
 ('EECS', 2),
 ('FILI', 2),
 ('HIFO

In [23]:
# Per-subject tally of the sections that have no 'Location' entry at all
# (filed under 'undefined'), listed from most to fewest sections.
ignoredSubjects = defaultdict(int)
for section in ignored['undefined']:
    subjectCode = section['subject']
    ignoredSubjects[subjectCode] += 1
sortedSubjects = list(ignoredSubjects.items())
sortedSubjects.sort(key=lambda pair: pair[1], reverse=True)
sortedSubjects

[('LAW_', 179),
 ('SOCW', 162),
 ('BMEN', 150),
 ('ELEN', 129),
 ('NURS', 105),
 ('IEOR', 99),
 ('ANTH', 94),
 ('AHIS', 94),
 ('MECE', 83),
 ('PHIL', 73),
 ('CHEN', 65),
 ('CIEN', 59),
 ('MUSI', 54),
 ('PUBH', 53),
 ('RELI', 53),
 ('POLS', 52),
 ('PSYC', 51),
 ('ARCH', 42),
 ('BINF', 42),
 ('APAM', 37),
 ('ORTH', 36),
 ('CHEM', 31),
 ('INTC', 31),
 ('PHED', 30),
 ('PHYT', 29),
 ('OCCT', 28),
 ('MGMT', 26),
 ('ECON', 24),
 ('VIAR', 24),
 ('DNSC', 23),
 ('BUSI', 21),
 ('ANCS', 20),
 ('ENDO', 20),
 ('PROS', 20),
 ('COMS', 19),
 ('HIST', 18),
 ('PDNT', 18),
 ('DROM', 15),
 ('FINC', 15),
 ('MRKT', 15),
 ('EAEE', 14),
 ('MEDI', 14),
 ('ADDN', 13),
 ('BIOL', 13),
 ('BUEC', 13),
 ('NMED', 13),
 ('EEEB', 12),
 ('ENGI', 12),
 ('JOUR', 12),
 ('SPAN', 12),
 ('CLPH', 11),
 ('FILM', 11),
 ('GREK', 11),
 ('EPID', 10),
 ('IMPL', 10),
 ('PATH', 10),
 ('RESI', 10),
 ('SOCI', 10),
 ('ACCT', 9),
 ('AFAS', 8),
 ('APPH', 8),
 ('COMM', 8),
 ('ENGL', 8),
 ('LATN', 8),
 ('NEUR', 8),
 ('PSCA', 8),
 ('BIET', 7),