In [1]:
from collections import defaultdict
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests
import re

# Scheme and host that every scraped path in this notebook is appended to.
BASE_URL = 'http://www.columbia.edu'

In [2]:
def subjUrl(letter):
    """Return the absolute URL of the subject-index page for ``letter``.

    ``letter`` is substituted into the page name (``subj-<letter>.html``)
    and the resulting path is appended to ``BASE_URL``.
    """
    template = '/cu/bulletin/uwb/sel/subj-{}.html'
    return BASE_URL + template.format(letter)

def getSubjects(letter, timeout=30):
    """Download the subject-index page for ``letter`` and return its table rows.

    Args:
        letter: Letter identifying the index page (passed to ``subjUrl``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The ``<tr>`` elements of the first table on the page, minus the
        first three and the last two rows.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    r = requests.get(subjUrl(letter), timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # NOTE(review): the slice presumably trims header/footer rows of the
    # index table -- confirm against the live page layout.
    return soup.table.find_all('tr')[3:-2]

In [3]:
# Walk the A-Z subject-index pages and collect the Fall 2019 listing path
# of every subject. `paths` is rebuilt from scratch, so the cell can be re-run.
paths = []
for c in range(ord('A'), ord('Z') + 1):
    rows = getSubjects(chr(c))
    for row in rows:
        # href=True skips anchors that carry no href attribute; indexing
        # a['href'] on such an anchor would raise KeyError.
        anchors = row.find_all('a', href=True)
        for a in anchors:
            href = a['href']
            if 'subj/AU' in href:  # auditing: drop the rest of this row's links
                break
            if '__' in href:  # seemingly invalid subjects
                continue
            if 'Fall2019' not in href:  # keep only the Fall 2019 term
                continue
            paths.append(href)

In [4]:
# Captures the four word-characters that follow "subj/" in a listing path.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
subjPattern = re.compile(r'subj/(\w{4})')

def getSubjectFromPath(p):
    """Return the four-character subject code embedded in path ``p``.

    The code is whatever ``subjPattern`` captures after the first
    ``subj/`` occurrence. Returns ``None`` when the path contains no
    such segment.
    """
    match = subjPattern.search(p)
    if match is None:
        return None
    return match.group(1)

In [5]:
def parseForSection(contents):
    """Build a dict describing one section from the children of a table row.

    ``contents[0]`` must expose ``.a.string``; its text, with the
    ``'Section '`` prefix removed, is stored under ``'number'``.
    ``contents[2].contents`` is scanned in order: each ``<b>`` tag supplies
    a key (its text without ``':'``, stripped) and the node immediately
    after it supplies the value (stripped, converted to ``int`` when the
    text is a valid integer).
    """
    nodes = contents[2].contents
    section = {'number': contents[0].a.string.replace('Section ', '')}
    pendingKey = None
    for node in nodes:
        if pendingKey is not None:
            # This node is the value belonging to the <b> label just seen.
            raw = node.strip()
            try:
                section[pendingKey] = int(raw)
            except ValueError:
                # Expected: many values are not integers (e.g. instructor names).
                section[pendingKey] = raw
            pendingKey = None
        if isinstance(node, Tag) and node.name == 'b':
            pendingKey = node.string.replace(':', '').strip()
    return section

def getCoursesFromPath(p, timeout=30):
    """Download one subject listing page and return its sections as dicts.

    Args:
        p: Site-relative path of the listing page; appended to ``BASE_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        A list of section dicts as produced by ``parseForSection``, each
        extended with ``'course'`` (the most recent course heading seen
        above the row, or ``None`` if there was none) and ``'subject'``
        (the code extracted from ``p`` by ``getSubjectFromPath``).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    r = requests.get(BASE_URL + p, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    subject = getSubjectFromPath(p)
    course = None
    courses = []
    # The first two rows and the last row of the table are skipped.
    for tr in soup.table.find_all('tr')[2:-1]:
        if tr.td.get('colspan') is not None:
            # A first cell with a colspan is treated as a course heading;
            # the heading is the last child of its <b> element.
            course = tr.td.b.contents[-1]
            if not isinstance(course, Tag):
                # A NavigableString keeps a reference to the whole parse
                # tree; store a plain str so the page can be freed.
                course = str(course)
            continue
        section = parseForSection(tr.contents)
        section['course'] = course
        section['subject'] = subject
        courses.append(section)
    return courses

In [22]:
# Sections whose subject code does not begin with 'A'.
# NOTE(review): relies on `courses`, which is filled by the scanning loop cell.
[sec for sec in courses if not sec['subject'].startswith('A')]

[{'number': '083',
  'Call Number': 14995,
  'Points': 0,
  'Enrollment': '99 students as of August 31, 2019',
  'Instructor': 'Julie A Connolly',
  'course': 'ETHICS',
  'subject': 'BHSC'},
 {'number': '083',
  'Call Number': 14996,
  'Points': 0,
  'Enrollment': '98 students as of August 31, 2019',
  'Instructor': 'Aaron G Myers',
  'course': 'SPECIAL NEEDS ROTATION PEDIATRICS',
  'subject': 'BHSC'},
 {'number': '084',
  'Call Number': 14968,
  'Points': 0,
  'Enrollment': '91 students as of August 31, 2019',
  'Instructor': 'Gregory N Bunza',
  'course': 'SPECIAL NEEDS ROTATION',
  'subject': 'BHSC'},
 {'number': '001',
  'Call Number': 55365,
  'Points': 4,
  'Day/Time': 'TR 4:10pm-6:00pm',
  'Location': '255 International Affairs Building',
  'Enrollment': '8 students (20 max) as of August 31, 2019',
  'Instructor': 'Dwijen Bhattacharjya',
  'course': 'ELEMENTARY BENGALI I',
  'subject': 'BENG'},
 {'number': '001',
  'Call Number': 55400,
  'Points': 4,
  'Day/Time': 'TR 6:10pm-8:

In [6]:
# Scrape every collected subject path, accumulating all sections in `courses`.
# One progress line is printed per subject.
courses = []
for p in paths:
    courses.extend(getCoursesFromPath(p))
    print(f'Scanned courses for subject {getSubjectFromPath(p)}.')

Scanned courses for subject ACCT.
Scanned courses for subject ACTU.
Scanned courses for subject ADDN.
Scanned courses for subject AFCV.
Scanned courses for subject AFAS.
Scanned courses for subject AFRS.
Scanned courses for subject AFEN.
Scanned courses for subject AKAD.
Scanned courses for subject AMST.
Scanned courses for subject ANAT.
Scanned courses for subject ANCS.
Scanned courses for subject ANES.
Scanned courses for subject ANTH.
Scanned courses for subject ANHS.
Scanned courses for subject ANME.
Scanned courses for subject APAN.
Scanned courses for subject APCH.
Scanned courses for subject APMA.
Scanned courses for subject APPH.
Scanned courses for subject APAM.
Scanned courses for subject APBM.
Scanned courses for subject ARAM.
Scanned courses for subject ARCH.
Scanned courses for subject AHIS.
Scanned courses for subject ASCE.
Scanned courses for subject ASCM.
Scanned courses for subject AHUM.
Scanned courses for subject AHMM.
Scanned courses for subject ASST.
Scanned course

Scanned courses for subject SPAN.
Scanned courses for subject SPJS.
Scanned courses for subject SPRT.
Scanned courses for subject STAT.
Scanned courses for subject STOM.
Scanned courses for subject SURG.
Scanned courses for subject SUMA.
Scanned courses for subject SUSC.
Scanned courses for subject SDEV.
Scanned courses for subject SWHL.
Scanned courses for subject TMGT.
Scanned courses for subject THTR.
Scanned courses for subject THEA.
Scanned courses for subject TIBT.
Scanned courses for subject UKRN.
Scanned courses for subject URBS.
Scanned courses for subject UROL.
Scanned courses for subject VIET.
Scanned courses for subject VIAR.
Scanned courses for subject WLOF.
Scanned courses for subject WMST.
Scanned courses for subject YIDD.


Faculty

In [17]:
# Tally how many sections each instructor teaches. Sections without an
# 'Instructor' entry are counted under the key None.
instructors = defaultdict(int)
for section in courses:
    # NOTE(review): substring test -- this also skips any title that merely
    # contains "REC" (e.g. "DIRECTED ..."); confirm that is intended.
    if 'REC' not in section['course']:
        instructors[section.get('Instructor')] += 1

# Highest section count first; ties keep their first-seen order.
sortedInstructors = sorted(
    instructors.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sortedInstructors

[(None, 1370),
 ('Faculty', 27),
 ('Giuseppina C Cambareri', 23),
 ('Sahng Gyoon Kim', 20),
 ('Nurit Bittner', 18),
 ('Jing Chen', 17),
 ('Sunil Wadhwa', 15),
 ('Jan Hammerquist', 14),
 ('David G Vallancourt', 13),
 ('Hyoseon Lee', 13),
 ('Sinisa Vukelic', 13),
 ('Guanhao Sun', 12),
 ('Kenneth A Lawson', 12),
 ('Philip Kang', 11),
 ('Katherine T Fox-Glassman', 11),
 ('Gerard H Ateshian', 10),
 ('Kenneth Shepard', 10),
 ('Matthew E Buckingham', 10),
 ('Kevin Fellezs', 9),
 ('Helen Lu', 9),
 ('Zoran Kostic', 9),
 ('Gregory M Eirich', 9),
 ('Michael Como', 9),
 ('Roseanna Graham', 8),
 ('Aaron M Kyle', 8),
 ('Edward F Leonard', 8),
 ('Katherine Reuther', 8),
 ('Anna Ghurbanyan', 8),
 ('Richard W Longman', 8),
 ('Xiaodong Wang', 8),
 ('Henry Lam', 8),
 ('Robert Gooding-Williams', 7),
 ('Marcus Folch', 7),
 ('Brian Boyd', 7),
 ('Lance Kam', 7),
 ('Elisa Konofagou', 7),
 ('Medini Singh', 7),
 ('Christopher J Durning', 7),
 ('Ioannis Kymissis', 7),
 ('Christine P Hendon', 7),
 ('Predrag R Jel

In [18]:
# List every section whose instructor is 'Helen Lu'.
[sec for sec in courses if sec.get('Instructor') == 'Helen Lu']

[{'number': '017',
  'Call Number': 43326,
  'Points': '1-3',
  'Enrollment': '0 students (50 max) as of August 31, 2019',
  'Instructor': 'Helen Lu',
  'course': 'PROJECTS IN BIOMEDICAL ENGIN',
  'subject': 'BMEN'},
 {'number': '001',
  'Call Number': 43420,
  'Points': '1-2',
  'Enrollment': '0 students (45 max) as of August 31, 2019',
  'Instructor': 'Helen Lu',
  'course': 'UNDERGRADUATE FIELDWORK',
  'subject': 'BMEN'},
 {'number': '001',
  'Call Number': 43412,
  'Points': '1-2',
  'Enrollment': '1 student (45 max) as of August 31, 2019',
  'Instructor': 'Helen Lu',
  'course': 'FIELDWORK',
  'subject': 'BMEN'},
 {'number': '001',
  'Call Number': 43415,
  'Points': 3,
  'Day/Time': 'T 10:10am-12:40pm',
  'Location': '343 Seeley W. Mudd Building',
  'Enrollment': '25 students (25 max) as of August 31, 2019 / Full',
  'Instructor': 'Helen Lu',
  'course': 'ADV SCAFFOLD DES/COMPLX TISSUE',
  'subject': 'BMEN'},
 {'number': '017',
  'Call Number': 43437,
  'Points': '1-6',
  'Enroll

In [7]:
# Count sections per location. Sections with no 'Location' entry (recorded as
# 'undefined') or with one of the placeholder values below are set aside in
# `ignored` instead of being counted.
locations = defaultdict(int)
ignored = {
    placeholder: []
    for placeholder in ('ONLINE ONLY', 'To be announced', 'undefined', 'OTHR OTHER')
}

for section in courses:
    loc = section.get('Location', 'undefined')
    bucket = ignored.get(loc)
    if bucket is None:
        locations[loc] += 1
    else:
        bucket.append(section)

# Flatten the per-placeholder lists into one list, preserving their order.
ignoredCourses = [section for group in ignored.values() for section in group]
print(f'Found {len(locations)} locations, but forced to ignore {len(ignoredCourses)}.')

Found 501 locations, but forced to ignore 3784.


In [8]:
# Rank locations from most to least sections; ties keep their first-seen order.
sortedLocs = sorted(
    locations.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sortedLocs

[('5TH FLR Pupin Laboratories', 32),
 ('320 Havemeyer Hall', 31),
 ('405 International Affairs Building', 28),
 ('404 International Affairs Building', 26),
 ('302 Fayerweather', 25),
 ('317 Hamilton Hall', 23),
 ('903 School of Social Work', 23),
 ('309 Havemeyer Hall', 23),
 ('467 EXT Schermerhorn Hall [SCH]', 22),
 ('207 Mathematics Building', 22),
 ('409 International Affairs Building', 22),
 ('311 Fayerweather', 22),
 ('312 Mathematics Building', 21),
 ('407 International Affairs Building', 21),
 ('407 Mathematics Building', 20),
 ('963 EXT Schermerhorn Hall [SCH]', 20),
 ('716 Philosophy Hall', 20),
 ('627 Seeley W. Mudd Building', 20),
 ('633 Seeley W. Mudd Building', 20),
 ('1102 International Affairs Building', 20),
 ('402 Chandler', 20),
 ('801 International Affairs Building', 20),
 ('508 Dodge Hall', 20),
 ('711 International Affairs Building', 20),
 ('301M Fayerweather', 19),
 ('420 Pupin Laboratories', 19),
 ('402B International Affairs Building', 19),
 ('507 Dodge Hall', 1