In [1]:
from collections import defaultdict
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests
import re

# Site root; the subject-index and course-listing paths used below are appended to it.
BASE_URL = 'http://www.columbia.edu'

In [2]:
def subjUrl(letter):
    """Build the URL of the ``subj-<letter>.html`` listing page under BASE_URL."""
    path = '/cu/bulletin/uwb/sel/subj-{}.html'.format(letter)
    return BASE_URL + path

def getSubjects(letter, timeout=30):
    """Fetch the subject listing page for ``letter`` and return its table rows.

    Args:
        letter: Letter substituted into the page URL by ``subjUrl``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The ``<tr>`` elements of the first table on the page with the first
        three and the last two rows dropped (presumably header/footer rows --
        TODO confirm against the live page). An empty list when the page
        contains no table at all.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(subjUrl(letter), timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.table
    if table is None:
        # No table on the page: nothing to list, rather than an AttributeError.
        return []
    return table.find_all('tr')[3:-2]

In [3]:
# Collect the link target of every usable subject row, one letter page (A-Z) at a time.
paths = []
for c in range(ord('A'), ord('Z') + 1):
    rows = getSubjects(chr(c))
    for row in rows:
        href = row.a['href']
        if 'subj/AU' in href:
            # auditing: skip this row and every row after it on this letter's page
            break
        if '__' not in href:
            # hrefs containing '__' are seemingly invalid subjects; keep only the others
            paths.append(href)

In [16]:
def _intOrText(text):
    """Return ``text`` as an int when it parses as one, otherwise unchanged."""
    try:
        return int(text)
    except ValueError:
        # Expected: not all values are integers (e.g. instructor names).
        return text


def parseForSection(contents):
    """Parse one section row into a dict of its labelled fields.

    Args:
        contents: The children of a section ``<tr>``. ``contents[0]`` must
            hold an ``<a>`` whose text is ``'Section <label>'`` and
            ``contents[2]`` must hold the details cell, in which every
            ``<b>Label:</b>`` is followed by that label's value.

    Returns:
        A dict with ``'number'`` (the section label) plus one entry per
        ``<b>`` label found, keyed by the label text without its colon.
        Values that parse as integers are stored as ``int``; everything else
        is stored as stripped text. A label that is directly followed by
        another label has no value and is left out of the result.
    """
    section = {
        # Non-numeric section labels are kept as text instead of raising.
        'number': _intOrText(
            contents[0].a.get_text().replace('Section ', '').strip()
        ),
    }
    header = None
    for component in contents[2].contents:
        isTag = isinstance(component, Tag)
        isHeader = isTag and component.name == 'b'
        if header is not None and not isHeader:
            # The value may be a bare string or be wrapped in markup such as
            # a link; a Tag has no usable .strip(), so read its text instead.
            raw = component.get_text() if isTag else component
            section[header] = _intOrText(raw.strip())
            header = None
        if isHeader:
            # get_text() also copes with labels that contain nested markup,
            # where .string would be None.
            header = component.get_text().replace(':', '').strip()
    return section

def getCoursesFromPath(p, timeout=30):
    """Fetch the page at ``BASE_URL + p`` and group its sections by course.

    Args:
        p: Site-relative path of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        A ``defaultdict(list)`` mapping each course heading to the list of
        section dicts produced by ``parseForSection``. Section rows that
        appear before any heading row are collected under the key ``None``.
        The mapping is empty when the page contains no table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(BASE_URL + p, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    courses = defaultdict(list)
    table = soup.table
    if table is None:
        return courses
    course = None
    # The first two rows and the last row of the table are skipped.
    for tr in table.find_all('tr')[2:-1]:
        cell = tr.td
        if cell is None:
            # Row without any <td>: neither a heading nor a section.
            continue
        if cell.get('colspan') is not None:
            # A cell carrying a colspan attribute is a course heading row.
            # Copy to a plain str so the dict key does not keep the whole
            # parse tree alive.
            course = str(cell.b.contents[-1])
            continue
        courses[course].append(parseForSection(tr.contents))
    return courses

In [17]:
# Spot-check the parser on the first collected path.
getCoursesFromPath(paths[0])

defaultdict(list,
            {'Accounting I: Financial Accoun': [{'number': 200,
               'Call Number': 19291,
               'Points': 3,
               'Enrollment': '41 students as of February 24, 2019',
               'Instructors': 'Amir Ziv and Felicia C Goodman'},
              {'number': 1,
               'Call Number': 62180,
               'Points': 3,
               'Day/Time': 'MW 2:15pm-3:45pm',
               'Location': '141 Uris Hall',
               'Enrollment': '62 students as of February 24, 2019',
               'Instructor': 'Urooj Khan'},
              {'number': 2,
               'Call Number': 72707,
               'Points': 3,
               'Day/Time': 'MW 10:45am-12:15pm',
               'Location': '141 Uris Hall',
               'Enrollment': '63 students as of February 24, 2019'},
              {'number': 3,
               'Call Number': 63796,
               'Points': 3,
               'Day/Time': 'MW 9:00am-10:30am',
               'Location': '