In [25]:
from collections import defaultdict
from bs4 import BeautifulSoup, Tag
import pandas as pd
import requests
import re
import json
import datetime

# Scheme and host that every bulletin path in this notebook is appended to.
BASE_URL = 'http://www.columbia.edu'

def subjUrl(letter):
    """Return the absolute URL of the subject index page for `letter`.

    `letter` is substituted verbatim into the ``subj-{}.html`` file name;
    no validation or escaping is applied.
    """
    path = '/cu/bulletin/uwb/sel/subj-{}.html'.format(letter)
    return BASE_URL + path

def getSubjects(letter, timeout=30):
    """Download the subject index page for `letter` and return its table rows.

    Parameters:
        letter: value passed to `subjUrl` to build the page URL.
        timeout: seconds to wait for the server, forwarded to `requests.get`.

    Returns:
        The `<tr>` elements of the first `<table>` on the page, excluding the
        first three and the last two rows.

    Raises:
        requests.HTTPError: the response status was 4xx/5xx.
        requests.Timeout: the server did not answer within `timeout`.
    """
    # requests waits indefinitely unless a timeout is given, so a single
    # stalled connection would otherwise hang the whole scrape.
    r = requests.get(subjUrl(letter), timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # NOTE(review): the [3:-2] slice presumably drops header/footer rows of the
    # index table -- confirm against the live page layout.
    return soup.table.find_all('tr')[3:-2]

# Raw string: in a plain literal '\/' and '\w' are invalid escape sequences,
# which Python warns about and intends to reject in a future version. '/' needs
# no escaping in a regular expression, so the pattern matches exactly as before.
subjPattern = re.compile(r'subj/(\w{4})')
def getSubjectFromPath(p):
    """Return the four-character subject code that follows ``subj/`` in `p`.

    Only the first occurrence is considered. Returns None when `p` contains no
    ``subj/`` followed by four word characters.
    """
    match = subjPattern.search(p)
    if match is None:
        return None
    return match.group(1)

def parseForSection(contents):
    """Turn the child nodes of one section row into a dict.

    `contents[0]` must expose ``.a.string`` (the link text, from which the
    'Section ' prefix is removed to form the 'number' entry). `contents[2]`
    must expose ``.contents``; those nodes are scanned in order, and the node
    immediately following each `<b>` tag is stored under that tag's text
    (colons removed, whitespace stripped). Values that parse as integers are
    stored as int, everything else as the stripped string.
    """
    nodes = contents[2].contents
    section = {'number': contents[0].a.string.replace('Section ', '')}
    label = None
    for node in nodes:
        if label is not None:
            # The node right after a bold label is that label's value.
            value = node.strip()
            try:
                section[label] = int(value)
            except ValueError:
                # Expected: many values are not integers (e.g. instructor names).
                section[label] = value
            label = None
        # Checked for every node, including one just consumed as a value.
        if isinstance(node, Tag) and node.name == 'b':
            label = node.string.replace(':', '').strip()
    return section

# One or two capital letters followed by four digits, e.g. 'B5001' or 'UN1123'.
CODE_PATTERN = re.compile(r'[A-Z]{1,2}\d{4}')
def getCoursesFromPath(p, timeout=30):
    """Download the subject page at `BASE_URL + p` and return its sections.

    Parameters:
        p: site-relative path of a subject page.
        timeout: seconds to wait for the server, forwarded to `requests.get`.

    Returns:
        A list with one dict per section row. Each dict holds whatever
        `parseForSection` extracted plus 'course', 'courseNumber', 'subject'
        and 'vergilLink'. 'course' and 'courseNumber' come from the most recent
        heading row and are None for sections that precede any heading.

    Raises:
        requests.HTTPError: the response status was 4xx/5xx.
        requests.Timeout: the server did not answer within `timeout`.
    """
    # Without a timeout requests can block forever on a stalled connection.
    r = requests.get(BASE_URL + p, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    subject = getSubjectFromPath(p)
    course = None
    courseNumber = None
    courses = []
    # NOTE(review): [2:-1] presumably skips header/footer rows -- confirm.
    for tr in soup.table.find_all('tr')[2:-1]:
        if tr.td.get('colspan') is not None:
            # A first cell with a colspan attribute is a course heading row;
            # it applies to every section row until the next heading.
            heading = tr.td.b.contents
            # str() detaches the title from the soup. A NavigableString keeps a
            # reference to the entire parse tree, so storing it would keep
            # every downloaded page alive for as long as the results exist.
            course = str(heading[-1])
            found = CODE_PATTERN.search(heading[0])
            courseNumber = found.group() if found is not None else None
            continue
        section = parseForSection(tr.contents)
        section['course'] = course
        section['courseNumber'] = courseNumber
        section['subject'] = subject
        section['vergilLink'] = tr.find_all('td')[1].a['href']
        courses.append(section)
    return courses

def now(hours=True):
    """Return the current local time as a string.

    The result is ``YYYY-MM-DD``; when `hours` is truthy the time of day is
    appended as ``THH:MM:SS``.
    """
    pattern = '%Y-%m-%d' + ('T%H:%M:%S' if hours else '')
    return datetime.datetime.now().strftime(pattern)

def main(semester, archive=False):
    """Scrape every section listed for `semester` and return them.

    Parameters:
        semester: substring that a subject link must contain to be followed,
            e.g. 'Fall2019'.
        archive: when true, also write the result as JSON to
            ``archive-<semester>/<timestamp>.json`` (the directory is created
            if needed). Defaults to False, which matches the previous effective
            behavior: the write used to sit after the `return` and never ran.

    Returns:
        A list of section dicts as produced by `getCoursesFromPath`.
    """
    paths = []
    for letter in map(chr, range(ord('A'), ord('Z') + 1)):
        for row in getSubjects(letter):
            # find_all is the supported name; findAll is a deprecated alias.
            for a in row.find_all('a'):
                href = a['href']
                if 'subj/AU' in href: # auditing
                    # Abandons the remaining links of this row only.
                    break
                if '__' in href: # seemingly invalid subjects
                    continue
                if semester not in href:
                    continue
                paths.append(href)

    print(f'Following {len(paths)} paths...')

    courses = []
    for p in paths:
        courses.extend(getCoursesFromPath(p))

    print(f'Scanned {len(courses)} courses.')

    if archive:
        # Local import keeps this block self-contained; only needed here.
        import os
        os.makedirs(f'archive-{semester}', exist_ok=True)
        # NOTE: now() yields a timestamp containing ':' characters, which some
        # filesystems (e.g. on Windows) do not accept in file names.
        with open(f'archive-{semester}/{now()}.json', 'w') as f:
            json.dump(courses, f)

    return courses

In [26]:
# Run the scrape for links containing 'Fall2019'. main() requests one index
# page per letter A-Z and then one page per matching subject path.
courses = main('Fall2019')

Following 265 paths...
Scanned 7967 courses.


In [32]:
# Keep only the sections that carry an 'Instructor' entry.
cours = [section for section in courses if section.get('Instructor') is not None]

In [38]:
# NOTE: everything after '#' is a URL fragment, which is not sent to the
# server, so the request is effectively for the site root. Presumably the
# course content is rendered client-side and will not be in this response.
vergilResponse = requests.get(
    'https://vergil.registrar.columbia.edu/#/courses/ACCTB5001_001_2019_3',
    timeout=30,  # requests never times out on its own
)
vergilResponse.raise_for_status()
# Name the parser explicitly (as elsewhere in this notebook) so the result does
# not depend on which parsers are installed and bs4 does not warn.
s = BeautifulSoup(vergilResponse.content, 'lxml')

In [48]:
# True when the fetched document has no <div class="course-section"> element.
s.find('div', class_='course-section') is None

True

In [31]:
# Work in progress: visit the first section seen for each distinct instructor.
# The stored value is a placeholder for now.
facultyUnis = {}

for c in courses:
    instructor = c.get('Instructor')
    # Skip sections with no instructor and instructors already recorded.
    if instructor is None or instructor in facultyUnis:
        continue

    vlink = c['vergilLink']

    print(c)
    facultyUnis[instructor] = 'hi'
    

{'number': '001', 'Call Number': 14677, 'Points': 3, 'Enrollment': '66 students (73 max) as of September  5, 2019', 'Instructor': 'Trevor S Harris', 'course': 'Accounting I: Financial Accoun', 'courseNumber': 'B5001', 'subject': 'ACCT', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/ACCTB5001_001_2019_3'}
{'number': 'XMT', 'Call Number': 18173, 'Points': 0, 'Enrollment': '1 student (5 max) as of September  5, 2019', 'Instructor': 'Jessica Soursourian', 'course': 'Accounting I: Financial Accoun', 'courseNumber': 'B5001', 'subject': 'ACCT', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/ACCTB5001_XMT_2019_3'}
{'number': '300', 'Call Number': 14679, 'Points': '1.5', 'Enrollment': '46 students (75 max) as of September  5, 2019', 'Instructor': 'Tim Baldenius', 'course': 'Financial Planning & Analysis', 'courseNumber': 'B5007', 'subject': 'ACCT', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/ACCTB5007_300_2019_3'}
{'number': '001', 'Call Numbe

{'number': '023', 'Call Number': 10440, 'Points': 3, 'Day/Time': 'TR 1:10pm-2:25pm', 'Location': '404 Dodge Hall', 'Enrollment': '25 students (24 max) as of September  5, 2019 / Full', 'Instructor': 'Joshua Navon', 'course': 'MASTERPIECES OF WESTERN MUSIC', 'courseNumber': 'UN1123', 'subject': 'HUMA', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/HUMAW1123_023_2019_3'}
{'number': '024', 'Call Number': 10441, 'Points': 3, 'Day/Time': 'TR 1:10pm-2:25pm', 'Location': '716 Hamilton Hall', 'Enrollment': '25 students (24 max) as of September  5, 2019 / Full', 'Instructor': 'Sonja G Wermager', 'course': 'MASTERPIECES OF WESTERN MUSIC', 'courseNumber': 'UN1123', 'subject': 'HUMA', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/HUMAW1123_024_2019_3'}
{'number': '025', 'Call Number': 10442, 'Points': 3, 'Day/Time': 'TR 2:40pm-3:55pm', 'Location': '404 Dodge Hall', 'Enrollment': '25 students (24 max) as of September  5, 2019 / Full', 'Instructor': 'Magdalena Bacze

{'number': '002', 'Call Number': 16477, 'Points': '0-9', 'Enrollment': '0 students (10 max) as of September  5, 2019', 'Instructor': 'Teena Brooks', 'course': 'SECOND YEAR FIELD WORK', 'courseNumber': 'T6020', 'subject': 'SOCW', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/SOCWT6020_002_2019_3'}
{'number': '003', 'Call Number': 16478, 'Points': '0-9', 'Enrollment': '0 students (10 max) as of September  5, 2019', 'Instructor': 'Robert E Cortes', 'course': 'SECOND YEAR FIELD WORK', 'courseNumber': 'T6020', 'subject': 'SOCW', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/SOCWT6020_003_2019_3'}
{'number': '004', 'Call Number': 16479, 'Points': '0-9', 'Enrollment': '0 students (10 max) as of September  5, 2019', 'Instructor': 'Amira Crawford', 'course': 'SECOND YEAR FIELD WORK', 'courseNumber': 'T6020', 'subject': 'SOCW', 'vergilLink': 'https://vergil.registrar.columbia.edu/#/courses/SOCWT6020_004_2019_3'}
{'number': '005', 'Call Number': 16480, 'Points': 

In [19]:
# Collect the 'Instructors' value of every section that has one.
instructors = [
    value
    for value in (section.get('Instructors') for section in courses)
    if value is not None
]

In [23]:
# Break each combined instructor string into individual names.
[names.split(' and ') for names in instructors]

[['Anne Katcher', 'Lina Xu'],
 ['Hengyong Mo', 'Lina Xu', 'Yuhong Xue'],
 ['Dariush Akhtari', 'Lina Xu'],
 ['Yubo Wang', 'Lina Xu'],
 ['Abraham Weishaus', 'Lina Xu'],
 ['Abraham Weishaus', 'Lina Xu'],
 ['Abraham Weishaus', 'Lina Xu'],
 ['Yubo Wang', 'Lina Xu'],
 ['Yubo Wang', 'Lina Xu'],
 ['Gary Venter', 'Lina Xu'],
 ['Donald F Mango', 'Lina Xu', 'James Maher'],
 ['Maria C McCormack', 'Lina Xu'],
 ['John N Vitucci', 'Lina Xu'],
 ['Daniel Fleming', 'Stephane A Charitos'],
 ['Craig D Blinderman', 'Robert E Pollack', 'Brigid M Connelly'],
 ['Saundra Curry', 'Helen Velazquez'],
 ['Saundra Curry', 'Helen Velazquez'],
 ['Saundra Curry', 'Helen Velazquez'],
 ['Saundra Curry', 'Helen Velazquez'],
 ['Lesley Sharp', 'Nadia Abu El-Haj', 'Severin Fowles'],
 ['Akeel Bilgrami', 'Partha Chatterjee'],
 ['Stephen L Ostrow', 'Marco Zaider'],
 ['Anna Rozenshtein', 'Monique C Katz', 'Matthew P Moy'],
 ['Karen Fairbanks', 'Joeb Moore'],
 ['Danielle S Smoller', 'David E Benjamin'],
 ['Carol A Willis', 'Rosa