# Scrape the USC class schedules and write them out as CSV and JSON files

Our model is a Catalog of Courses. Each Course can have many sections, and each section can have many Offerings (meeting times).

In [None]:
import csvkit
import json

class Offering:
    """One scheduled meeting of a section: when, where and with whom it meets.

    Only ``days`` is required; every other field defaults to ``None``.
    """

    def __init__(self, days, time=None, location=None, instructors=None,
                 dateRange=None, scheduleType=None):
        # When the section meets.
        self.days, self.time, self.dateRange = days, time, dateRange
        # Where it meets, who teaches it and what kind of meeting it is.
        self.location, self.instructors = location, instructors
        self.scheduleType = scheduleType

class Course:
    """A course (e.g. 'CSCE 101') in one term, together with its sections.

    ``number`` must be of the form '<DEPT> <NUMBER>' separated by a single
    space; it is split into ``dept`` and ``courseNumber``. ``sections`` maps a
    section number to the list of Offering objects for that section and
    ``crn`` maps a section number to its CRN.
    """

    def __init__(self, number, name, credits, semesterAndYear):
        numList = number.split(' ')
        self.dept = numList[0]  # e.g. CSCE
        self.courseNumber = numList[1]  # e.g. 101, or 799L
        self.number = number
        self.name = name
        self.credits = credits
        self.sections = {}  # key is section number, value is list of Offering
        self.crn = {}  # key is section number, value is crn
        self.semesterAndYear = semesterAndYear  # the term code of this offering
        self.term = None
        self.levels = None
        self.attributes = None
        self.description = None

    def getKey(self):
        """Return the key under which a Catalog stores this course (its full number)."""
        return self.number

    def getNumber(self):
        """Return department and course number joined without a space, e.g. 'CSCE101'."""
        return self.dept + self.courseNumber

    def getId(self, section):
        """Return the identifier of ``section``: term code + course number + section."""
        return self.semesterAndYear + self.getNumber() + section

    def __repr__(self):
        """Return one 'id:number:section:name:days:time:location:' line per offering.

        Missing (None) fields are rendered as empty strings, so sections that
        were added without any offering details can still be printed.
        """
        lines = []
        for section, offerings in self.sections.items():
            for offer in offerings:
                fields = (self.getId(section), self.number, section, self.name,
                          offer.days, offer.time, offer.location)
                text = u':'.join(u'' if f is None else u'%s' % (f,) for f in fields)
                lines.append(text + u':\n')
        result = u''.join(lines)
        if str is bytes:
            # Python 2: __repr__ has to return a byte string.
            return result.encode('utf-8')
        return result

    def writeToCSVFile(self, writer, count):
        """Write one row per offering to ``writer``, numbering rows from ``count``.

        Returns the next unused row number.
        """
        for section, offerings in self.sections.items():
            for offer in offerings:
                writer.writerow([count, self.getId(section), self.semesterAndYear, self.dept,
                                 self.courseNumber, section, self.name, self.credits,
                                 offer.days, offer.time, offer.location, offer.instructors])
                count = count + 1
        return count

    def toDictionaryFlat(self):
        """Return a list of dictionaries, one per offering of every section."""
        result = []
        for section, offerings in self.sections.items():
            for offer in offerings:
                result.append({'identifier': self.getId(section),
                               'semesteryear': self.semesterAndYear,
                               'dept': self.dept,
                               'number': self.courseNumber,
                               'section': section,
                               'name': self.name,
                               'credits': self.credits,
                               'days': offer.days,
                               'time': offer.time,
                               'location': offer.location,
                               'instructors': offer.instructors})
        return result

    def toDictionary(self):
        """Return this course as a single dictionary.

        The 'sections' key holds a list with one dictionary per section, each
        with its own 'offerings' list. The optional keys 'description',
        'levels', 'term' and 'attributes' are only present when set.
        """
        classDictionary = {
            'classid': self.dept + self.courseNumber,
            'semesteryear': self.semesterAndYear,
            'dept': self.dept,
            'number': self.courseNumber,
            'name': self.name,
            'credits': self.credits,
            'sections': []}
        if self.description:
            classDictionary['description'] = self.description
        if self.levels:
            classDictionary['levels'] = self.levels
        if self.term:
            classDictionary['term'] = self.term
        if self.attributes:
            classDictionary['attributes'] = self.attributes
        for sectionNumber, offerings in self.sections.items():
            section = {
                'section': sectionNumber,
                'offerings': []
            }
            if sectionNumber in self.crn:
                section['crn'] = self.crn[sectionNumber]
            for offer in offerings:
                section['offerings'].append({
                    'days': offer.days,
                    'time': offer.time,
                    'date': offer.dateRange,
                    'location': offer.location,
                    'instructors': offer.instructors,
                    'type': offer.scheduleType})
            classDictionary['sections'].append(section)
        return classDictionary

    def addOffering(self, sectionNumber, crn, days=None, time=None, location=None,
                    instructors=None, dateRange=None, scheduleType=None):
        """Add an offering time to this course, for sectionNumber.

        A section can have many offerings. The CRN is recorded only when the
        section is first created.
        """
        offer = Offering(days, time, location, instructors, dateRange, scheduleType)
        if sectionNumber in self.sections:
            # this sectionNumber already exists so add this offering to it
            self.sections[sectionNumber].append(offer)
        else:
            self.crn[sectionNumber] = crn
            self.sections[sectionNumber] = [offer]  # first offering

    def mergeWith(self, other):
        """Add the sections of other to this course.

        Offerings are copied into a new list so the two courses never share
        the same list object.
        """
        for sectionNumber, offerings in other.sections.items():
            self.crn[sectionNumber] = other.crn[sectionNumber]
            self.sections[sectionNumber] = self.sections.get(sectionNumber, []) + list(offerings)
            
class Catalog:
    """A collection of courses, keyed by each course's getKey() value."""

    def __init__(self):
        self.courses = {}

    def add(self, course):
        """Add course; if one with the same key exists, merge the new sections into it."""
        key = course.getKey()
        if key in self.courses:
            # already there, so add the new sections to the stored course
            self.courses[key].mergeWith(course)
        else:
            self.courses[key] = course

    def __getitem__(self, name):
        """Return the course stored under ``name``; raises KeyError if absent."""
        return self.courses[name]

    def __repr__(self):
        """Return the concatenated string form of every course."""
        return ''.join(str(course) for course in self.courses.values())

    def writeToCSVFile(self, writer):
        """Write a header row and then every offering of every course to ``writer``."""
        writer.writerow(["id", "identifier", "semesteryear", "department", "number", "section",
                         "name", "credits", "days", "time", "location", "instructors"])
        count = 1
        for course in self.courses.values():
            # each course returns the next free row id
            count = course.writeToCSVFile(writer, count)

    def toDictionary(self):
        """Return a list with one dictionary per course, each given a 1-based 'id'."""
        result = []
        for count, course in enumerate(self.courses.values(), 1):
            entry = course.toDictionary()
            entry['id'] = count
            result.append(entry)
        return result

    def writeToFile(self, filename):
        """Write the catalog to ``filename`` as CSV or JSON, chosen by extension.

        Any other extension prints an error message and writes nothing.
        """
        if filename.endswith('.csv'):
            with open(filename, 'w') as csvfile:
                self.writeToCSVFile(csvkit.writer(csvfile))
        elif filename.endswith('.json'):
            with open(filename, 'w') as jsonfile:
                json.dump(self.toDictionary(), jsonfile)
        else:
            print("ERROR: Filetype is not supported: %s" % filename)

We define a couple of functions helpful in scraping the content.

In [None]:
def gatherFollowString(soup, keyList):
    """Map each key of keyList found in soup to the string that follows it.

    The strings examined are soup.stripped_strings, taken as consecutive
    pairs. When a key occurs more than once, the last occurrence wins.
    """
    return {label: follower
            for (label, follower) in inpairs(soup.stripped_strings)
            if label in keyList}

def getStringThatEndsWith(soup, value):
    """Return the first of soup.stripped_strings ending with value, or '' if there is none."""
    matches = (text for text in soup.stripped_strings if text.endswith(value))
    return next(matches, '')

def gatherTable(soup):
    """Convert a table into a list of dictionaries, one per data row.

    Assumes soup is a table whose first row holds the keys in th cells and
    whose remaining rows hold their values in td cells. Each value is the
    cell's stripped strings joined by single spaces. Extra cells beyond the
    number of keys (or extra keys beyond the number of cells) are ignored.

    Returns an empty list when the table has no rows at all.
    """
    rows = soup.find_all('tr')
    if not rows:
        return []  # nothing to read keys from

    # the keys come from the header row
    keys = [header.string for header in rows[0].find_all('th')]

    # the rest of the table; we assume that all of these rows use td
    return [
        {key: ' '.join(cell.stripped_strings)
         for (key, cell) in zip(keys, row.find_all('td'))}
        for row in rows[1:]
    ]

from itertools import tee, izip

def pairwise(iterable):
    """Yield non-overlapping pairs: s -> (s0,s1), (s2,s3), ...

    A trailing element without a partner is dropped. Written as a plain
    generator so it does not depend on itertools.izip, which exists only in
    Python 2.
    """
    items = iter(iterable)
    for first in items:
        try:
            second = next(items)
        except StopIteration:
            return  # odd number of elements: discard the last one
        yield first, second

def inpairs(iterable):
    """Yield overlapping pairs: s -> (s0,s1), (s1,s2), (s2,s3), ...

    Yields nothing when the iterable has fewer than two elements. Written as
    a plain generator so it does not depend on itertools.izip, which exists
    only in Python 2.
    """
    items = iter(iterable)
    try:
        previous = next(items)
    except StopIteration:
        return  # empty input
    for current in items:
        yield previous, current
        previous = current

Function to read the raw HTML and add it to a catalog.

In [None]:
from bs4 import BeautifulSoup
import re

# Subject codes to scrape; numberToInt also uses the position of a code in this list.
departments = ["ACCT","AERO","AFAM","AFCI","AFYS","AMST","ANES","ANTH","ARAB","ARMY","ARTE","ARTH","ARTS","ASLG",
"ASTR","ASUP","ATEP","BADM","BIOL","BIOS","BMEN","BMSC","CAST","CHEM","CHIN","CLAS","COMD","COMM","COSM","CPLT",
"CRJU","CSCE","CSCI","CSXE","DANC","DMSB","ECHE","ECIV","ECON","EDAD","EDCE","EDCF","EDCI","EDCS","EDEC","EDEL",
"EDET","EDEX","EDFI","EDFN","EDFO","EDHE","EDHL","EDLC","EDLD","EDLP","EDML","EDPH","EDPY","EDRD","EDRE","EDRM",
"EDSC","EDSE","EDTE","EDUC","EDVI","ELCT","EMCH","EMED","ENCP","ENFS","ENGL","ENHS","ENSL","ENVR","EPID","ETMG",
"EURO","EXSC","FILM","FINA","FORL","FPMD","FREN","GEOG","GEOL","GERM","GMED","GRAD","GREK","HGEN","HIMS","HIST",
"HMSV","HONS","HPEB","HPRO","HRSM","HRTM","HSPM","IBUS","IDST","INFO","INTL","ITAL","ITEC","JAPA","JOUR","JSTU",
"LASP","LATN","LBST","LIBR","LING","LOGC","MART","MATH","MBAD","MBIM","MCBA","MEDI","MGMT","MGSC","MILS","MKTG",
"MSCI","MUED","MUSC","MUSM","NAVY","NEUR","NPAD","NPSY","NURS","OBGY","ORSU","PALM","PATH","PEDI","PEDU","PHAR",
"PHIL","PHPH","PHYS","PHYT","PMDR","POLI","PORT","PSYC","PUBH","RADI","RCAM","RELG","RETL","RHAB","RUSS","SAEL",
"SCCP","SCHC","SLIS","SMED","SOCY","SOST","SOWK","SPAN","SPCH","SPTE","STAT","SURG","THEA","UNIV","USC","WGST"]


def numberToInt(number):
    """Turn a course number such as 'CSCE 490' into an integer.

    The result is the index of the subject code in ``departments`` times 1000
    plus the leading digits of the course number. The subject code may have
    any length (e.g. 'USC 101') and a trailing letter suffix on the course
    number is ignored (e.g. 'CSCE 799L' counts as 799).

    Raises ValueError when ``number`` does not look like '<letters><digits>'
    or when the subject code is not in ``departments``.
    """
    parsed = re.match(r'\s*([A-Za-z]+)\s*(\d+)', number)
    if parsed is None:
        raise ValueError("Not a course number: %r" % (number,))
    return departments.index(parsed.group(1)) * 1000 + int(parsed.group(2))

def parsePage(webpage, catalog, semesterAndYear):
    """Parse the webpage and add its contents to the catalog.

    The first table with class "datadisplaytable" is read as alternating
    rows: a title row ('<title> - <crn> - <number> - <section>' inside a
    link) followed by a details row. One Course holding that single section
    is built per pair and handed to catalog.add(), which merges sections of
    the same course.

    A page without such a table adds nothing to the catalog.
    """
    TERM = "Associated Term:"
    LEVELS = "Levels:"
    ATTR = "Attributes:"

    soup = BeautifulSoup(webpage, "lxml")
    tables = soup.find_all("table", class_="datadisplaytable")
    if not tables:
        return  # no results table on this page
    rows = [row for row in tables[0].children if row.name == u'tr']
    for titleRow, times in pairwise(rows):
        link = titleRow.th.a
        nameParts = link.string.split(' - ')
        detailsUrl = link.get('href')
        crn = re.match(r'.*crn_in=(\d+).*', detailsUrl, re.I).group(1)
        number = nameParts[-2].strip()  # next-to-last is 'CSCE 101'
        # everything before the crn; the title itself may contain ' - '
        name = (' - '.join(nameParts[:-3])).strip()
        section = nameParts[-1].strip()  # last one is section #
        credits = getStringThatEndsWith(times, "Credits")
        if credits:
            credits = credits[0:-8]  # drop the trailing ' Credits'
        course = Course(number, name, credits, semesterAndYear)
        info = gatherFollowString(times, [TERM, LEVELS, ATTR])
        if TERM in info:
            # the term was gathered but previously never stored on the course
            course.term = info[TERM]
        if LEVELS in info:
            course.levels = info[LEVELS]
        if ATTR in info:
            course.attributes = info[ATTR]
        if times.table:  # section has some offerings
            for offer in gatherTable(times.table):
                # replace non-breaking unicode spaces (from &nbsp; via BS) with nothing.
                days = offer['Days'].replace(u'\xa0', '')
                course.addOffering(section, crn, days, offer['Time'], offer['Where'],
                                   offer['Instructors'], offer['Date Range'], offer['Schedule Type'])
        else:
            # no offerings, add this section with empty values for the offering,
            # as it appears on the webpage
            course.addOffering(section, crn)
        catalog.add(course)


Functions to fetch and parse the course descriptions and add them to a catalog.

In [None]:
# https://ssb.onecarolina.sc.edu/BANP/bwckctlg.p_display_courses?term_in=201508&one_subj=CSCE&sel_crse_strt=101&sel_crse_end=101&sel_subj=&sel_levl=&sel_schd=&sel_coll=&sel_divs=&sel_dept=&sel_attr=
import time
import sys
import requests

def parseCourseDescriptionPage(webpage, course):
    """Extract the course description from webpage and add it to course.

    The description is read from the first child of the td in the second row
    of the first table with class "datadisplaytable". If the page does not
    have that structure, course.description is left unchanged rather than
    raising, so one unexpected page does not abort a long scrape.
    """
    soup = BeautifulSoup(webpage, "lxml")
    tables = soup.find_all("table", class_="datadisplaytable")
    if not tables:
        return
    rows = [row for row in tables[0].children if row.name == u'tr']
    if len(rows) < 2:
        return
    cell = rows[1].td
    if cell is None or not cell.contents:
        return
    course.description = cell.contents[0].strip()

def setCourseDescriptions(catalog, delay=.5, timeout=30):
    """Fetch the description of every course in catalog and store it on the course.

    One POST request is made per course; parseCourseDescriptionPage sets
    course.description from the response.

    delay   -- seconds to sleep after each request.
    timeout -- seconds to wait for the server before requests raises a
               timeout error, instead of blocking forever on a stalled
               connection.
    """
    print("Fetching %d courses" % len(catalog.courses))
    for i, (name, course) in enumerate(catalog.courses.items(), 1):
        print("%d. %s" % (i, name))
        sys.stdout.flush()
        # sent as a pre-built string, the same way the schedule search is sent
        payload = ('term_in=' + course.semesterAndYear
                   + '&one_subj=' + course.dept
                   + '&sel_crse_strt=' + course.courseNumber
                   + '&sel_crse_end=' + course.courseNumber
                   + '&sel_subj=&sel_levl=&sel_schd=&sel_coll=&sel_divs=&sel_dept=&sel_attr=')
        r = requests.post('https://ssb.onecarolina.sc.edu/BANP/bwckctlg.p_display_courses',
                          data=payload, timeout=timeout)
        parseCourseDescriptionPage(r.text, course)
        time.sleep(delay)

Download the webpages, parse and add them to the catalog. Then write each one out to a file.

In [None]:
#201501 is Spring
#201505 is Summer
#201508 is Fall
#semesterAndYear = '201508'

#departments = ['CSCE'] #for testing
#catalog = 3 #for testing

def scrape(semesters=('201501', '201505', '201508'), subjects=None, timeout=30):
    """Download, parse and save the class schedule of each term.

    For every term code in ``semesters`` a fresh Catalog is filled with the
    schedule pages of every subject, the course descriptions are fetched, and
    the result is written to 'schedule-<term>.csv' and 'schedule-<term>.json'.

    semesters -- term codes to scrape (e.g. 201501 Spring, 201505 Summer,
                 201508 Fall). Defaults to the three 2015 terms.
    subjects  -- subject codes to scrape; defaults to the module-level
                 ``departments`` list.
    timeout   -- seconds to wait for the server before requests raises a
                 timeout error.
    """
    if subjects is None:
        subjects = departments
    for semesterAndYear in semesters:
        catalog = Catalog()
        for dept in subjects:
            # Built by concatenation on purpose: the string contains literal
            # '%25' escapes and repeated keys that must be sent exactly as is.
            payload = 'term_in=' + semesterAndYear + '&sel_subj=dummy&sel_day=dummy&sel_schd=dummy&sel_insm=dummy&sel_camp=dummy&sel_levl=dummy&sel_sess=dummy&sel_instr=dummy&sel_ptrm=dummy&sel_attr=dummy&sel_camp=COL&sel_subj=' + dept + '&sel_crse=&sel_title=&sel_from_cred=&sel_to_cred=&sel_levl=%25&sel_ptrm=%25&sel_sess=%25&sel_attr=%25&begin_hh=0&begin_mi=0&begin_ap=a&end_hh=0&end_mi=0&end_ap=a'
            print(dept)
            sys.stdout.flush()
            r = requests.post("https://ssb.onecarolina.sc.edu/BANP/bwckschd.p_get_crse_unsec",
                              data=payload, timeout=timeout)
            parsePage(r.text, catalog, semesterAndYear)
            time.sleep(1)  # pause between requests to the server
        setCourseDescriptions(catalog)
        print("Writing Files")
        sys.stdout.flush()
        catalog.writeToFile('schedule-' + semesterAndYear + '.csv')
        catalog.writeToFile('schedule-' + semesterAndYear + '.json')
        
# Run the whole scrape as soon as this cell is executed.
scrape()

## Things that did not work out

I first tried to send the data using a dictionary, the normal way, but the website responded with the search webpage, not the results. I fixed this problem by sending the same data string as my browser does.

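So, either I have a typo in my payload below, which I still cannot find, or the arguments have to be in this specific order for the USC server to work. I noticed that when I use the dictionary payload the arguments are sent to the server in a different order. Note also that the working string repeats several keys (for example `sel_subj=dummy` and later `sel_subj=CSCE`), and a Python dictionary can hold only one value per key, so the dictionary version cannot send the same set of arguments at all.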

In [None]:
#Using this one does not work; the original guess was that it sends the keys in the wrong order.
#NOTE(review): six keys below appear twice (sel_subj, sel_camp, sel_levl, sel_sess, sel_ptrm, sel_attr).
#A dict literal keeps only the last value given for a repeated key, so their 'dummy' entries
#are dropped and this dict holds fewer arguments than the working string payload.
payload = {'term_in': 201501,
'sel_subj': 'dummy',
'sel_day': 'dummy',
'sel_schd': 'dummy',
'sel_insm': 'dummy',
'sel_camp': 'dummy',
'sel_levl': 'dummy',
'sel_sess': 'dummy',
'sel_instr': 'dummy',
'sel_ptrm': 'dummy',
'sel_attr': 'dummy',
'sel_camp': 'COL',
'sel_subj': 'CSCE',
'sel_crse': '',
'sel_title': '',
'sel_from_cred': '',
'sel_to_cred': '',
'sel_levl':'%',
'sel_ptrm':'%',
'sel_sess':'%',
'sel_attr':'%',
'begin_hh': 0,
'begin_mi': 0,
'begin_ap': 'a',
'end_hh': 0,
'end_mi': 0,
'end_ap': 'a'}

#cookies: We don't need these. They are kept only as a record of what was tried.
cookies= {
    '_utma':'2872370.68114765.1407249036.1412703684.1412714272.86',
    '_utmz': '2872370.1410959235.55.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
    '_ga': 'GA1.2.68114765.1407249036',
    'accessibility': 'false',
    'sghe_magellan_locale': 'en_US',
    'sghe_magellan_username': ''}

#We don't need these headers either.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36',
           'Referer': 'https://ssb.onecarolina.sc.edu/BANP/bwckgens.p_proc_term_date',
           'Origin': 'https://ssb.onecarolina.sc.edu',
           'Host': 'ssb.onecarolina.sc.edu',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'Accept-Language':'en-US,en;q=0.8,es;q=0.6'}

#We must use a string for the data because (it seems) the keys have to be in this order!!!
#The string also carries keys that occur twice (e.g. sel_subj=dummy and sel_subj=CSCE).
#This assignment replaces any earlier value of payload.
payload = 'term_in=201501&sel_subj=dummy&sel_day=dummy&sel_schd=dummy&sel_insm=dummy&sel_camp=dummy&sel_levl=dummy&sel_sess=dummy&sel_instr=dummy&sel_ptrm=dummy&sel_attr=dummy&sel_camp=COL&sel_subj=CSCE&sel_crse=&sel_title=&sel_from_cred=&sel_to_cred=&sel_levl=%25&sel_ptrm=%25&sel_sess=%25&sel_attr=%25&begin_hh=0&begin_mi=0&begin_ap=a&end_hh=0&end_mi=0&end_ap=a'