In [1]:
import re
import pandas as pd

In [129]:
def handle_section(section):
    """Convert one scraped section (a list of column strings) into a nested dict.

    Pieces read from `section`, by position:
      0: "<code>-<type>" on the first line and the duration on the second line
      1: one "<weekday> <start> - <end>" line per meeting
      2: one location line per meeting
      3: one instructor line per meeting
      4: one "<start date> - <end date>" line per meeting
      5: status text; the section is flagged open when it contains 'Open'

    Returns a single-entry dict `{code: {"admin": {...}, "days": [...]}}`.
    An IndexError is raised when any expected piece is missing.
    """
    header = section[0]
    sectionCode = header.split('-')[0]

    # Everything after the first '-' holds the type and, one line below, the duration.
    type_and_duration = header.split('-')[1].split('\n')
    admin = {
        "code": sectionCode,
        "type": type_and_duration[0],
        "duration": type_and_duration[1],
        "isOpen": 'Open' in section[5],
    }

    # The datetime column drives the number of meetings: one per line.
    meetings = []
    for row, slot in enumerate(section[1].split('\n')):
        # The first two characters are the weekday; the times start at offset 3.
        hours = slot[3:].split(' - ')
        place = section[2].split('\n')[row]
        instructor = section[3].split('\n')[row]
        dates = section[4].split('\n')[row].split(' - ')
        meetings.append({
            "weekday": slot[:2],
            "time.start": hours[0],
            "time.end": hours[1],
            "location": place,
            "prof": instructor,
            "date.start": dates[0],
            "date.end": dates[1],
        })

    return {sectionCode: {"admin": admin, "days": meetings}}

In [147]:
###
# Get the courses from a file, into a dict.
###

def load_courses_from_file(discipline, year, term):
    """Read a saved listing page and return its course chunks keyed by course code.

    The file read is `./pages/<discipline>-<year>-<term>.txt`. Its text is cut at
    every 'Collapse section ' marker; each chunk after the first marker is stored
    under its own first eight characters.
    """
    page_path = './pages/' + '-'.join([discipline, str(year), term]) + '.txt'

    with open(page_path, 'r') as page:
        chunks = page.read().split('Collapse section ')

    # Whatever precedes the first marker is not a course, so it is left out.
    return {chunk[0:8]: chunk for chunk in chunks[1:]}
              
# Load the raw text of every course on the saved HIS fall-2019 page.
# NOTE(review): the outputs saved further down in this notebook list POL courses, so they
# appear to come from a different run than this 'HIS' call -- re-run top to bottom to confirm.
courses = load_courses_from_file('HIS', 2019, 'fall')

In [148]:
###
# Get the sections from each course, adding as a list to the dict item.
###

def extract_sections_by_course(courses_to_process):
    """Parse the raw text of each course into a dict of its sections.

    Parameters
    ----------
    courses_to_process : dict
        Maps a course code to the raw page text of that course.

    Returns
    -------
    dict
        Maps each course code to `{section code: section data}`, where the section
        data is whatever `handle_section` returns for that section.

    Raises
    ------
    IndexError
        If a course's text contains neither known variant of the section-table heading.

    Sections that `handle_section` cannot parse are skipped. Previously a failed parse
    either crashed on an unbound name (first section) or silently re-added the last
    successfully parsed section, possibly one belonging to the previous course.
    """
    # Column-heading row that precedes every section table in the scraped text.
    table_heading = '  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n'

    # The whitespace run before the first heading comes in two variants
    # (three tabs vs. two tabs in the fifth line); they are tried in this order.
    first_heading_variants = (
        ' \n\t\t\n\t\n\t\t\n\t\n\t\t\t\n\t\n' + table_heading,
        ' \n\t\t\n\t\n\t\t\n\t\n\t\t\n\t\n' + table_heading,
    )

    # Shorter whitespace run that separates one section from the next.
    section_separator = '\n\t\n\t\t\n\t\n' + table_heading

    sections_by_course = {}

    for courseCode, courseStr in courses_to_process.items():
        for heading in first_heading_variants:
            if heading in courseStr:
                sectionContainer = courseStr.split(heading)[1]
                break
        else:
            # Same exception type the original indexing produced, with a usable message.
            raise IndexError('no section table heading found for course ' + repr(courseCode))

        sections = {}
        for section in sectionContainer.split(section_separator):
            try:
                sectionInfo = handle_section(section.split('\n\t\n'))
            except Exception:
                # Best effort: a malformed section is left out rather than aborting the
                # whole course, and nothing stale is merged in its place.
                continue

            sections.update(sectionInfo)

        sections_by_course[courseCode] = sections

    return sections_by_course

# Replace each course's raw text with its parsed sections.
# NOTE(review): this rebinds `courses` from raw strings to parsed dicts, so the cell cannot be
# run a second time without first re-running the cell that loads the file.
courses = extract_sections_by_course(courses)

In [151]:
## Clean badly processed courses :)

def clean_badly_processed_courses(courses):
    """Return a copy of `courses` without the entries whose value is still a string.

    The argument itself is not modified. The number of removed entries is printed.
    """
    # Collect the codes first, then remove them from a shallow copy.
    unparsed_codes = [
        code for code, parsed in courses.items() if isinstance(parsed, str)
    ]

    cleaned = courses.copy()
    for code in unparsed_codes:
        del cleaned[code]

    print("{} courses removed due to badly formed data.".format(len(unparsed_codes)))

    return cleaned

# Drop every course whose value is still a plain string; the call prints how many were dropped.
courses = clean_badly_processed_courses(courses)

0 courses removed due to badly formed data.


In [155]:
def convert_courses_to_dataframe(courses):
    """Flatten the nested courses dict into one row per course, section and meeting day.

    Parameters
    ----------
    courses : dict
        `{course code: {section code: {"admin": {...}, "days": [{...}, ...]}}}`.

    Returns
    -------
    pandas.DataFrame
        Indexed by (`course`, `code`), with the columns `admin.duration`,
        `admin.isOpen`, `admin.type`, `prof`, `weekday`, `time.start`, `time.end`,
        `location`, `date.start` and `date.end`, in that order. Rows follow the
        iteration order of `courses` and of each course's sections. An empty
        `courses` yields an empty frame with the same index names and columns.

    The rows are built directly from the dicts. The earlier stack / json_normalize /
    literal_eval / join pipeline relied on pandas calls that newer releases deprecate
    or reject (`pd.io.json.json_normalize`, `drop(label, 1)`) and on a string
    round-trip of the `days` lists.
    """
    column_order = [
        'admin.duration',
        'admin.isOpen',
        'admin.type',
        'prof',
        'weekday',
        'time.start',
        'time.end',
        'location',
        'date.start',
        'date.end'
    ]

    records = []
    for course, sections in courses.items():
        for code, section in sections.items():
            # Admin fields get the same dotted names that json_normalize produced.
            admin = {
                'admin.' + key: value
                for key, value in section.get('admin', {}).items()
            }

            # A section without any day still yields one row whose day columns are
            # empty, as the previous left join did.
            for day in section.get('days') or [{}]:
                record = dict(admin)
                record.update(day)
                record['course'] = course
                record['code'] = code
                records.append(record)

    # `columns=` fixes the column order and discards fields that are not reported
    # (e.g. admin.code, which is already part of the index).
    flat = pd.DataFrame(records, columns=['course', 'code'] + column_order)

    return flat.set_index(['course', 'code'])
    
# Flatten the parsed courses into a table and display it.
courses_by_section_by_day = convert_courses_to_dataframe(courses)
courses_by_section_by_day

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
POL 1101,B00,FullSess.,True,LEC,Staff,We,19:00,21:50,125 University (MNT) 202,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,We,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,Fr,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1501,B00,FullSess.,True,LEC,Noomane Raboudi,Mo,10:00,11:20,55 Laurier (DMS) 1160,2019-09-04,2019-12-03
POL 1501,B00,FullSess.,True,LEC,Noomane Raboudi,We,08:30,09:50,55 Laurier (DMS) 1160,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Mo,11:30,12:50,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Th,13:00,14:20,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 2101,A00,FullSess.,True,LEC,Luc Turgeon,Mo,08:30,09:50,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03


In [143]:
def load_course_descriptions(discipline):
    """Read `data/courses/<discipline>.csv` and return it indexed by its `code` column."""
    csv_path = str('data/courses/' + discipline + '.csv')
    descriptions = pd.read_csv(csv_path)
    return descriptions.set_index('code')

# Load the description table for the discipline and display it.
# NOTE(review): the output saved below lists POL rows although 'HIS' is requested here, so it
# looks like it was produced by an earlier run -- re-run to confirm.
course_descriptions = load_course_descriptions('HIS')
course_descriptions

Unnamed: 0_level_0,credits,year,language,title,description,extraDetails
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
POL 1101,3,1,English,Introduction to Political Science,"Comparative study of the ideas, concepts and i...",['Course Component: Lecture']
POL 1102,3,1,English,Politics and Globalization,"Understanding globalization in its political, ...",['Course Component: Lecture']
POL 1501,3,1,French,Introduction à la science politique,"Étude comparative des idées, concepts et enjeu...",['Volet : Cours magistral']
POL 1502,3,1,French,Politique et mondialisation,Examen du phénomène de la mondialisation : ses...,['Volet : Cours magistral']
POL 2101,3,2,English,Introduction to Canadian Politics,Foundations of the Canadian political space. C...,"['Course Component: Lecture', 'Prerequisite: P..."
POL 2103,3,2,English,Introduction to International Relations and Gl...,Study of international relations and the dynam...,"['Course Component: Lecture', 'Prerequisite: P..."
POL 2104,3,2,English,Introduction to Comparative Politics,Introduction to the comparative study of polit...,"['Course Component: Lecture', 'Prerequisite: P..."
POL 2107,3,2,English,Introduction to Political Thought,Origins and development of political thought. ...,"['Course Component: Lecture', 'Prerequisite : ..."
POL 2108,3,2,English,Modern Political Thought I,Origins and development of early modern politi...,"['Course Component: Lecture', 'Prerequisite : ..."
POL 2156,3,2,English,Foundations of Research in Political Science,Introduction to the fundamental dimensions of ...,"['Course Component: Lecture', 'Prerequisite: 1..."


In [158]:
def describe_course_sections(courses_by_section_by_day, course_descriptions):
    """Attach the course descriptions to every section/day row.

    Parameters
    ----------
    courses_by_section_by_day : pandas.DataFrame
        Indexed by two levels; the second one must be named `code` (the section code).
    course_descriptions : pandas.DataFrame
        Indexed by course code.

    Returns
    -------
    pandas.DataFrame
        Rows whose course code appears in both frames, indexed by (`course`, `section`),
        with the description columns appended.
    """
    # Move the section code into a column so both frames are indexed by course code only.
    sections = courses_by_section_by_day.reset_index(1)
    merged = pd.merge(sections, course_descriptions, left_index=True, right_index=True)

    # Name the merged index explicitly. The previous code depended on the merge
    # producing an unnamed index (so that reset_index() created a column called
    # 'index'), which fails as soon as both inputs use the same index name.
    described_courses_by_section_by_day = (
        merged
        .rename_axis('course')
        .reset_index()
        .set_index(['course', 'code'])
    )
    described_courses_by_section_by_day.index.names = ['course', 'section']

    return described_courses_by_section_by_day

# Combine the section/day table with the course descriptions.
described_courses_by_section_by_day = describe_course_sections(courses_by_section_by_day, course_descriptions)

In [159]:
## HIS fall 2019
# Keep only the listed courses.
# NOTE(review): the result saved below has headers but no rows -- presumably the frames held
# POL data when this cell last ran; re-run after loading HIS to confirm.
described_courses_by_section_by_day.query('course == ["HIS 1110", "HIS 3120", "HIS 3124", "HIS 3150", "HIS 4100", "HIS 4192", "HIS 4380", "HIS 4397"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end,credits,year,language,title,description,extraDetails
course,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [None]:
## HIS winter 2019
# NOTE(review): the last entry of the list is an empty string, which looks like an unfinished
# placeholder. The only load call in this notebook requests the 'fall' term, so a winter page
# presumably has to be loaded before this query can return rows -- confirm.
described_courses_by_section_by_day.query('course == ["HIS 3125", "HIS 3190", "HIS 4360", "HIS 4364", "HIS 4135", ""]')