In [1]:
import re
import warnings

import pandas as pd

In [128]:
###
# Get the courses from a file, into a dict.
###

def load_courses_from_file(discipline, year, term):
    """Read one saved course-listing page and split it into per-course text.

    The page is read from ``./pages/<discipline>-<year>-<term>.txt`` and cut at
    every ``'Collapse section '`` marker.

    Args:
        discipline: Subject prefix used in the file name (e.g. ``'POL'``).
        year: Year used in the file name; converted with ``str()``.
        term: Term used in the file name (e.g. ``'fall'``).

    Returns:
        dict: Maps the first 8 characters of each chunk (e.g. ``'POL 1101'``)
        to the full text of that chunk. Later chunks with the same first
        8 characters overwrite earlier ones.
    """
    page_path = './pages/' + discipline + '-' + str(year) + '-' + term + '.txt'

    with open(page_path, 'r') as course_file:
        page_text = course_file.read()

    # The text before the first marker is not a course, so it is discarded.
    course_chunks = page_text.split('Collapse section ')[1:]

    return {chunk[0:8]: chunk for chunk in course_chunks}
              
# Raw text of each course on the saved POL / fall 2019 page.
courses = load_courses_from_file(discipline='POL', year=2019, term='fall')

In [129]:
def handle_section(section):
    """Turn the cells of one section row into a ``{code: details}`` mapping.

    Args:
        section: List of cell strings for one section, read positionally:
            ``[0]`` ``'<code>-<type>\\n<duration>'``,
            ``[1]`` one ``'<Dd> <start> - <end>'`` line per meeting,
            ``[2]`` one location line per meeting,
            ``[3]`` one instructor line per meeting,
            ``[4]`` one ``'<start date> - <end date>'`` line per meeting,
            ``[5]`` status text (the section is open when it contains ``'Open'``).

    Returns:
        dict: ``{code: {"admin": {...}, "days": [...]}}`` with one ``days``
        entry per line of ``section[1]``.

    Raises:
        IndexError: If a cell or a line within a cell is missing.
    """
    header = section[0]
    section_code = header.split('-')[0]
    type_and_duration = header.split('-')[1].split('\n')

    admin = {
        "code": section_code,
        "type": type_and_duration[0],
        "duration": type_and_duration[1],
        "isOpen": 'Open' in section[5],
    }

    # Each of these cells carries one line per meeting, in the same order.
    when_lines = section[1].split('\n')
    room_lines = section[2].split('\n')
    prof_lines = section[3].split('\n')
    date_lines = section[4].split('\n')

    days = []
    # The number of meetings is dictated by the number of datetime lines.
    for position in range(len(when_lines)):
        when = when_lines[position]
        # First two characters are the weekday; the clock range starts at offset 3.
        clock_range = when[3:].split(' - ')
        date_range = date_lines[position].split(' - ')

        days.append({
            "weekday": when[:2],
            "time.start": clock_range[0],
            "time.end": clock_range[1],
            "location": room_lines[position],
            "prof": prof_lines[position],
            "date.start": date_range[0],
            "date.end": date_range[1],
        })

    return {section_code: {"admin": admin, "days": days}}

In [132]:
###
# Get the sections from each course, adding as a list to the dict item.
###

def extract_sections_by_course(courses_to_process):
    """Parse every course's raw page text into its sections.

    Args:
        courses_to_process: dict mapping a course code to the raw text of that
            course (a string containing the section table).

    Returns:
        dict: course code -> ``{section code: section details}``, where each
        entry is what ``handle_section`` returns for that section. A course
        whose text contains no recognised section-table header maps to an
        empty dict.

    Raises:
        TypeError: If a course value is not a string, e.g. because the output
            of this function was passed back into it.

    Warns:
        UserWarning: Once for every course without a section table and once
            for every section that ``handle_section`` could not parse. Such
            sections are skipped instead of silently reusing the result of a
            previously parsed section.
    """
    column_row = '  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n'
    # The header of the first table appears with either three or two tabs on
    # its last padding line; the first variant is tried first.
    first_table_headers = (
        ' \n\t\t\n\t\n\t\t\n\t\n\t\t\t\n\t\n' + column_row,
        ' \n\t\t\n\t\n\t\t\n\t\n\t\t\n\t\n' + column_row,
    )
    # Every further section is introduced by this shorter repeat of the header.
    next_table_header = '\n\t\n\t\t\n\t\n' + column_row

    sections_by_course = {}

    for course_code, course_text in courses_to_process.items():
        if not isinstance(course_text, str):
            raise TypeError(
                "expected raw course text for {!r}, got {}; was the parsed "
                "output passed back in?".format(course_code, type(course_text).__name__)
            )

        sections = {}
        sections_by_course[course_code] = sections

        header = next((h for h in first_table_headers if h in course_text), None)
        if header is None:
            warnings.warn("{}: no section table found; no sections extracted".format(course_code))
            continue

        section_container = course_text.split(header)[1]

        for raw_section in section_container.split(next_table_header):
            try:
                section_info = handle_section(raw_section.split('\n\t\n'))
            except Exception as error:
                # Best effort: report the malformed section and move on, without
                # re-applying whatever section happened to be parsed last.
                warnings.warn("{}: skipped a malformed section ({!r})".format(course_code, error))
                continue

            sections.update(section_info)

    return sections_by_course

# Replace the raw page text held in `courses` with the parsed sections.
# NOTE(review): `courses` is rebound to the function's own output, so this cell
# cannot simply be re-run; reload the raw text first.
courses = extract_sections_by_course(courses)
    
## (this has been seen to error out with "list index out of range" — presumably for a
## page whose text lacks the section-table header the parser splits on; TODO confirm.)

In [131]:
# NOTE(review): looks like a leftover scratch check — its execution count predates
# the cell above. On a top-to-bottom run `courses` has already been replaced by the
# parsed sections there, so this call receives dicts rather than raw page text.
test = extract_sections_by_course(courses)

test

{'POL 1101': {'B00': {'admin': {'code': 'B00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '125 University (MNT) 202',
     'prof': 'Staff',
     'time.end': '21:50',
     'time.start': '19:00',
     'weekday': 'We'}]},
  'C00': {'admin': {'code': 'C00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '120 University (FSS) 2005',
     'prof': 'Staff',
     'time.end': '12:50',
     'time.start': '11:30',
     'weekday': 'Tu'},
    {'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '120 University (FSS) 2005',
     'prof': 'Staff',
     'time.end': '14:20',
     'time.start': '13:00',
     'weekday': 'Fr'}]}},
 'POL 1102': {'A00': {'admin': {'code': 'A00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'dat

In [133]:
## Clean badly processed courses :)
# Any course whose value is still a plain string (instead of a dict of
# sections) is collected and then dropped from `courses`.
coursesToDelete = {
    courseCode: courseObj
    for courseCode, courseObj in courses.items()
    if isinstance(courseObj, str)
}

for course in coursesToDelete:
    courses.pop(course, None)

print(str(len(coursesToDelete)) + " courses removed due to badly formed data.")

0 courses removed due to badly formed data.


In [134]:
courses

{'POL 1101': {'B00': {'admin': {'code': 'B00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '125 University (MNT) 202',
     'prof': 'Staff',
     'time.end': '21:50',
     'time.start': '19:00',
     'weekday': 'We'}]},
  'C00': {'admin': {'code': 'C00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '120 University (FSS) 2005',
     'prof': 'Staff',
     'time.end': '12:50',
     'time.start': '11:30',
     'weekday': 'Tu'},
    {'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '120 University (FSS) 2005',
     'prof': 'Staff',
     'time.end': '14:20',
     'time.start': '13:00',
     'weekday': 'Fr'}]}},
 'POL 1102': {'A00': {'admin': {'code': 'A00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'dat

In [135]:
## Convert the courses object to a dataframe
# Build a course x section grid, then stack it into a long frame indexed by
# (course, section) whose single column (named 0) holds each section's dict.
df = (
    pd.DataFrame.from_dict(courses, orient='index')
    .stack()
    .to_frame()
)

df

Unnamed: 0,Unnamed: 1,0
POL 1101,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."
POL 1101,C00,"{'admin': {'code': 'C00', 'type': 'LEC', 'dura..."
POL 1102,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
POL 1501,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."
POL 1502,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
POL 2101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
POL 2103,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."
POL 2103,C00,"{'admin': {'code': 'C00', 'type': 'LEC', 'dura..."
POL 2104,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
POL 2107,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."


In [136]:
## Unpack the `data` column
# Turn the two index levels into ordinary columns and name all three columns.
# NOTE(review): `df` is rebound in place, so re-running this cell without first
# re-running the previous one calls reset_index() on the already-flattened frame.
df = df.reset_index()
df.columns = ['course', 'code', 'data']
df

# Flatten each section dict into dotted columns (e.g. `admin.code`) and join them
# back row by row on the integer index created by reset_index() above.
# NOTE(review): newer pandas exposes this function as `pd.json_normalize` — TODO
# confirm which spelling the installed pandas version accepts.
df2 = df.join(pd.io.json.json_normalize(df['data']))
df2 = df2.set_index(['course', 'code'])

df2

Unnamed: 0_level_0,Unnamed: 1_level_0,data,admin.code,admin.duration,admin.isOpen,admin.type,days
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POL 1101,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'We', 'time.start': '19:00', 'tim..."
POL 1101,C00,"{'admin': {'code': 'C00', 'type': 'LEC', 'dura...",C00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '11:30', 'tim..."
POL 1102,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'We', 'time.start': '13:00', 'tim..."
POL 1501,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '10:00', 'tim..."
POL 1502,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '11:30', 'tim..."
POL 2101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '08:30', 'tim..."
POL 2103,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '19:00', 'tim..."
POL 2103,C00,"{'admin': {'code': 'C00', 'type': 'LEC', 'dura...",C00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '16:00', 'tim..."
POL 2104,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '08:30', 'tim..."
POL 2107,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '11:30', 'tim..."


In [137]:
## Unpack the `days` column

### Round-trip the days column through its string form: each cell is rendered with
### str() and parsed back with ast.literal_eval, yielding plain lists of dicts.
import ast

days_as_string = df2.astype({'days': str}).reset_index()['days'].apply(ast.literal_eval)

### Unpack the column. The column contains lists of objects with consistent keys, so each object becomes its own row.
### `keys=` tags every expanded row with the position of the section it came from.
days_by_section = pd.concat([pd.DataFrame(x) for x in days_as_string], keys=days_as_string.index)

### Join the expanded rows with their original courses, dropping the now-unused columns.
### Dropping the inner (per-day) index level leaves the section position, which matches
### the integer index of `df2.reset_index()`, so a section with N days becomes N rows.
df3 = df2.reset_index().join(days_by_section.reset_index(level=1, drop=True))
### Columns are dropped by keyword: passing the axis positionally (`drop('data', 1)`)
### is deprecated in pandas and rejected by newer releases.
df3 = df3.set_index(['course', 'code']).drop(columns=['data', 'days'])

df3

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.code,admin.duration,admin.isOpen,admin.type,date.end,date.start,location,prof,time.end,time.start,weekday
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
POL 1101,B00,B00,FullSess.,True,LEC,2019-12-03,2019-09-04,125 University (MNT) 202,Staff,21:50,19:00,We
POL 1101,C00,C00,FullSess.,True,LEC,2019-12-03,2019-09-04,120 University (FSS) 2005,Staff,12:50,11:30,Tu
POL 1101,C00,C00,FullSess.,True,LEC,2019-12-03,2019-09-04,120 University (FSS) 2005,Staff,14:20,13:00,Fr
POL 1102,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,120 University (FSS) 2005,Staff,14:20,13:00,We
POL 1102,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,120 University (FSS) 2005,Staff,12:50,11:30,Fr
POL 1501,B00,B00,FullSess.,True,LEC,2019-12-03,2019-09-04,55 Laurier (DMS) 1160,Noomane Raboudi,11:20,10:00,Mo
POL 1501,B00,B00,FullSess.,True,LEC,2019-12-03,2019-09-04,55 Laurier (DMS) 1160,Noomane Raboudi,09:50,08:30,We
POL 1502,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,161 Louis Pasteur (CBY) C03,Staff,12:50,11:30,Mo
POL 1502,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,161 Louis Pasteur (CBY) C03,Staff,14:20,13:00,Th
POL 2101,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,161 Louis Pasteur (CBY) C03,Luc Turgeon,09:50,08:30,Mo


In [138]:
## Reorganize columns
# Select the columns in reading order; `admin.code` is left out because it
# repeats the `code` index level.
courses_by_section_by_day = df3.loc[:, [
    'admin.duration',
    'admin.isOpen',
    'admin.type',
    'prof',
    'weekday',
    'time.start',
    'time.end',
    'location',
    'date.start',
    'date.end',
]]
courses_by_section_by_day

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
POL 1101,B00,FullSess.,True,LEC,Staff,We,19:00,21:50,125 University (MNT) 202,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,We,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,Fr,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1501,B00,FullSess.,True,LEC,Noomane Raboudi,Mo,10:00,11:20,55 Laurier (DMS) 1160,2019-09-04,2019-12-03
POL 1501,B00,FullSess.,True,LEC,Noomane Raboudi,We,08:30,09:50,55 Laurier (DMS) 1160,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Mo,11:30,12:50,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Th,13:00,14:20,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 2101,A00,FullSess.,True,LEC,Luc Turgeon,Mo,08:30,09:50,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03


In [139]:
## Find all courses taught by “Staff”
# Keep the meeting rows whose `prof` value contains the text 'Staff'.
courses_by_section_by_day.loc[
    lambda meetings: meetings['prof'].str.contains('Staff')
]

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
POL 1101,B00,FullSess.,True,LEC,Staff,We,19:00,21:50,125 University (MNT) 202,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1101,C00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,We,13:00,14:20,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1102,A00,FullSess.,True,LEC,Staff,Fr,11:30,12:50,120 University (FSS) 2005,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Mo,11:30,12:50,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 1502,A00,FullSess.,True,LEC,Staff,Th,13:00,14:20,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
POL 2103,C00,FullSess.,True,LEC,Staff,Tu,16:00,17:20,800 King Edward (STE) B0138,2019-09-04,2019-12-03
POL 2103,C00,FullSess.,True,LEC,Staff,Th,14:30,15:50,800 King Edward (STE) B0138,2019-09-04,2019-12-03
POL 2107,B00,FullSess.,True,LEC,Staff,Mo,11:30,12:50,115 Séraphin Marion (HGN) 302,2019-09-04,2019-12-03


In [141]:
def load_course_descriptions(discipline):
    """Load the course catalogue CSV for one discipline.

    Args:
        discipline: Name of the CSV (without extension) under ``data/courses/``.

    Returns:
        pandas.DataFrame: The CSV contents, indexed by its ``code`` column.
    """
    csv_path = 'data/courses/' + discipline + '.csv'
    descriptions = pd.read_csv(csv_path)
    return descriptions.set_index('code')

# Catalogue entries read from data/courses/POL.csv, indexed by `code`.
course_descriptions = load_course_descriptions(discipline='POL')
course_descriptions

Unnamed: 0_level_0,credits,year,language,title,description,extraDetails
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HIS 1100,3,1,English,History Labs,"Using a hand-on approach, based on primary sou...","['Course Component: Lecture, Lecture']"
HIS 1101,3,1,English,The Making of Canada,"Survey of the political, social and cultural e...",['Course Component: Lecture']
HIS 1110,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1111,3,1,English,The Twentieth-Century World from 1945,A course of general interest which focuses on ...,['Course Component: Lecture']
HIS 1120,3,1,English,What Is Europe? (16th-21th Century),Long term study of the changing nature of Euro...,['Course Component: Lecture']
HIS 1500,3,1,French,Laboratoires d'histoire,"Par son orientation pratique, fondée sur l'uti...","['Volet : Cours magistral, Cours magistral']"
HIS 1501,3,1,French,La formation du Canada,"Survol de l'évolution politique, sociale et cu...",['Volet : Cours magistral']
HIS 1510,3,1,French,Initiation à l'histoire globale,Cours d'intérêt général axé sur les migrations...,['Volet : Cours magistral']
HIS 1511,3,1,French,Le monde au XXe siècle depuis 1945,Cours d'intérêt général portant sur les questi...,['Volet : Cours magistral']
HIS 1520,3,1,French,Qu'est-ce que l'Europe ? (16e-21e siècle),Étude sur le long terme de la nature changeant...,['Volet : Cours magistral']


In [142]:
# Attach the catalogue entry of each course to every section/day row.
# reset_index(1) moves the section `code` level into a column, leaving the course
# code as the only index level so it can be matched against the index of
# `course_descriptions`. pd.merge defaults to an inner join, so a course missing
# from either side is dropped.
# NOTE(review): the saved output below is empty, and the description table shown
# above lists HIS codes while the sections are POL — confirm the CSV contents.
described_courses_by_section_by_day = pd.merge(courses_by_section_by_day.reset_index(1), course_descriptions, left_index=True, right_index=True)

# Rebuild the two-level index. This relies on reset_index() producing a column
# literally called 'index', i.e. on the merged index being unnamed.
described_courses_by_section_by_day = described_courses_by_section_by_day.reset_index().set_index(['index', 'code'])
described_courses_by_section_by_day.index.names = ['course', 'section']

described_courses_by_section_by_day

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end,credits,year,language,title,description,extraDetails
course,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [61]:
## HIS fall 2019
# Show the section/day rows for a hand-picked list of course codes. Inside
# DataFrame.query, `course == [...]` keeps rows whose `course` index level matches
# any value in the list.
described_courses_by_section_by_day.query('course == ["HIS 1110", "HIS 3120", "HIS 3124", "HIS 3150", "HIS 4100", "HIS 4192", "HIS 4380", "HIS 4397"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end,credits,year,language,title,description,extraDetails
course,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
HIS 1110,A00,FullSess.,True,LEC,Eric Allina,Mo,11:30,12:50,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1110,A00,FullSess.,True,LEC,Meredith Terretta,Th,13:00,14:20,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 3120,A00,FullSess.,True,LEC,Staff,Th,08:30,11:20,129 Louis Pasteur (LPR) 285,2019-09-04,2019-12-03,3,3,English,Selected Topics in the History of Canada's Reg...,,"['Course Component: Lecture', 'Prerequisite: 6..."
HIS 3124,A00,FullSess.,True,LEC,Staff,Mo,10:00,11:20,60 University (SMD) 227,2019-09-04,2019-12-03,3,3,English,Britain from 1485 to 1800,The British Isles from the advent of the Tudor...,"['Course Component: Lecture, Lecture', 'Prereq..."
HIS 3124,A00,FullSess.,True,LEC,Staff,We,08:30,09:50,65 University (MRT) 251,2019-09-04,2019-12-03,3,3,English,Britain from 1485 to 1800,The British Isles from the advent of the Tudor...,"['Course Component: Lecture, Lecture', 'Prereq..."
HIS 3150,A00,FullSess.,True,LEC,Staff,Tu,13:00,14:20,65 University (MRT) 252,2019-09-04,2019-12-03,3,3,English,Selected Topics in American History,,"['Course Component: Lecture, Lecture', 'Prereq..."
HIS 3150,A00,FullSess.,True,LEC,Staff,Th,11:30,12:50,65 University (MRT) 251,2019-09-04,2019-12-03,3,3,English,Selected Topics in American History,,"['Course Component: Lecture, Lecture', 'Prereq..."
HIS 4100,A00,FullSess.,True,SEM,Staff,Tu,08:30,11:20,120 University (FSS) 9003,2019-09-04,2019-12-03,3,4,English,Seminar in History Across Borders,"In this seminar, students will investigate com...","['Course Component: Seminar', 'Prerequisite: 8..."
HIS 4100,B00,FullSess.,True,SEM,Staff,Th,14:30,17:20,55 Laurier (DMS) 8161,2019-09-04,2019-12-03,3,4,English,Seminar in History Across Borders,"In this seminar, students will investigate com...","['Course Component: Seminar', 'Prerequisite: 8..."
HIS 4192,A00,FullSess.,True,SEM,Staff,We,08:30,11:20,120 University (FSS) 11003,2019-09-04,2019-12-03,3,4,English,Seminar in the History of the Middle East and ...,,"['Course Component: Seminar, Seminar', 'Prereq..."


In [None]:
## HIS winter 2019
# Same lookup as the fall cell, with a different list of course codes.
# NOTE(review): the trailing "" in the list looks like an unfinished placeholder
# for one more course code, and this cell has no execution count — confirm the
# intended list before relying on it.
described_courses_by_section_by_day.query('course == ["HIS 3125", "HIS 3190", "HIS 4360", "HIS 4364", "HIS 4135", ""]')