In [1]:
import re
import pandas as pd

In [35]:
###
# Get the courses from a file, into a dict.
###

courses = {}

with open('./pages/HIS-2019-fall.txt', 'r') as course_file:
    # Read course file into one big string
    data = course_file.read()
    
    # Break down to courses
    data = data.split('Collapse section ')
    
    # Remove erroneous space items
    del data[0]
    
    # Key the courses into the `courses` object
    for course in data:
        courses[course[0:8]] = course

In [36]:
def handle_section(section):
    sectionCode = section[0].split('-')[0]
    
    sectionData = {
        "admin": {},
        "days": []
    }
    
    ## Handling admin stuff
    sectionData["admin"]["code"] = sectionCode
    sectionData["admin"]["type"] = section[0].split('-')[1].split('\n')[0]
    sectionData["admin"]["duration"] = section[0].split('-')[1].split('\n')[1]
    sectionData["admin"]["isOpen"] = 'Open' in section[5]
    
    ## Handling days
    
    ### Helper function to extract datetime details
    def handle_section_datetime(datetime):
        return {
            "weekday": datetime[:2],
            "time.start": datetime[3:].split(' - ')[0],
            "time.end": datetime[3:].split(' - ')[1]
        }

    ### Extract the relevant details in a very hacky way :)
    def mini_day_extractor(sectionToHandle, index):
        return {
            **handle_section_datetime(sectionToHandle[1].split('\n')[index]),
            "location": sectionToHandle[2].split('\n')[index],
            "prof": sectionToHandle[3].split('\n')[index],
            "date.start": sectionToHandle[4].split('\n')[index].split(' - ')[0],
            "date.end": sectionToHandle[4].split('\n')[index].split(' - ')[1]
        }
    
    ### Given the number of days (counted by the number of `\n` in the datetime string...), extract them
    for index in range(0, section[1].count('\n') + 1):
        sectionData["days"].append(mini_day_extractor(section, index))
            
    return {sectionCode: sectionData}

In [37]:
###
# Get the sections from each course, adding as a list to the dict item.
###

for courseCode, courseStr in courses.items():
    sections = {}
    
    if ' \n\t\t\n\t\n\t\t\n\t\n\t\t\t\n\t\n  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n' in courseStr:
        sectionContainer = courseStr.split(' \n\t\t\n\t\n\t\t\n\t\n\t\t\t\n\t\n  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n')[1]
    else:
        sectionContainer = courseStr.split(' \n\t\t\n\t\n\t\t\n\t\n\t\t\n\t\n  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n')[1]
    
    sectionData = sectionContainer.split('\n\t\n\t\t\n\t\n  \tSection \tDays & Times \tRoom \tInstructor \tMeeting Dates \tStatus\nDetails\n\t\n')
    
    for section in sectionData:
        sectionInfo = handle_section(section.split('\n\t\n'))
        
        sections.update(sectionInfo)
        
    courses[courseCode] = sections
    
## (sometimes this errors out with "list index out of range". it's all hacky anyway. power through!)

In [38]:
## Clean badly processed courses :)
coursesToDelete = {}

for courseCode, courseObj in courses.items():
    if isinstance(courseObj, str):
        coursesToDelete[courseCode] = courseObj
        
for course in coursesToDelete:
    courses.pop(course, None)
        
print(str(len(coursesToDelete)) + " courses removed due to badly formed data.")

0 courses removed due to badly formed data.


In [39]:
courses

{'HIS 1101': {'A00': {'admin': {'code': 'A00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '161 Louis Pasteur (CBY) C03',
     'prof': 'Damien-Claude Bélanger',
     'time.end': '14:20',
     'time.start': '13:00',
     'weekday': 'We'},
    {'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '550 Cumberland (TBT) 333',
     'prof': 'Damien-Claude Bélanger',
     'time.end': '12:50',
     'time.start': '11:30',
     'weekday': 'Fr'}]}},
 'HIS 1110': {'A00': {'admin': {'code': 'A00',
    'duration': 'FullSess.',
    'isOpen': True,
    'type': 'LEC'},
   'days': [{'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '125 University (MNT) 202',
     'prof': 'Eric Allina',
     'time.end': '12:50',
     'time.start': '11:30',
     'weekday': 'Mo'},
    {'date.end': '2019-12-03',
     'date.start': '2019-09-04',
     'location': '

In [40]:
## Convert the courses object to a dataframe
df = pd.DataFrame.from_dict(courses, orient='index')
df = pd.DataFrame(df.stack())

df

Unnamed: 0,Unnamed: 1,0
HIS 1101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 1110,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 1111,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."
HIS 1111,WB00,"{'admin': {'code': 'WB00', 'type': 'LEC', 'dur..."
HIS 1500,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 1501,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 1520,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 2100,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."
HIS 2100,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura..."
HIS 2101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura..."


In [41]:
## Unpack the `data` column
df = df.reset_index()
df.columns = ['course', 'code', 'data']
df

df2 = df.join(pd.io.json.json_normalize(df['data']))
df2 = df2.set_index(['course', 'code'])

df2

Unnamed: 0_level_0,Unnamed: 1_level_0,data,admin.code,admin.duration,admin.isOpen,admin.type,days
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HIS 1101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'We', 'time.start': '13:00', 'tim..."
HIS 1110,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '11:30', 'tim..."
HIS 1111,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '11:30', 'tim..."
HIS 1111,WB00,"{'admin': {'code': 'WB00', 'type': 'LEC', 'dur...",WB00,FullSess.,True,LEC,"[{'weekday': 'TB', 'time.start': ' 00:00', 'ti..."
HIS 1500,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '13:00', 'tim..."
HIS 1501,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '13:00', 'tim..."
HIS 1520,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '11:30', 'tim..."
HIS 2100,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '11:30', 'tim..."
HIS 2100,B00,"{'admin': {'code': 'B00', 'type': 'LEC', 'dura...",B00,FullSess.,True,LEC,"[{'weekday': 'Mo', 'time.start': '08:30', 'tim..."
HIS 2101,A00,"{'admin': {'code': 'A00', 'type': 'LEC', 'dura...",A00,FullSess.,True,LEC,"[{'weekday': 'Tu', 'time.start': '11:30', 'tim..."


In [42]:
## Unpack the `days` column

### Convert the days column from an object to a string
import ast

days_as_string = df2.astype({'days': str}).reset_index()['days'].apply(ast.literal_eval)
days_as_string

### Unpack the column. The column contains lists of objects with consistent keys, so each object becomes its own row
days_by_section = pd.concat([pd.DataFrame(x) for x in days_as_string], keys=days_as_string.index)
days_by_section

### Join the expanded rows with their original courses, dropping the now-unused columns
df3 = df2.reset_index().join(days_by_section.reset_index(1, drop=True))
df3 = df3.set_index(['course', 'code']).drop('data', 1).drop('days', 1)

df3

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.code,admin.duration,admin.isOpen,admin.type,date.end,date.start,location,prof,time.end,time.start,weekday
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HIS 1101,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,161 Louis Pasteur (CBY) C03,Damien-Claude Bélanger,14:20,13:00,We
HIS 1101,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,550 Cumberland (TBT) 333,Damien-Claude Bélanger,12:50,11:30,Fr
HIS 1110,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,125 University (MNT) 202,Eric Allina,12:50,11:30,Mo
HIS 1110,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,125 University (MNT) 202,Meredith Terretta,14:20,13:00,Th
HIS 1111,B00,B00,FullSess.,True,LEC,2019-12-03,2019-09-04,591 Cumberland (SCR) 002,Staff,12:50,11:30,Tu
HIS 1111,B00,B00,FullSess.,True,LEC,2019-12-03,2019-09-04,800 King Edward (STE) G0103,Staff,14:20,13:00,Fr
HIS 1111,WB00,WB00,FullSess.,True,LEC,2019-12-03,2019-09-04,,Staff,00:00,00:00,TB
HIS 1500,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,550 Cumberland (TBT) 0021,Kouky Fianu,14:20,13:00,Mo
HIS 1500,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,550 Cumberland (TBT) 317,Kouky Fianu,12:50,11:30,We
HIS 1501,A00,A00,FullSess.,True,LEC,2019-12-03,2019-09-04,60 University (SMD) 224,Jean-François Lozier,14:20,13:00,Tu


In [43]:
## Reorganize columns
courses_by_section_by_day = df3[[
    'admin.duration',
    'admin.isOpen',
    'admin.type',
    'prof',
    'weekday',
    'time.start',
    'time.end',
    'location',
    'date.start',
    'date.end'
]]
courses_by_section_by_day

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
HIS 1101,A00,FullSess.,True,LEC,Damien-Claude Bélanger,We,13:00,14:20,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03
HIS 1101,A00,FullSess.,True,LEC,Damien-Claude Bélanger,Fr,11:30,12:50,550 Cumberland (TBT) 333,2019-09-04,2019-12-03
HIS 1110,A00,FullSess.,True,LEC,Eric Allina,Mo,11:30,12:50,125 University (MNT) 202,2019-09-04,2019-12-03
HIS 1110,A00,FullSess.,True,LEC,Meredith Terretta,Th,13:00,14:20,125 University (MNT) 202,2019-09-04,2019-12-03
HIS 1111,B00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,591 Cumberland (SCR) 002,2019-09-04,2019-12-03
HIS 1111,B00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,800 King Edward (STE) G0103,2019-09-04,2019-12-03
HIS 1111,WB00,FullSess.,True,LEC,Staff,TB,00:00,00:00,,2019-09-04,2019-12-03
HIS 1500,A00,FullSess.,True,LEC,Kouky Fianu,Mo,13:00,14:20,550 Cumberland (TBT) 0021,2019-09-04,2019-12-03
HIS 1500,A00,FullSess.,True,LEC,Kouky Fianu,We,11:30,12:50,550 Cumberland (TBT) 317,2019-09-04,2019-12-03
HIS 1501,A00,FullSess.,True,LEC,Jean-François Lozier,Tu,13:00,14:20,60 University (SMD) 224,2019-09-04,2019-12-03


In [44]:
## Find all courses taught by “Staff”
courses_by_section_by_day[courses_by_section_by_day['prof'].str.contains('Staff')]

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end
course,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
HIS 1111,B00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,591 Cumberland (SCR) 002,2019-09-04,2019-12-03
HIS 1111,B00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,800 King Edward (STE) G0103,2019-09-04,2019-12-03
HIS 1111,WB00,FullSess.,True,LEC,Staff,TB,00:00,00:00,,2019-09-04,2019-12-03
HIS 2100,A00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,60 University (SMD) 428,2019-09-04,2019-12-03
HIS 2100,A00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,145 Jean-Jacq. Luss. (LMX) 242,2019-09-04,2019-12-03
HIS 2100,B00,FullSess.,True,LEC,Staff,Mo,08:30,09:50,140 Louis-Pasteur (MRN) 130,2019-09-04,2019-12-03
HIS 2100,B00,FullSess.,True,LEC,Staff,Th,10:00,11:20,200 Wilbrod (WLD) 108,2019-09-04,2019-12-03
HIS 2101,A00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,800 King Edward (STE) H0104,2019-09-04,2019-12-03
HIS 2101,A00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,800 King Edward (STE) H0104,2019-09-04,2019-12-03
HIS 2103,A00,FullSess.,True,LEC,Staff,We,13:00,14:20,70 Laurier (MHN) 033,2019-09-04,2019-12-03


In [46]:
course_descriptions = pd.read_csv('data/courses/HIS.csv').set_index('code')

course_descriptions

Unnamed: 0_level_0,credits,year,language,title,description,extraDetails
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HIS 1100,3,1,English,History Labs,"Using a hand-on approach, based on primary sou...","['Course Component: Lecture, Lecture']"
HIS 1101,3,1,English,The Making of Canada,"Survey of the political, social and cultural e...",['Course Component: Lecture']
HIS 1110,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1111,3,1,English,The Twentieth-Century World from 1945,A course of general interest which focuses on ...,['Course Component: Lecture']
HIS 1120,3,1,English,What Is Europe? (16th-21th Century),Long term study of the changing nature of Euro...,['Course Component: Lecture']
HIS 1500,3,1,French,Laboratoires d'histoire,"Par son orientation pratique, fondée sur l'uti...","['Volet : Cours magistral, Cours magistral']"
HIS 1501,3,1,French,La formation du Canada,"Survol de l'évolution politique, sociale et cu...",['Volet : Cours magistral']
HIS 1510,3,1,French,Initiation à l'histoire globale,Cours d'intérêt général axé sur les migrations...,['Volet : Cours magistral']
HIS 1511,3,1,French,Le monde au XXe siècle depuis 1945,Cours d'intérêt général portant sur les questi...,['Volet : Cours magistral']
HIS 1520,3,1,French,Qu'est-ce que l'Europe ? (16e-21e siècle),Étude sur le long terme de la nature changeant...,['Volet : Cours magistral']


In [47]:
described_courses_by_section_by_day = pd.merge(courses_by_section_by_day.reset_index(1), course_descriptions, left_index=True, right_index=True)

described_courses_by_section_by_day = described_courses_by_section_by_day.reset_index().set_index(['index', 'code'])
described_courses_by_section_by_day.index.names = ['course', 'section']

described_courses_by_section_by_day

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end,credits,year,language,title,description,extraDetails
course,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
HIS 1101,A00,FullSess.,True,LEC,Damien-Claude Bélanger,We,13:00,14:20,161 Louis Pasteur (CBY) C03,2019-09-04,2019-12-03,3,1,English,The Making of Canada,"Survey of the political, social and cultural e...",['Course Component: Lecture']
HIS 1101,A00,FullSess.,True,LEC,Damien-Claude Bélanger,Fr,11:30,12:50,550 Cumberland (TBT) 333,2019-09-04,2019-12-03,3,1,English,The Making of Canada,"Survey of the political, social and cultural e...",['Course Component: Lecture']
HIS 1110,A00,FullSess.,True,LEC,Eric Allina,Mo,11:30,12:50,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1110,A00,FullSess.,True,LEC,Meredith Terretta,Th,13:00,14:20,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1111,B00,FullSess.,True,LEC,Staff,Tu,11:30,12:50,591 Cumberland (SCR) 002,2019-09-04,2019-12-03,3,1,English,The Twentieth-Century World from 1945,A course of general interest which focuses on ...,['Course Component: Lecture']
HIS 1111,B00,FullSess.,True,LEC,Staff,Fr,13:00,14:20,800 King Edward (STE) G0103,2019-09-04,2019-12-03,3,1,English,The Twentieth-Century World from 1945,A course of general interest which focuses on ...,['Course Component: Lecture']
HIS 1111,WB00,FullSess.,True,LEC,Staff,TB,00:00,00:00,,2019-09-04,2019-12-03,3,1,English,The Twentieth-Century World from 1945,A course of general interest which focuses on ...,['Course Component: Lecture']
HIS 1500,A00,FullSess.,True,LEC,Kouky Fianu,Mo,13:00,14:20,550 Cumberland (TBT) 0021,2019-09-04,2019-12-03,3,1,French,Laboratoires d'histoire,"Par son orientation pratique, fondée sur l'uti...","['Volet : Cours magistral, Cours magistral']"
HIS 1500,A00,FullSess.,True,LEC,Kouky Fianu,We,11:30,12:50,550 Cumberland (TBT) 317,2019-09-04,2019-12-03,3,1,French,Laboratoires d'histoire,"Par son orientation pratique, fondée sur l'uti...","['Volet : Cours magistral, Cours magistral']"
HIS 1501,A00,FullSess.,True,LEC,Jean-François Lozier,Tu,13:00,14:20,60 University (SMD) 224,2019-09-04,2019-12-03,3,1,French,La formation du Canada,"Survol de l'évolution politique, sociale et cu...",['Volet : Cours magistral']


In [54]:
described_courses_by_section_by_day.query('course == "HIS 1110"')

Unnamed: 0_level_0,Unnamed: 1_level_0,admin.duration,admin.isOpen,admin.type,prof,weekday,time.start,time.end,location,date.start,date.end,credits,year,language,title,description,extraDetails
course,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
HIS 1110,A00,FullSess.,True,LEC,Eric Allina,Mo,11:30,12:50,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
HIS 1110,A00,FullSess.,True,LEC,Meredith Terretta,Th,13:00,14:20,125 University (MNT) 202,2019-09-04,2019-12-03,3,1,English,Introduction to Global History,A course of general interest focused on human ...,['Course Component: Lecture']
