We're interested in the instruction types associated with each class as well as the range of enrollment.

When loading into the database, make sure to check whether a given element `is None`.

Note that departments and concentrations are distinct.

In [1]:
import pandas as pd
import xml.etree.ElementTree as et

In [36]:
# Hand-maintained overrides mapping a course-group code to a display name.
# The lookup loop later in this notebook consults this dict before the
# departments.csv-derived mapping, so an entry here wins when both exist.
custom_map = {
    'AESTHINT': 'Aesthetic and Interpretive Understanding',
    'AMSTDIES': 'American Studies',
    'BIOLSCI': 'Biological Sciences',
    'CULTBLF': 'Culture and Belief',
    'EMREAS': 'Empirical and Mathematical Reasoning',
    'ETHRSON': 'Ethical Reasoning',
    'FAS': 'Faculty of Arts and Sciences',
    'HIND-URD': 'South Asian Studies',
    'HLS': 'Harvard Law School',
    'IMMUN': 'Medical Sciences',
    'MODMDEST': 'Near Eastern Languages & Civilizations',
    'SCILIVSY': 'Science of Living Systems',
    'SCIPHUNV': 'Science of the Physical Universe',
    'SOCWORLD': 'Societies of the World',
    'US-WORLD': 'United States in the World',
    'SYSBIO': 'Systems Biology',
    'UKRA': 'Slavic Languages & Literatures',
    'XBRN': 'Romance Languages & Literature',
    'XBUS': 'Cross-Reg HBS',
    'XFLT': 'South Asian Studies',
    'XMIT': 'Cross-Reg MIT',
    'XLAW': 'Cross-Reg HLS',
    # NOTE(review): the TIME-* codes have no known department -- confirm.
    'TIME-A': 'Unknown',
    'TIME-C': 'Unknown',
    'TIME-R': 'Unknown',
    'TIME-T': 'Unknown',
    'RSEA': 'Regional Studies',
    'RSRA': 'Regional Studies',
    # Spelling fixed: was 'Regenrative'.
    'SCRB': 'Stem Cell & Regenerative Biology',
    # Was 'Medical Science'; aligned with the other 'Medical Sciences'
    # entries so the same department is not stored under two names.
    'SHBT': 'Medical Sciences',
    'BBS': 'Medical Sciences',
    'BCMP': 'Medical Sciences',
    'BCS': 'Slavic Languages & Literatures',
    'BPH': 'Bio Science in Public Health',
    'CLS-STDY': 'Classics',
    'DRB': 'Medical Sciences',
    'HBTM': 'Medical Sciences',
    'HSEMR-LE': 'House Seminar',
    'HSEMR-WI': 'House Seminar',
    'MCB': 'Molecular & Cellular Biology',
    'OEB': 'Organismic & Evolutionary Biology',
    'SLAVIC': 'Slavic Languages & Literatures',
}

In [4]:
# Build a lookup from course-group code to course-group name using the
# (course_group_code, course_group) pairs present in departments.csv.
catalog_number_to_name = {}
# Let pandas open (and close) the file itself; handing it an open() handle
# left the file descriptor dangling for the life of the kernel.
dpts = pd.read_csv('departments.csv', encoding='latin')
# The keys of .groups are the distinct (code, name) tuples; the row indices
# stored as values are not needed here.
mappings = dpts.groupby(['course_group_code', 'course_group']).groups
for cat_num, name in mappings:
    catalog_number_to_name[cat_num] = name

In [5]:
def extract_courses(path_to_xml):
    """Parse a registrar XML export into a list of per-course dicts.

    Every ``course`` element directly under the document root (in the
    ``http://icommons.harvard.edu/Schema`` namespace) yields one dict whose
    values are read from the children of its ``catalog_info`` element.

    Parameters
    ----------
    path_to_xml : str
        Path of the XML file to read.

    Returns
    -------
    list of dict
        One dict per course with the keys ``harvard_id``, ``name_short``,
        ``name_long``, ``description``, ``prereq``, ``notes``,
        ``department_short``, ``enrollment`` and ``type``. A value is the
        element's text, or ``None`` when the element (or ``catalog_info``
        itself) is missing or has no text.
    """
    namespace = '{http://icommons.harvard.edu/Schema}'

    # Output key -> tag name of the <catalog_info> child it is read from.
    field_tags = (
        ('harvard_id', 'course_code_display'),
        ('name_short', 'short_title'),
        ('name_long', 'sub_title'),
        ('description', 'description'),
        ('prereq', 'prereq'),
        ('notes', 'notes'),
        ('department_short', 'course_group_short_name'),
        ('enrollment', 'enrollment_limit'),
        ('type', 'course_type'),
    )

    def text_of(parent, tag):
        # Element.find returns None for an absent child; guard against it
        # instead of raising AttributeError on `.text`.
        if parent is None:
            return None
        node = parent.find(namespace + tag)
        return None if node is None else node.text

    # Close the file as soon as parsing is done.
    with open(path_to_xml) as xml_file:
        root = et.parse(xml_file).getroot()

    courses = []
    for course in root.findall(namespace + 'course'):
        catalog_info = course.find(namespace + 'catalog_info')
        courses.append({key: text_of(catalog_info, tag) for key, tag in field_tags})

    return courses

In [6]:
# Parse 'fall.xml' into a list of per-course dicts (presumably the fall-term
# registrar export, going by the file and variable names).
fall_registrar = extract_courses('fall.xml')

In [7]:
# Parse 'spring.xml' into a list of per-course dicts (presumably the
# spring-term registrar export, going by the file and variable names).
spring_registrar = extract_courses('spring.xml')

In [9]:
# Stack the fall and spring course records into one DataFrame; each term's
# original row labels are kept, so the index is not unique.
term_frames = [pd.DataFrame(records) for records in (fall_registrar, spring_registrar)]
courses = pd.concat(term_frames)

In [10]:
# Keep the descriptive catalog columns, collapse repeated listings of the
# same course, and index the result by harvard_id.
# De-duplicate BEFORE moving harvard_id into the index: drop_duplicates
# ignores the index, so running it afterwards would also merge *distinct*
# courses whose remaining columns happen to be identical.
course_db = (
    courses[['department_short', 'description', 'harvard_id', 'name_long', 'name_short', 'prereq']]
    .drop_duplicates()
    .set_index('harvard_id')
    .sort_index()
)

In [11]:
# Preview the first rows only instead of rendering the entire catalog.
course_db.head(10)

Unnamed: 0_level_0,department_short,description,name_long,name_short,prereq
harvard_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
107340,FRSEMR,John Steinbeck wrote: &quot;It is a common exp...,Dreams: Our Mind by Night,FRSEMR 26F,Course open to Freshman Students Only
107341,US-WORLD,Events such as Teach for America's 20th annive...,Dilemmas of Equity and Excellence in American ...,US-WORLD 35,
107349,MODMDEST,An introduction to Middle Eastern Studies focu...,Introduction to the Modern Middle East,MODMDEST 100,
107367,PHYSCI,An introduction to the fundamental theories of...,Quantum and Statistical Foundations of Chemistry,PHYSCI 10,Recommended: A strong background in chemistry ...
107368,PHYSCI,The Physical Sciences hold the key to solving ...,Foundations and Frontiers of Modern Chemistry:...,PHYSCI 11,Recommended: A strong background in chemistry....
107375,HIND-URD,Instruction in Hindi-Urdu in topics not covere...,Hindi-Urdu Supervised Readings,HIND-URD 91R,
107378,SAS,Individualized study of a South Asian language...,South Asian Language Tutorials,SAS 100R,
107379,SAS,Supervised reading leading to a long term pape...,Supervised Reading and Research,SAS 91R,
107380,SAS,,Tutorial - Junior Year,SAS 98R,
107381,SAS,Students must complete both terms of this cour...,Tutorial - Senior Year,SAS 99RA,


In [38]:
# Sanity check: every department code used by a course must resolve to a
# display name, with the hand-written custom_map taking precedence over the
# departments.csv-derived catalog_number_to_name.
to_map = sorted(course_db['department_short'].unique())
for code in to_map:
    if code in custom_map:
        print(custom_map[code])
    elif code in catalog_number_to_name:
        print(catalog_number_to_name[code])
    else:
        # Raise explicitly (a bare assert is stripped under `python -O`) and
        # name the offending code so the missing mapping can be added.
        raise AssertionError('no department name mapping for {!r}'.format(code))

Aesthetic and Interpretive Understanding
African and African American Studies
Akkadian
American Studies
Ancient Near East
Anthropology
Applied Computation
Applied Mathematics
Applied Physics
Arabic
Aramaic
Astronomy
Medical Sciences
Medical Sciences
Slavic Languages & Literatures
Biomedical Engineering
Biological Sciences
Biophysics
Biostatistics
Bio Science in Public Health
Biological Sciences in Dental Medicine
Catalan
Cell Biology
Celtic
Chemistry
Chemical Biology
Chinese
Chinese History
Chinese Literature
Classical Hebrew
Classical Archaeology
Classical Philology
Classics
Classics
Comparative Literature
Computer Science
Chemical and Physical Biology
Culture and Belief
Czech
Design
Medical Sciences
Earth and Planetary Sciences
East Asian Buddhist Studies
East Asian Film and Media Studies
East Asian Studies
Economics
Egyptian
Ethnicity, Migration, Rights
Empirical and Mathematical Reasoning
Engineering Sciences
English
Environmental Science and Public Policy
Ethical Reasoning
Exposit