We're interested in the instruction types associated with each class as well as the range of enrollment.

When loading into the DB, make sure to check whether a given element `is None`.

Note that departments and concentrations are distinct. There's no need to link them, so concentrations can be stored purely as a UI element.

In [39]:
import pandas as pd
import xml.etree.ElementTree as et

In [51]:
# Concentration names, kept as a flat list of display strings.
# Per the notes at the top of this notebook, concentrations are distinct from
# departments and are not linked to them.
concentrations = [  # Copied from https://college.harvard.edu/academics/fields-study/concentrations
    'African and African American Studies',
    'Anthropology',
    'Applied Mathematics',
    'Astrophysics',
    'Biomedical Engineering',
    'Chemical and Physical Biology',
    'Chemistry',
    'Chemistry and Physics',
    'Classics',
    'Comparative Literature',
    'Computer Science',
    'Earth and Planetary Sciences',
    'East Asian Studies',
    'Economics',
    'Electrical Engineering',
    'Engineering Sciences',
    'English',
    'Environmental Science and Public Policy',
    'Folklore and Mythology',
    'Germanic Languages and Literatures',
    'Government',
    'History',
    'History and Literature',
    'History and Science',
    'History of Art and Architecture',
    'Human Developmental and Regenerative Biology',
    'Human Evolutionary Biology',
    'Integrative Biology',
    'Linguistics',
    'Mathematics',
    'Mechanical Engineering',
    'Molecular and Cellular Biology',
    'Music',
    'Near Eastern Languages and Civilizations',
    'Neurobiology',
    'Philosophy',
    'Physics',
    'Psychology',
    'Religion, Comparative Study of',
    'Romance Languages and Literatures',
    'Slavic Languages and Literatures',
    'Social Studies',
    'Sociology',
    'South Asian Studies',
    'Special Concentrations',
    'Statistics',
    'Theater, Dance, and Media',
    'Visual and Environmental Studies',
    'Women, Gender, and Sexuality, Studies of',
]

In [58]:
# Hand-maintained fallback from a course's department short code to a
# human-readable department name. It is consulted alongside the mapping loaded
# from departments.csv when checking that every course has a readable
# department.
#
# Fixed display strings:
#   * 'SCRB': "Regenrative" -> "Regenerative" (typo).
#   * 'XBRN': "Literature" -> "Literatures", matching the Slavic entries here
#     and the "Romance Languages and Literatures" concentration name.
custom_map = {
    'AESTHINT': 'Aesthetic and Interpretive Understanding',
    'AMSTDIES': 'American Studies',
    'BIOLSCI': 'Biological Sciences',
    'CULTBLF': 'Culture and Belief',
    'EMREAS': 'Empirical and Mathematical Reasoning',
    'ETHRSON': 'Ethical Reasoning',
    'FAS': 'Faculty of Arts and Sciences',
    'HIND-URD': 'South Asian Studies',
    'HLS': 'Harvard Law School',
    'IMMUN': 'Medical Sciences',
    'MODMDEST': 'Near Eastern Languages & Civilizations',
    'SCILIVSY': 'Science of Living Systems',
    'SCIPHUNV': 'Science of the Physical Universe',
    'SOCWORLD': 'Societies of the World',
    'US-WORLD': 'United States in the World',
    'SYSBIO': 'Systems Biology',
    'UKRA': 'Slavic Languages & Literatures',
    'XBRN': 'Romance Languages & Literatures',
    'XBUS': 'Cross-Reg HBS',
    'XFLT': 'South Asian Studies',
    'XMIT': 'Cross-Reg MIT',
    'XLAW': 'Cross-Reg HLS',
    # No readable name is known for the TIME-* codes.
    'TIME-A': 'Unknown',
    'TIME-C': 'Unknown',
    'TIME-R': 'Unknown',
    'TIME-T': 'Unknown',
    'RSEA': 'Regional Studies',
    'RSRA': 'Regional Studies',
    'SCRB': 'Stem Cell & Regenerative Biology',
    'SHBT': 'Medical Sciences',
    'BBS': 'Medical Sciences',
    'BCMP': 'Medical Sciences',
    'BCS': 'Slavic Languages & Literatures',
    'BPH': 'Bio Science in Public Health',
    'CLS-STDY': 'Classics',
    'DRB': 'Medical Sciences',
    'HBTM': 'Medical Sciences',
    'HSEMR-LE': 'House Seminar',
    'HSEMR-WI': 'House Seminar',
    'MCB': 'Molecular & Cellular Biology',
    'OEB': 'Organismic & Evolutionary Biology',
    'SLAVIC': 'Slavic Languages & Literatures',
}

In [42]:
# Build a lookup from course-group code to course-group name using the
# (course_group_code, course_group) pairs found in departments.csv.
catalog_number_to_name = {}
# Read through a context manager so the file handle is closed as soon as the
# CSV has been loaded (previously the handle was opened inline and never closed).
with open('departments.csv', encoding='latin') as departments_file:
    dpts = pd.read_csv(departments_file)
# The keys of `.groups` are the distinct (code, name) tuples.
mappings = dpts.groupby(['course_group_code', 'course_group']).groups
for cat_num, name in mappings:
    # If a code appears with more than one name, the pair iterated last wins.
    catalog_number_to_name[cat_num] = name

In [43]:
def extract_courses(path_to_xml):
    """Parse a course-catalog XML file into a list of per-course dicts.

    Every ``course`` element directly under the document root contributes one
    dict, built from the text of selected children of its ``catalog_info``
    element. All tags live in the ``http://icommons.harvard.edu/Schema``
    namespace.

    Args:
        path_to_xml: Path of the XML file to read.

    Returns:
        A list of dicts with the keys ``harvard_id``, ``name_short``,
        ``name_long``, ``description``, ``prereq``, ``notes``,
        ``department_short``, ``enrollment`` and ``type``. A value is the
        element's text, or ``None`` when the element is empty or absent.
    """
    namespace = '{http://icommons.harvard.edu/Schema}'

    # (output key, child tag of <catalog_info>) in output order.
    field_tags = (
        ('harvard_id', 'course_code_display'),
        ('name_short', 'short_title'),
        ('name_long', 'sub_title'),
        ('description', 'description'),
        ('prereq', 'prereq'),
        ('notes', 'notes'),
        ('department_short', 'course_group_short_name'),
        ('enrollment', 'enrollment_limit'),
        ('type', 'course_type'),
    )

    # Hand the path to ElementTree instead of a text-mode file object: it then
    # opens the file itself, decodes using the encoding named in the XML
    # declaration (not the locale default), and closes the handle afterwards.
    root = et.parse(path_to_xml).getroot()

    courses = []
    for course in root.findall(namespace + 'course'):
        catalog_info = course.find(namespace + 'catalog_info')

        record = {}
        for key, tag in field_tags:
            element = catalog_info.find(namespace + tag)
            # A missing element is reported the same way as an empty one.
            record[key] = None if element is None else element.text
        courses.append(record)

    return courses

In [44]:
# Parse 'fall.xml' into a list of per-course dicts.
fall_registrar = extract_courses('fall.xml')

In [45]:
# Parse 'spring.xml' into a list of per-course dicts.
spring_registrar = extract_courses('spring.xml')

In [46]:
# Stack the fall and spring records into one frame (fall rows first); each
# term keeps its own row labels.
term_frames = [pd.DataFrame(records) for records in (fall_registrar, spring_registrar)]
courses = pd.concat(term_frames)

In [47]:
# Catalog table keyed by harvard_id, one row per distinct course record.
# De-duplicate BEFORE moving harvard_id into the index: drop_duplicates()
# ignores the index, so doing it afterwards would merge different courses whose
# remaining columns happen to be identical.
course_db = (
    courses[['department_short', 'description', 'harvard_id', 'name_long', 'name_short', 'prereq']]
    .drop_duplicates()
    .set_index('harvard_id')
    .sort_index()
)

In [50]:
# Make sure that we have a human readable department for every course
to_map = sorted(course_db['department_short'].unique())
for catalog_number in to_map:
    # Check the loop variable itself (the old code tested an unrelated name `i`).
    assert catalog_number in custom_map or catalog_number in catalog_number_to_name, \
        'No human readable department for %r' % (catalog_number,)

In [52]:
# Display the concentration names as a single-column DataFrame.
pd.DataFrame(concentrations)

Unnamed: 0,0
0,African and African American Studies
1,Anthropology
2,Applied Mathematics
3,Astrophysics
4,Biomedical Engineering
5,Chemical and Physical Biology
6,Chemistry
7,Chemistry and Physics
8,Classics
9,Comparative Literature


In [64]:
# Names loaded from the departments file, minus anything that is itself a code
# in either lookup (iterating a dict yields its keys).
departments_from_file = set(catalog_number_to_name.values()).difference(
    catalog_number_to_name,
    custom_map,
)
departments_from_file

{'Aesthetic and Interpretive Understanding',
 'African and African American Studies',
 'Akkadian',
 'American Civilization',
 'American Studies',
 'Ancient Near East',
 'Anthropology',
 'Applied Computation',
 'Applied Mathematics',
 'Applied Physics',
 'Arabic',
 'Aramaic',
 'Armenian',
 'Armenian Studies',
 'Astronomy',
 'Biological Sciences in Dental Medicine',
 'Biomedical Engineering',
 'Biophysics',
 'Biostatistics',
 'Business Studies',
 'Catalan',
 'Cell Biology',
 'Celtic',
 'Chemical Biology',
 'Chemical and Physical Biology',
 'Chemistry',
 'Chinese',
 'Chinese History',
 'Chinese Literature',
 'Classical Archaeology',
 'Classical Hebrew',
 'Classical Philology',
 'Classical Studies',
 'Classics',
 'Comparative Literature',
 'Computer Science',
 'Culture and Belief',
 'Czech',
 'Design',
 'Dramatic Arts',
 'Earth and Planetary Sciences',
 'East Asian Buddhist Studies',
 'East Asian Film and Media Studies',
 'East Asian Studies',
 'Economics',
 'Egyptian',
 'Empirical and Mat

In [65]:
# Show the distinct codes present in the lookup (a dict iterates over its keys).
set(catalog_number_to_name)

{nan,
 'HUMAN',
 'SALLC',
 'GHHP',
 'LING',
 'GOV',
 'TIME-C',
 'DESIGN',
 'TIBET',
 'COMPSCI',
 'CULTR&BLF',
 'FOLKMYTH',
 'PLSH',
 'MICROBI',
 'URD',
 'LATIN',
 'JAPNLIT',
 'SOC-WORLD',
 'EABS',
 'FRENCH',
 'HIST-LIT',
 'SPC-CONC',
 'IMUNOL',
 'PAL',
 'SCI-LIVSYS',
 'CHEMBIO',
 'SCI-PHYUNV',
 'GERM-STD',
 'MODMIDEAST',
 'EGYPTIAN',
 'FRSEMR',
 'BSDM',
 'TWI',
 'SPANSH',
 'IRANIAN',
 'COMPLIT',
 'ARAMAIC',
 'UKRAN',
 'SPOL',
 'CHNSE',
 'CPB',
 'HSEMR-WI',
 'VIROLOGY',
 'HLTHPOL',
 'CATALAN',
 'CLASPHIL',
 'SOC-STD',
 'SUMERIAN',
 'CHNSLIT',
 'HIST',
 'NEUROBIO',
 'TURKISH',
 'XPRN',
 'HUM-FRAMES',
 'HSEMR-PF',
 'SCRB',
 'PORTUG',
 'BIOSTAT',
 'KORLIT',
 'AM-CIV',
 'MODGRK',
 'KORHIST',
 'NEP',
 'LITER',
 'TIBHIST',
 'HEB',
 'ECON',
 'HUM-QSTNS',
 'CELTIC',
 'BCMP',
 'MONGOLN',
 'AESTH&INTP',
 'AFRAMER',
 'PHYSICS',
 'ARMENST',
 'IRISH',
 'MEDLATIN',
 'MEDGREEK',
 'NEC',
 'HBTM',
 'ANTHRO',
 'BE',
 'EAFM',
 'OEB',
 'TIME-A',
 'HSEMR-LE',
 'NEURO',
 'ESPP',
 'LPS',
 'SAS',
 'ANE',
 'SCA