# Data Collector

This notebook collects the per-division, per-tenure-status Excel files on faculty gender diversity and aggregates them into a single workbook.

In [38]:
import os
import pandas as pd

The following dictionary maps the two components of my custom filename convention (`<division>-<status>.xlsx`, e.g. `ss-3.xlsx`) to human-readable metadata.

In [118]:
# Lookup table for decoding '<division>-<status>.xlsx' filenames.
META = {
    # Division codes (first filename component).
    'hum': 'HUM',  # Humanities
    'ns': 'NS',    # Natural Sciences
    'ss': 'SS',    # Social Sciences
    # Tenure-status codes (second filename component).
    '1': 'Tenure ineligible',
    '2': 'Tenure eligible',
    '3': 'Tenured',
}

In [119]:
# Directory holding the source Excel workbooks. Keep the trailing slash:
# getDF builds the file path by plain string concatenation (DATA_PATH + fname).
DATA_PATH = '../data/excel/'

def getMetadata(fname):
    """Decode a data filename into its division and tenure-status labels.

    Filenames follow the ``<division>-<status>.<ext>`` convention, where both
    components are keys of ``META`` (e.g. ``'ss-3.xlsx'``).

    Args:
        fname: Bare filename without a directory, e.g. ``'hum-1.xlsx'``.

    Returns:
        A ``(division, status)`` tuple of the labels looked up in ``META``.

    Raises:
        ValueError: If the name without its extension does not split into
            exactly two parts on ``'-'``.
        KeyError: If either part is not a key of ``META``.
    """
    # splitext removes the extension as a unit. str.rstrip('.xlsx') would
    # instead strip any trailing '.', 'x', 'l' or 's' characters, which eats
    # into the stem whenever the status code ends in one of those letters.
    stem, _ext = os.path.splitext(fname)
    div, status = stem.split('-')
    return META[div], META[status]

def getDF(fname):
    """Load one source workbook and return it as a labelled DataFrame.

    The file is read from ``DATA_PATH + fname``. Its columns are renamed
    positionally (so the sheet is assumed to have exactly these seven columns
    in this order), the redundant second total column is dropped, and
    ``Year`` becomes the index.

    Args:
        fname: Bare filename inside ``DATA_PATH``, e.g. ``'ns-2.xlsx'``.

    Returns:
        The cleaned DataFrame, with three plain attributes set on it:
        ``division``, ``status`` and ``name`` (``'<division> - <status>'``).
    """
    column_labels = [
        'Year',
        '% Women',
        'Women',
        'Total',
        '% Men',
        'Men',
        'Total Duplicate',
    ]

    frame = pd.read_excel(DATA_PATH + fname)
    division, tenure_status = getMetadata(fname)

    frame.columns = column_labels
    frame = frame.drop(columns=['Total Duplicate']).set_index('Year')

    # Tag the frame so callers can label it (e.g. as an Excel sheet name).
    frame.division = division
    frame.status = tenure_status
    frame.name = '{} - {}'.format(division, tenure_status)
    return frame

In [121]:
# Combine every source workbook into one file, one sheet per workbook.
# The context manager closes (and thereby saves) the output file on exit;
# it replaces the explicit writer.save() call, which newer pandas no longer has.
with pd.ExcelWriter('../data/gender.xlsx', engine='xlsxwriter') as writer:
    for fname in sorted(os.listdir(DATA_PATH)):
        # Skip anything that is not a data workbook: other file types and the
        # '~$...' lock files Excel creates while a workbook is open.
        if not fname.endswith('.xlsx') or fname.startswith('~$'):
            continue
        df = getDF(fname)
        # df.name is '<division> - <status>', used here as the sheet title.
        df.to_excel(writer, sheet_name=df.name, startrow=0, startcol=0)