# Data Collector

This notebook organizes and aggregates data about faculty diversity in gender and in race/ethnicity.

In [2]:
import os
import re
import pandas as pd

# Recognizes data workbook names of the form '<3 word chars>-<2 word chars>.xlsx'
# (for example 'HUM-TE.xlsx'). The pattern has no end anchor, so whether trailing
# text is rejected depends on how it is applied (match vs. fullmatch).
fileProg = re.compile(r'\w{3}\-\w{2}\.xlsx')

This block collects gender data:

In [3]:
# Directory containing the per-file gender workbooks.
# NOTE: the race/ethnicity cell later in this notebook assigns a different
# directory to this same global, and the functions read it at call time.
DATA_PATH = '../data/gender/'

def getGenderData(fname):
    """Load one gender workbook and return it as a labelled DataFrame.

    Args:
        fname: File name (not a full path) of a workbook inside DATA_PATH,
            expected to look like '<div>-<status>.xlsx'.

    Returns:
        A DataFrame indexed by 'Year' with the columns '% Women', 'Women',
        'Total', '% Men' and 'Men'. Its ``name`` attribute is set to
        '<div>-<status>'.

    Raises:
        ValueError: If the file stem does not contain exactly one '-', or if
            the sheet does not have exactly seven columns to relabel.
    """
    # NOTE: DATA_PATH is a module-level global resolved at call time; another
    # cell in this notebook rebinds it to the race/ethnicity directory.
    df = pd.read_excel(DATA_PATH + fname)

    # splitext removes only the extension. str.rstrip('.xlsx') would instead
    # strip every trailing '.', 'x', 'l' or 's' character and could eat into
    # the stem itself (e.g. 'soc-el.xlsx' -> 'soc-e').
    stem = os.path.splitext(fname)[0]
    div, status = stem.split('-')

    df.columns = [
        'Year',
        '% Women',
        'Women',
        'Total',
        '% Men',
        'Men',
        'Total Duplicate',
    ]
    # The seventh column is labelled 'Total Duplicate' and discarded
    # (presumably it repeats 'Total' -- confirm against the workbooks).
    df = df.drop(columns=['Total Duplicate']).set_index('Year')
    df.name = '{}-{}'.format(div, status)
    return df

def writeGenderData():
    """Collect every gender workbook in DATA_PATH into '../data/gender.xlsx'.

    Each file in DATA_PATH whose name matches fileProg is loaded with
    getGenderData and written as one sheet, named after the DataFrame's
    ``name`` attribute. Files are processed in sorted file-name order.
    """
    # fullmatch rejects names with trailing text after '.xlsx' (for example a
    # '.xlsx.bak' copy), which match() would let through.
    fnames = [f for f in sorted(os.listdir(DATA_PATH)) if fileProg.fullmatch(f)]

    # The context manager saves and closes the workbook on exit, including
    # when a sheet fails part-way; ExcelWriter.save() is gone in pandas 2.x.
    with pd.ExcelWriter('../data/gender.xlsx', engine='xlsxwriter') as writer:
        for fname in fnames:
            df = getGenderData(fname)
            df.to_excel(writer, sheet_name=df.name, startrow=0, startcol=0)
    
# writeGenderData()

This block collects race and ethnicity data:

In [35]:
# Directory containing the per-file race/ethnicity workbooks.
DATA_PATH = '../data/race-ethnicity/'

# Category labels, in the order their column groups appear in each sheet.
CATEGORIES = [
    'Black/African American',
    'Hispanic/Latino',
    'Two or More',
    'Asian American',
]

# Every div-status pair carries all four CATEGORIES, in that order, apart
# from these two, which have no 'Two or More' group.
noTwoOrMore = [
    'HUM-TE',
    'SOC-EL',
]

# Module-level flag, initially False. NOTE(review): nothing visible in this
# notebook export reads or sets it -- confirm whether it is still needed.
foundIncompleteColumn = False
def getRaceEthnicityData(fname):
    """Load one race/ethnicity workbook and label its columns.

    The sheet is relabelled as 'Year' followed by one
    ('% <category>', '<category>', 'Total <i>') triple per entry of
    CATEGORIES, in order; 'Two or More' is skipped when the file stem is
    listed in noTwoOrMore. Missing values are then replaced with 0.

    The first 'Total <i>' column that contains no 0 is kept as a single
    'Total' column (appended last) and all numbered total columns are
    dropped. If every total column contains a 0, they are all left in place.

    Args:
        fname: File name (not a full path) of a workbook inside DATA_PATH.

    Returns:
        The relabelled DataFrame, with its ``name`` attribute set to the file
        name without its extension.

    Raises:
        ValueError: If the sheet's column count does not match the number of
            labels built for it.
    """
    df = pd.read_excel(DATA_PATH + fname)

    # splitext removes only the extension. str.rstrip('.xlsx') would instead
    # strip every trailing '.', 'x', 'l' or 's' character, corrupting stems
    # that end in those letters and breaking the noTwoOrMore lookup below.
    stem = os.path.splitext(fname)[0]

    included = [
        cat for cat in CATEGORIES
        if not (cat == 'Two or More' and stem in noTwoOrMore)
    ]
    totalColumns = ['Total ' + str(i) for i in range(len(included))]

    columns = ['Year']
    for cat, totalColumn in zip(included, totalColumns):
        columns += ['% ' + cat, cat, totalColumn]
    df.columns = columns

    # After this, a blank cell is indistinguishable from a literal 0, so a 0
    # in a total column is treated as "this total is incomplete".
    df = df.fillna(0)

    for totalColumn in totalColumns:
        col = df[totalColumn]
        if not (col == 0).any():  # We've found a full total column
            # Keep it as the single 'Total' and remove the numbered ones.
            df = df.drop(columns=totalColumns)
            df['Total'] = col
            break

    df.name = stem
    return df

def writeRaceEthnicityData():
    """Collect every race/ethnicity workbook in DATA_PATH into one file.

    Each file in DATA_PATH whose name matches fileProg is loaded with
    getRaceEthnicityData and written to '../data/race-ethnicity.xlsx' as one
    sheet, named after the DataFrame's ``name`` attribute. Files are
    processed in sorted file-name order.
    """
    # fullmatch rejects names with trailing text after '.xlsx' (for example a
    # '.xlsx.bak' copy), which match() would let through.
    fnames = [f for f in sorted(os.listdir(DATA_PATH)) if fileProg.fullmatch(f)]

    # The context manager saves and closes the workbook on exit, including
    # when a sheet fails part-way; ExcelWriter.save() is gone in pandas 2.x.
    with pd.ExcelWriter('../data/race-ethnicity.xlsx', engine='xlsxwriter') as writer:
        for fname in fnames:
            df = getRaceEthnicityData(fname)
            df.to_excel(writer, sheet_name=df.name, startrow=0, startcol=0)
    
# Running this cell regenerates '../data/race-ethnicity.xlsx'.
writeRaceEthnicityData()