In [None]:
# Environment setup and declaration of the expected data layout

import pandas as pd

# Never truncate wide cell contents when DataFrames are rendered
pd.options.display.max_colwidth = None

# Survey columns included in the analysis, each mapped to the dtype it is stored as.
# When a spreadsheet repeats a header, pandas makes the later occurrences unique on
# import by appending a sequence number (e.g. HMS Rank, HMS Rank.1, HMS Rank.2, etc.),
# which is why the suffixed variants are listed below.
SCHEMA = {
    "1. Sex assigned at birth": "string",
    "2. Gender Identity": "string",
    "Black or African American": "string",
    "Hispanic or Latino": "string",
    "American Indian or Alaskan Native": "string",
    "Native Hawaiian and other Pacific Islander": "string",
    "Cambodian or Laotian": "string",
    "None": "string",
    "Prefer not to answer": "string",
    "4. Identification as ": "string",
    "5. Requirement for workplace accommodations: (Reasonable accommodations in the workplace support people with disability and/or chronic health conditions in performing their jobs.) If you would like more information about obtaining accommodations, contact MGH OHS.": "string",
    "1. Are you considering or engaging in the process of retirement? ": "string",
    "2. Are you interested in learning more about HMS promotions and discussing advancement on the HMS ladder? ": "string",
    "MD": "string",
    "MBChB": "string",
    "DO": "string",
    "PhD": "string",
    "DPhil": "string",
    "Sci D": "string",
    "RDN": "string",
    "MS": "string",
    "MSN": "string",
    "MPH": "string",
    "MPA": "string",
    "MBA": "string",
    "Other": "string",
    "i. HMS Rank": "string",
    "ii. Date (yyyy)": "string",
    "i. HMS Rank.1": "string",
    "ii. Date (yyyy).1": "string",
    "i. HMS Rank.2": "string",
    "ii. Date (yyyy).2": "string",
    "i. HMS Rank.3": "string",
    "ii. Date (yyyy).3": "string",
    "i. HMS Rank.4": "string",
    "ii. Date (yyyy).4": "string",
    "i. Job Title": "string",
    "ii. Date (yyyy).5": "string",
    "i. Job Title.1": "string",
    "ii. Date (yyyy).6": "string",
    "i. Job Title.2": "string",
    "ii. Date (yyyy).7": "string",
    "i. Job Title.3": "string",
    "ii. Date (yyyy).8": "string",
    "i. Job Title.4": "string",
    "ii. Date (yyyy).9": "string",
}

# Header capitalization differs from one data set to the next, so every label is
# normalized to lowercase before it is used for matching
SCHEMA = {label.lower(): dtype for label, dtype in SCHEMA.items()}

# Ordered list of the (lowercased) column labels, kept for convenient reference
COLUMNS = list(SCHEMA)

# Empty, correctly typed frame that the individual datasets get concatenated onto
df = pd.DataFrame(columns=COLUMNS)
df = df.astype(SCHEMA)

In [None]:
# Load all datasets into memory
#
# Every workbook found directly inside ./data is read, reduced to the COLUMNS of
# interest, stripped of completely blank rows and collected into `df`.

from pathlib import Path

# Start from the empty scaffold (the columns/dtypes of df without any rows) so that
# re-running this cell rebuilds df instead of appending the same records twice
frames = [df.iloc[0:0]]
recordCount = 0

# Pre-bind the loop variable so the cleanup at the bottom cannot raise NameError
# when ./data contains no entries at all
filepath = None

# iterdir() yields entries in arbitrary order; sort them for a reproducible row order
for filepath in sorted(Path("./data").iterdir()):
    # Skip directories, hidden files (e.g. .DS_Store) and the "~$" lock files Excel
    # creates next to an open workbook -- none of them are readable datasets
    if not filepath.is_file() or filepath.name.startswith((".", "~$")):
        continue
    print(f"Importing {filepath}")
    rawDf = pd.read_excel(
        filepath,
        header=0,
        engine="openpyxl"
    )
    # Convert raw column names to lowercase; str() guards against non-text header
    # cells (e.g. a number), which str.lower alone would reject with a TypeError
    rawDf = rawDf.rename(columns=lambda label: str(label).lower())
    # Select only the COLUMNS we are interested in
    rawDf = rawDf.filter(items=COLUMNS)
    # Clear out blank rows
    trimmedDf = rawDf.dropna(how='all')
    print(f"{rawDf.shape[0] - trimmedDf.shape[0]} blank rows dropped.  Remaining rows: {trimmedDf.shape[0]}.")
    # Queue the rows for the main DataFrame and report the running totals
    initial = recordCount
    recordCount += trimmedDf.shape[0]
    frames.append(trimmedDf)
    print(f"Starting df record count: {initial}\nAdded records: {trimmedDf.shape[0]}\nEnding df record count: {recordCount}\n")
    del rawDf, trimmedDf, initial

# Concatenate once rather than re-copying the growing DataFrame for every file;
# the scaffold comes first, so its column order is the one that is kept
df = pd.concat(frames, ignore_index=True)

del Path, filepath, frames, recordCount

In [None]:
# Transform data