# Setup

In [None]:
# Set up environment and define shape of data

import pandas as pd
import matplotlib.pyplot as plt

# Do not truncate long text when DataFrames are rendered
pd.set_option('display.max_colwidth', None)

# Columns to analyze, listed in the order the rest of the notebook refers to them by position.
# Every column is declared with the 'string' dtype.
# Note that pandas will make any duplicate column names unique by appending a sequence number
# to subsequent occurrences of them when importing said data
# (e.g. HMS Rank, HMS Rank.1, HMS Rank.2, etc.)
SCHEMA = dict.fromkeys(
    [
        # Demographics
        '1. Sex assigned at birth',
        '2. Gender Identity',
        # Ethnicity (multi-select)
        'Black or African American',
        'Hispanic or Latino',
        'American Indian or Alaskan Native',
        'Native Hawaiian and other Pacific Islander',
        'Cambodian or Laotian',
        'None',
        'Prefer not to answer',
        '4. Identification as ',
        (
            '5. Requirement for workplace accommodations: (Reasonable accommodations in the '
            'workplace support people with disability and/or chronic health conditions in '
            'performing their jobs.) If you would like more information about obtaining '
            'accommodations, contact MGH OHS.'
        ),
        # Career questions
        '1. Are you considering or engaging in the process of retirement? ',
        (
            '2. Are you interested in learning more about HMS promotions and discussing advancement '
            'on the HMS ladder? '
        ),
        # Degrees (multi-select)
        'MD',
        'MBChB',
        'DO',
        'PhD',
        'DPhil',
        'Sci D',
        'RDN',
        'MS',
        'MSN',
        'MPH',
        'MPA',
        'MBA',
        'Other',
        # HMS rank history
        'i. HMS Rank',
        'ii. Date (yyyy)',
        'i. HMS Rank.1',
        'ii. Date (yyyy).1',
        'i. HMS Rank.2',
        'ii. Date (yyyy).2',
        'i. HMS Rank.3',
        'ii. Date (yyyy).3',
        'i. HMS Rank.4',
        'ii. Date (yyyy).4',
        # Job title history
        'i. Job Title',
        'ii. Date (yyyy).5',
        'i. Job Title.1',
        'ii. Date (yyyy).6',
        'i. Job Title.2',
        'ii. Date (yyyy).7',
        'i. Job Title.3',
        'ii. Date (yyyy).8',
        'i. Job Title.4',
        'ii. Date (yyyy).9',
    ],
    'string',
)

# Column labels are capitalized inconsistently between data sets, so normalize
# every schema key to lowercase before it is used for lookups
SCHEMA = dict(zip(map(str.lower, SCHEMA), SCHEMA.values()))

# Ordered list of column labels, used for positional references (COLUMNS[n])
COLUMNS = [*SCHEMA]

# Empty, correctly-typed DataFrame that the individual datasets get concatenated onto
df = pd.DataFrame(columns=COLUMNS).astype(SCHEMA)

# Total number of faculty invited to complete the survey; the default denominator
# for the percentages reported in this notebook
INVITED = 1095


def calcPercent(numerator, denominator=INVITED):
    """Return ``numerator`` as a percentage of ``denominator``, rounded to 2 decimals.

    The notebook calls this with plain numbers and with pandas Series.

    Args:
        numerator: Count (or Series of counts) to express as a percentage.
        denominator: Value representing 100%. Defaults to INVITED.

    Returns:
        ``numerator / denominator * 100`` rounded to two decimal places.
    """
    share = numerator / denominator
    return round(share * 100, 2)


# Colors applied, in order, to the bars of each bar graph
BAR_COLORS = [
    'red',
    'blue',
    'gray',
    'pink',
    'purple',
    'yellow',
    'green',
    'orange',
]

In [None]:
# Load all datasets into memory

from pathlib import Path


def loadDataset(filepath):
    """Read one survey workbook and return its non-blank rows, limited to COLUMNS.

    Args:
        filepath: Path of the Excel workbook to read.

    Returns:
        DataFrame holding only the columns named in COLUMNS (lowercased labels),
        with rows that are blank in every one of those columns removed.
    """
    print(f"Importing {filepath}")
    rawDf = pd.read_excel(
        filepath,
        header=0,
        engine="openpyxl"
    )
    # Convert raw column names to lowercase. str() guards against headers that are not
    # text (e.g. a numeric header cell), which str.lower alone cannot handle.
    rawDf = rawDf.rename(columns=lambda label: str(label).lower())
    # Select only the COLUMNS we are interested in
    rawDf = rawDf.filter(items=COLUMNS)
    # Clear out blank rows
    trimmedDf = rawDf.dropna(how='all')
    print(
        f"{rawDf.shape[0] - trimmedDf.shape[0]} blank rows dropped.  "
        f"Remaining rows: {trimmedDf.shape[0]}.\n"
    )
    return trimmedDf


# iterdir() yields entries in arbitrary order; sort them so the row order of the combined
# DataFrame is the same on every run
dataFiles = sorted(entry for entry in Path("./data").iterdir() if entry.is_file())
if not dataFiles:
    raise FileNotFoundError("No dataset files found in ./data")

# Build df from a fresh, empty scaffold (rather than appending to the current df) so that
# re-running this cell does not load the same rows twice, and concatenate once instead of
# once per file
df = pd.concat(
    [pd.DataFrame(columns=COLUMNS).astype(SCHEMA)] + [loadDataset(entry) for entry in dataFiles],
    ignore_index=True
)

del Path, loadDataset, dataFiles

<hr style="border:3px solid gray">

# Individual datapoint analysis

# Dataset Size

In [None]:
# Each row of df is one faculty member who supplied at least one answer
print(
    f"{len(df)} faculty provided at least some demographic or rank data for a "
    f"{calcPercent(len(df))}% overall response rate."
)

# Sex at Birth

In [None]:
# Pull the column for this question and count the respondents who filled it in
col = df[COLUMNS[0]]
responses = int(col.count())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Chart counts and percentages
# Count the occurrences of each unique value, treating blanks as 'No Answer'.
# fillna() returns a new Series here: filling `col` in place can write through to df
# (depending on the pandas version) and silently change the data later cells read.
tmpDf = col.fillna('No Answer').value_counts().rename("Count").to_frame()
# 'No Answer' must also include the invitees who never responded at all. Derive it from
# the response count rather than from row positions, which depend on how the counts sort
# and on how many distinct answers exist.
tmpDf.at['No Answer', 'Count'] = INVITED - responses
# Calculate percentages for each unique value
tmpDf['Percent'] = calcPercent(tmpDf['Count'])
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
plt.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)
plt.title(label=COLUMNS[0], fontsize=18)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, col, tmpDf

# Gender

In [None]:
# Pull the column for this question and count the respondents who filled it in
col = df[COLUMNS[1]]
responses = int(col.count())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Chart counts and percentages
# fillna() returns a new Series: filling `col` in place can write through to df
# (depending on the pandas version) and change the data later cells read.
tmpDf = col.fillna('No Answer').value_counts().rename("Count").to_frame()
# 'No Answer' must also include the invitees who never responded at all. Derive it from
# the response count instead of assuming 'No Answer' is the third row of the sorted counts.
tmpDf.at['No Answer', 'Count'] = INVITED - responses
# Calculate percentages for each unique value
tmpDf['Percent'] = calcPercent(tmpDf['Count'])
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title(COLUMNS[1], fontsize=18)
plot = axes.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, col, tmpDf, fig, axes, plot

# URIM

In [None]:
# Select URIM columns
# .copy() gives this section its own DataFrame, so later cells in the section can modify
# it without touching df and without pandas raising SettingWithCopyWarning
tmpDf = df.iloc[:, 2:9].copy()

In [None]:
# Count up how many survey respondents supplied URIM data
# A respondent counts as having answered when any one of the URIM columns is filled in
answeredMask = tmpDf[COLUMNS[2:9]].notna().any(axis=1)

responses = int(answeredMask.sum())
print(f"{responses} faculty answered the URIM question " + \
      f"for a {calcPercent(responses)}% response rate.")

# Remove rows with no URIM data
# Building a new DataFrame (rather than adding a work column to a slice of df and dropping
# rows from it in place) avoids SettingWithCopyWarning and keeps this cell safe to re-run
tmpDf = tmpDf.loc[answeredMask, COLUMNS[2:9]].copy()
del answeredMask

In [None]:
# Build the count table for the URIM options. A respondent may select several
# ethnicities, so each option is counted independently of the others.
countDf = pd.DataFrame(columns=['Count', 'Percent'])
for i in range(2, 9):
    # Number of respondents with a value in this option's column
    count = int(tmpDf[COLUMNS[i]].notna().sum())
    countDf.loc[COLUMNS[i]] = [count, calcPercent(count)]

# Invited faculty who supplied no URIM data at all
countDf.loc['No Answer'] = [INVITED - responses, calcPercent(INVITED - responses)]

# Store the counts as integers
countDf['Count'] = countDf['Count'].astype(int)
countDf

"Prefer not to answer" and "No answer" are kept separate because they represent different nuances.  The former represents respondents who were willing to participate to at least some degree, while the latter represents respondents who chose not to answer at all.

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, tmpDf, count, countDf

# Orientation

In [None]:
# Pull the column for this question and count the respondents who filled it in
col = df[COLUMNS[9]]
responses = int(col.count())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Chart counts and percentages
# fillna() returns a new Series: filling `col` in place can write through to df
# (depending on the pandas version) and change the data later cells read.
tmpDf = col.fillna('No Answer').value_counts().rename("Count").to_frame()
# 'No Answer' must also include the invitees who never responded at all. Derive it from
# the response count instead of assuming 'No Answer' is the third row of the sorted counts.
tmpDf.at['No Answer', 'Count'] = INVITED - responses
# Calculate percentages for each unique value
tmpDf['Percent'] = calcPercent(tmpDf['Count'])
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title(COLUMNS[9], fontsize=18)
plot = axes.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, col, tmpDf, fig, axes, plot

# Accommodations

In [None]:
# Pull the column for this question and count the respondents who filled it in
col = df[COLUMNS[10]]
responses = int(col.count())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Chart counts and percentages
# fillna() returns a new Series: filling `col` in place can write through to df
# (depending on the pandas version) and change the data later cells read.
tmpDf = col.fillna('No Answer').value_counts().rename("Count").to_frame()
# 'No Answer' must also include the invitees who never responded at all
tmpDf.at['No Answer', 'Count'] = INVITED - responses
# Calculate percentages for each unique value
tmpDf['Percent'] = calcPercent(tmpDf['Count'])
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title(COLUMNS[10], fontsize=18)
plot = axes.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, col, tmpDf, fig, axes, plot

# Retirement

In [None]:
# Pull the column for this question and count the respondents who filled it in
col = df[COLUMNS[11]]
responses = int(col.count())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Chart counts and percentages
# fillna() returns a new Series: filling `col` in place can write through to df
# (depending on the pandas version) and change the data later cells read.
tmpDf = col.fillna('No Answer').value_counts().rename("Count").to_frame()
# 'No Answer' must also include the invitees who never responded at all
tmpDf.at['No Answer', 'Count'] = INVITED - responses
# Calculate percentages for each unique value
tmpDf['Percent'] = calcPercent(tmpDf['Count'])
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title(COLUMNS[11], fontsize=18)
plot = axes.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, col, tmpDf, fig, axes, plot

# Degrees

In [None]:
# Select degree columns
# .copy() gives this section its own DataFrame, so later cells in the section can modify
# it without touching df and without pandas raising SettingWithCopyWarning
tmpDf = df.iloc[:, 13:26].copy()

In [None]:
# Count up how many survey respondents supplied degree data
# A respondent counts as having answered when any one of the degree columns is filled in
answeredMask = tmpDf[COLUMNS[13:26]].notna().any(axis=1)

responses = int(answeredMask.sum())
print(f"{responses} faculty provide their degrees " + \
      f"for a {calcPercent(responses)}% response rate.")

# Remove rows with no degree data
# Building a new DataFrame (rather than adding a work column to a slice of df and dropping
# rows from it in place) avoids SettingWithCopyWarning and keeps this cell safe to re-run
tmpDf = tmpDf.loc[answeredMask, COLUMNS[13:26]].copy()
del answeredMask

In [None]:
# Define and populate count table for each degree group, bearing in mind that
# multiple degrees can be selected by a single respondent
countDf = pd.DataFrame(columns=['Count', 'Percent'])

# Degree columns that make up each reported group
degreeGroups = {
    'Clinical': COLUMNS[13:16],
    'Non-clinical': COLUMNS[16:19],
    'Other': COLUMNS[19:26],
}
for groupName, groupCols in degreeGroups.items():
    # Join the group's columns into one string per respondent (blanks contribute '');
    # a non-empty result means the respondent listed at least one degree in the group.
    # The joined strings are kept in a local Series rather than written into tmpDf.
    joined = tmpDf[groupCols[0]].str.cat(tmpDf[groupCols[1:]], na_rep='')
    countDf.loc[groupName, 'Count'] = int((joined.str.len() > 0).sum())

# No degrees given
countDf.loc['No Answer', 'Count'] = INVITED - responses

# Cell-by-cell assignment may leave the column with a generic object dtype;
# store the counts as integers
countDf['Count'] = countDf['Count'].astype(int)

# Calculate percentages
countDf['Percent'] = countDf['Count'].apply(calcPercent)

del degreeGroups, groupName, groupCols, joined
countDf

In [None]:
# Bar chart of the counts, one bar per degree group
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title('Degrees', fontsize=18)
plot = axes.bar(countDf.index, countDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, tmpDf, countDf, fig, axes, plot

<hr style="border:3px solid gray">

# Setup - Pivot Bases

In [None]:
# Prepare df for the pivot tables that follow.
# NOTE: every fill below assigns its result back to the column. Calling
# fillna(..., inplace=True) on df[column] acts on an intermediate Series: pandas warns about
# it (chained assignment) and, with Copy-on-Write enabled, df is left unchanged.
# pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Add rows representing those who didn't respond to the ACC using Sex at Birth as anchor column
# Pad df with empty rows up to INVITED in a single step (never truncating), then label every
# blank in the anchor column as 'No Answer'
df = df.reset_index(drop=True)
df = df.reindex(range(max(len(df), INVITED)))
df[COLUMNS[0]] = df[COLUMNS[0]].fillna('No Answer')

# Fill in empty values in other categorical columns
for i in [1, 9, 10, 11]:
    df[COLUMNS[i]] = df[COLUMNS[i]].fillna('No Answer')

# Replace str values in multi-select column groups with boolean values
# URIM Ethnicities
# NOTE: this step is not re-runnable; once converted, every value is non-null and a second
# run would turn the whole column into 1
for i in range(2, 9):
    df[COLUMNS[i]] = df[COLUMNS[i]].notna().astype(int)

# Degrees
# Concatenate strings of different degree categories together then convert the combined strings
# to boolean
def isMember(x):
    """Return 1 when the combined degree string ``x`` is non-empty, otherwise 0."""
    return 1 if len(x) > 0 else 0
df['degClinical'] = df[COLUMNS[13]].str.cat(df[COLUMNS[14:16]], na_rep='').apply(isMember)
df['degNonClinical'] = df[COLUMNS[16]].str.cat(df[COLUMNS[17:19]], na_rep='').apply(isMember)
df['degOther'] = df[COLUMNS[19]].str.cat(df[COLUMNS[20:26]], na_rep='').apply(isMember)
# Add degree category that represents those who did not provide degree data
# The three flags are referenced by name: positions (-3, -2, -1) point at the wrong columns
# once degNoAnswer already exists
df['degNoAnswer'] = (
    df[['degClinical', 'degNonClinical', 'degOther']].sum(axis=1) == 0
).astype(int)

del i, isMember

# HMS Promotion Interest

## Overall

In [None]:
# Number of respondents that provided this datapoint
# Assign the filled column back to df: fillna(..., inplace=True) on df[column] acts on an
# intermediate Series, which pandas warns about and which leaves df unchanged under
# Copy-on-Write.
df[COLUMNS[12]] = df[COLUMNS[12]].fillna('No Answer')
# Count after filling so that re-running this cell reports the same number
responses = int((df[COLUMNS[12]] != 'No Answer').sum())
print(f"{responses} faculty answered this question for a {calcPercent(responses)}% response rate.")

In [None]:
# Tabulate how often each answer occurs, with its share of all invited faculty
col = df[COLUMNS[12]].value_counts()

tmpDf = col.to_frame(name="Count").assign(
    Percent=lambda counts: calcPercent(counts['Count'])
)
tmpDf

In [None]:
# Bar chart of the counts, one bar per answer
fig, axes = plt.subplots(figsize=(15, 10))
axes.set_title(COLUMNS[12], fontsize=18)
plot = axes.bar(tmpDf.index, tmpDf['Count'], color=BAR_COLORS)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del responses, tmpDf, fig, axes, plot

## Grouped by Sex at Birth

In [None]:
# Cross-tabulate sex at birth (rows) against HMS promotion interest (columns),
# with a 'Total' margin on both axes
pivot = pd.crosstab(
    df[COLUMNS[0]], df[COLUMNS[12]],
    margins=True,
    margins_name='Total'
)
# Reorder indices and columns
# reindex() (instead of .loc with a label list) shows a category that has no respondents
# as a row/column of zeros rather than raising KeyError.
# NOTE(review): answers other than the labels listed here are not displayed but are still
# included in 'Total' -- confirm that no other values occur in the data.
pivot = pivot.reindex(
    index=['No Answer', 'Female', 'Male', 'Total'],
    columns=['No Answer', 'Yes', 'No', 'Total'],
    fill_value=0
)
pivot

### Example reading:

Of those who did not indicate either way on HMS promotions:
 - 273 did not indicate their sex at birth
 - 12 were female at birth
 - 12 were male at birth

Of those who expressed interest in HMS promotion:
 - 5 did not provide their sex at birth
 - 244 were female at birth
 - 216 were male at birth

Of those who explicitly did not express interest in HMS promotion:
 - 3 did not provide their sex at birth
 - 166 were female at birth
 - 164 were male at birth

### Example of calculating percentages for a given category

In [None]:
# Example: express each answer within one pivot row as a percentage of that row's total
total = pivot.at['Female', 'Total']
category = pivot.loc['Female'].drop('Total')
category.apply(calcPercent, denominator=total)

In [None]:
# Remove this section's working variables so they cannot leak into later cells
del pivot, category, total