## Import and setup of custom helper functions

Just a quick step to import libraries and define convenience functions that simplify data manipulation, renaming columns, etc.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter 
import datetime as dt
# Raise pandas' display limits (columns, rows, total character width) so that
# wide frames are printed in full instead of being truncated.
pd.set_option(
    'display.max_columns', 50,
    'display.max_rows', 100,
    'display.width', 500,
)


def string_to_date(datestr, dt_format="%m/%y"):
    """Parse ``datestr`` into a ``datetime`` according to ``dt_format``.

    The default format is month/two-digit-year (e.g. ``"03/21"``).
    ``strptime`` raises ``ValueError`` when the string does not match.
    """
    parsed = dt.datetime.strptime(datestr, dt_format)
    return parsed

def date_diff(start_date, end_date):
    """Return the ``.days`` component of ``end_date - start_date``.

    The value is negative when ``end_date`` precedes ``start_date``.
    """
    elapsed = end_date - start_date
    return elapsed.days

def redo_colnames(colnames, level=0):
    """Turn snake_case column names into display-friendly labels.

    Underscores become spaces and the result is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).

    Parameters
    ----------
    colnames : pd.MultiIndex or iterable of str
        Column labels to convert. For a ``MultiIndex`` only the entry at
        ``level`` of each column tuple is used.
    level : int, default 0
        Which level of a ``MultiIndex`` to read; ignored for flat names.

    Returns
    -------
    list of str
        One converted label per input column, in the original order.
    """
    if isinstance(colnames, pd.MultiIndex):
        # Iterating a MultiIndex yields one tuple per column.
        raw_names = [col[level] for col in colnames]
    else:
        raw_names = list(colnames)
    # The flat branch previously called the misspelled ``repalce`` and always
    # raised AttributeError; both branches now share this conversion.
    return [name.replace('_', ' ').capitalize() for name in raw_names]

def percentile(n):
    """Build a callable that returns the ``n``-th percentile of its input.

    The returned function's ``__name__`` is set to ``'p<n>'`` (e.g. ``'p50'``),
    which gives it a readable label wherever the function name is used as a
    column name.
    """
    def percentile_(values):
        return np.percentile(values, n)

    percentile_.__name__ = 'p%s' % n
    return percentile_

def spd(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """Statistical parity difference between two groups.

    Computes the mean of ``predicted_labels`` over rows where
    ``dataset[sensitive_attribute]`` equals ``minority_class`` minus the same
    mean over rows where it equals ``majority_class``.
    """
    group_of_row = dataset[sensitive_attribute]

    minority_rate = predicted_labels[group_of_row == minority_class].mean()
    majority_rate = predicted_labels[group_of_row == majority_class].mean()

    return minority_rate - majority_rate

## EDA & Data Distribution

To start off, we present some basic plots and EDA that provide insights on the distribution of the features in the dataset. Along the way we also define functions that manipulate applicant information to simplify the analysis.

A quick note to bear in mind: we found it advantageous to generate identical samples multiple times to gauge the variability in the models' outputs. With this strategy we repeated each sample 10 times, for a set of 4000 distinct applicants, totaling 40000 rows of data. In this synthetic dataset each feature was drawn independently from a random distribution.

In [None]:
# Load the audit dataset. keep_default_na=False stops pandas from converting
# placeholder strings such as "N/A" into NaN, so they are kept as literal strings.
df = pd.read_csv('data/audit-data.csv', keep_default_na=False)
# Preview the first rows (displayed as the cell output).
df.head()

In [None]:
def datestr_diff(start_str, end_str):
    """Number of days between two date strings, tolerating missing values.

    A value counts as missing when ``pd.isna`` is true for it or when it is
    the literal string ``"N/A"``.

    * Missing ``start_str``: returns 0.
    * Missing ``end_str``: the current date/time is used as the end point.
    * Otherwise both strings are parsed with ``string_to_date``.

    The result is whatever ``date_diff`` returns for the two parsed dates.
    """
    def _is_missing(value):
        return pd.isna(value) or value == "N/A"

    if _is_missing(start_str):
        return 0
    start_dt = string_to_date(start_str)

    end_dt = dt.datetime.today() if _is_missing(end_str) else string_to_date(end_str)

    return date_diff(start_dt, end_dt)

# Element-wise version of datestr_diff so it can be applied to whole columns.
vec_datestr_diff = np.vectorize(datestr_diff)


# Days spent in each of the three role slots; datestr_diff yields 0 when the
# role's start is missing/"N/A" and measures up to today when only the end is.
df['role1_exp'] = vec_datestr_diff(df['start1'], df['end1'])
df['role2_exp'] = vec_datestr_diff(df['start2'], df['end2'])
df['role3_exp'] = vec_datestr_diff(df['start3'], df['end3'])
# Total experience in whole years (summed days floor-divided by 365).
df['total_exp'] = (df['role1_exp']+df['role2_exp']+df['role3_exp'])//365


# Bucket total experience into left-closed bands: [0,1), [1,3), ..., [15,100).
df['exp_yrs'] = pd.cut(df['total_exp'], bins=[0,1,3,5,10,15,100], 
                       labels=['00-01 yr','01-03 yr','03-05 yr','05-10 yr','10-15 yr','15+ yr'],
                       include_lowest=True, right=False)

# Number of role slots with a strictly positive duration.
df['num_jobs'] = np.sum(df[['role1_exp','role2_exp','role3_exp']].values>0, axis=1)

# Days from the end of role N+1 to the start of role N.
# NOTE(review): presumably role 1 is the most recent role, so end2 -> start1 is a
# gap between consecutive jobs -- confirm against the data generator.
# Because datestr_diff is reused, a missing end2/end3 gives a gap of 0, and a
# missing start1/start2 measures the gap up to today.
df['job21_gap'] = vec_datestr_diff(df['end2'], df['start1'])
df['job32_gap'] = vec_datestr_diff(df['end3'], df['start2'])
# Combined gap in approximate months (summed days floor-divided by 30).
df['job_gap_months'] = (df['job21_gap']+df['job32_gap'])//30
df.head()

In [None]:
# Cross-tabulate the number of applications by number of jobs (rows) and
# experience band (columns), with 'Total' margins on both axes.
# aggfunc is the string alias 'sum' rather than the builtin ``sum``: pandas
# (through 2.x) silently swaps the builtin for its own grouped sum and warns
# that this substitution is deprecated, so the alias is spelled out explicitly.
(
    df.groupby(['exp_yrs', 'num_jobs'])
    .agg({'applicant_id': 'count'})
    .reset_index()
    .rename(columns={'applicant_id': 'Number of applications',
                     'exp_yrs': 'Experience',
                     'num_jobs': 'Number of jobs'})
    .pivot_table(index=['Number of jobs'], columns=['Experience'], aggfunc='sum',
                 margins=True, margins_name='Total')
)


The table above shows the distribution of application profiles. 3280 applications demonstrate no job history and have less than 1 year of experience. Likewise 10,320 demonstrate some job history but less than 3 years of work experience. The purpose of setting such a distribution was to ensure that different applicant profiles are well represented in our dataset.

Similarly, with sensitive features like gender and ethnicity, we have tried to achieve a rather uniform distribution. The same is shown below.

In [None]:
# Cross-tabulate the number of applications by gender (rows) and ethnicity
# (columns), with 'Total' margins on both axes.
# aggfunc is the string alias 'sum' rather than the builtin ``sum``: pandas
# (through 2.x) silently swaps the builtin for its own grouped sum and warns
# that this substitution is deprecated, so the alias is spelled out explicitly.
(
    df.groupby(['gender', 'ethnicity'])
    .agg({'applicant_id': 'count'})
    .reset_index()
    .rename(columns={'applicant_id': 'Number of applications',
                     'gender': 'Gender',
                     'ethnicity': 'Ethnicity'})
    .pivot_table(index=['Gender'], columns=['Ethnicity'], aggfunc='sum',
                 margins=True, margins_name='Total')
)


Finally, we look at the distribution of degrees and GPA. It's clear from the graphs below that degrees are distributed uniformly, while GPA is built from a normal distribution, clipped at a GPA of 4.0. The details can be found in the datagenerator file that handles generation of the synthetic dataset.

In [None]:
# Bar plot of the share of rows per degree level.
# NOTE(review): this list is not referenced anywhere else in this cell.
degree = ['Bachelor\'s', 'Master\'s', 'PhD']

# Fraction of rows in each degree group (group size / total number of rows).
temp_df_degree = df.groupby(by='degree').size()/df.groupby(by='degree').size().sum()
temp_df_degree.plot(kind='bar')

# Keep the x-axis labels horizontal.
plt.xticks(rotation=0)

# Title and axis labels; the x label is blanked out.
plt.title('Distribution of Degrees')
plt.xlabel('')
plt.ylabel('Percentage of Samples')

# Values are fractions of 1, so format ticks as percentages with 0 decimals.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1, 0))

# Save the figure as a PNG at 300 dpi.
plt.savefig('plots/degree-dist.png', dpi=300)


In [None]:
# Histogram of GPA in 20 bins. Each row is weighted by 1/len(df['gpa']) so the
# bar heights are fractions of all rows rather than raw counts.
plt.hist(df['gpa'], weights=np.ones(len(df['gpa'])) / len(df['gpa']), 
         color='tab:orange', alpha=0.65, edgecolor='tab:orange', bins=20)

# Title and axis labels.
plt.title('Distribution of GPA')
plt.xlabel('GPA')
plt.ylabel('Percentage of samples')
# Heights are fractions of 1, so format the y ticks as percentages.
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

# Save the figure as a PNG at 300 dpi.
plt.savefig('plots/gpa-dist.png', dpi=300)

## Analysis

Now that we have described the different features of the synthetic dataset and drawn some insights on their distributions, we'll move on to analyzing the resume scorer and candidate evaluator models. Bearing in mind that each distinct sample is queried 10 times, we can find a mean score and mean selection rate for each of the 4000 distinct applicants. The dataframe `dedup_df` enables such an analysis scheme.

In [None]:
# Attach, to every row, the mean and standard deviation of both model outputs
# ('prediction' and 'resume_score') over all rows sharing the same group_idx.
#
# The reducers are given as the string aliases 'mean' / 'std'. 'std' is pandas'
# group-wise sample standard deviation (ddof=1). Passing np.std here used to be
# silently replaced by that same reducer, but recent pandas versions warn that
# the substitution is deprecated and that the callable will later be used
# as-is, which for np.std means ddof=0. Using the alias pins current behaviour.
df['mean_selection_rate'] = df.groupby(['group_idx'])['prediction'].transform('mean')
df['std_selection_rate'] = df.groupby(['group_idx'])['prediction'].transform('std')
df['mean_resume_score'] = df.groupby(['group_idx'])['resume_score'].transform('mean')
df['std_resume_score'] = df.groupby(['group_idx'])['resume_score'].transform('std')


# Keep the applicant-level columns plus the per-group statistics and drop the
# repeated rows. .copy() makes dedup_df an independent frame so the column
# assignments below do not raise SettingWithCopyWarning.
dedup_df = df[['group_idx', 'jobref_id', 'school_name', 'gpa', 'degree', 'location', 'gender',
               'veteran_status', 'work_auth', 'disability', 'ethnicity', 'exp_yrs', 'num_jobs',
               'job_gap_months', 'mean_selection_rate', 'std_selection_rate', 'mean_resume_score',
               'std_resume_score']].drop_duplicates().copy()

# Unit-wide, left-closed score bins: [0,1), [1,2), ..., [9,10).
# NOTE(review): a mean score of exactly 10 falls outside the last bin and becomes
# NaN (the GPA bins below avoid this with the 4.0001 edge) -- confirm score range.
score_bins = list(range(11))
dedup_df['mean_resume_score_bins'] = pd.cut(dedup_df['mean_resume_score'], bins=score_bins,
                                            right=False, include_lowest=True)

# Left-closed GPA bins; the 4.0001 upper edge keeps a GPA of exactly 4.0 in the
# last bin.
gpa_bins = [0, 1.0, 2.0, 3.0, 4.0001]
dedup_df['gpa_bins'] = pd.cut(dedup_df['gpa'], bins=gpa_bins, right=False, include_lowest=True)
dedup_df.head()

### Gender based discrepancies



For our first insight, we saw a marked bias against the selection of female candidates and of individuals who declined to divulge gender information, either because they're non-binary or for other personal reasons. What stands out is that the resume scores are similar, but the selection rates are vastly higher for male candidates. This disparity is also beyond any reasonable bounds such as the $\left(\frac{4}{5}\right)^{\text{th}}$ rule.

The table below highlights the same: controlling for a similar sample size, the mean resume scores and their standard deviations-- and by extension the standard errors, since group size is consistent at 10-- are comparable across genders, while the candidate evaluator model's selection rate is vastly different. The mean and median selection rates highlight this difference-- individuals identifying as female have a 38% lower selection rate when compared to men. For those who decline to share gender information or identify as non-binary, the selection rate is 0%.

In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Per-gender summary of dedup_df: row count, averages of the resume-score
# mean/std columns, and mean plus 50th percentile of the selection rate.
gender_group_df = dedup_df.groupby(['gender']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
gender_group_df.columns = [rename_map[stat_key] for stat_key in gender_group_df.columns]
gender_group_df

The above analysis can also be viewed through the lens of fairness metrics covered in class. 

Let's consider the statistical parity difference (SPD)-- defined as the difference between the proportions of positive outcomes for the minority and majority groups. Mathematically,

$$\text{SPD} = \mathbb{P}(\hat{Y} = 1|A = \text{minority}) - \mathbb{P}(\hat{Y} = 1|A = \text{majority})$$

Setting the predicted outcome $\left(\hat{Y}\right)$, in the above equation, to be the output of the candidate evaluator model we can see that $\mathbb{P}(\hat{Y} = 1|A = \text{minority})  = \mathbb{E}(\hat{Y}|A = \text{minority})$, since our outcome is binary. Thus, substituting the mean selection rate in the above equation, we find that: 

$$\begin{align*}
\text{SPD}_{\text{f-m}} &= \mathbb{E}(\hat{Y}|\text{gender} = \text{female}) - \mathbb{E}(\hat{Y}|\text{gender} = \text{male}) \\
&= 0.3313 - 0.5300 = -0.1987
\end{align*}$$

Likewise, for individuals whose gender is recorded as N/A (non-binary or undisclosed), we see:
$$\begin{align*}
\text{SPD}_{\text{na-m}} &= \mathbb{E}(\hat{Y}|\text{gender} = \text{N/A}) - \mathbb{E}(\hat{Y}|\text{gender} = \text{male}) \\
&= 0.00 - 0.53 = -0.53
\end{align*}$$

<br><br>
Disparate impact is very similar to SPD, except that it is a ratio of the above proportions as opposed to a difference. Mathematically, 
$$\text{DI} = \dfrac{\mathbb{P}(\hat{Y} = 1|A = \text{minority})}{\mathbb{P}(\hat{Y} = 1|A = \text{majority})}$$

As before, since our model is binary $\mathbb{P}(\hat Y = 1 | A = a) = \mathbb{E}(\hat Y | A = a)$, we modify the above equation with mean selection rate arrived at above, finding:

$$
\begin{align*}
\text{DI}_{\text{f/m}} &= \dfrac{\mathbb{E}(\hat{Y}|\text{gender} = \text{female})}{\mathbb{E}(\hat{Y}|\text{gender} = \text{male})} \\
&= \dfrac{0.3313}{0.5300} = 0.625
\end{align*}
$$

Similarly, for individuals whose gender is recorded as N/A (non-binary or undisclosed):

$$
\begin{align*}
\text{DI}_{\text{na/m}} &= \dfrac{\mathbb{E}(\hat{Y}|\text{gender} = \text{N/A})}{\mathbb{E}(\hat{Y}|\text{gender} = \text{male})} \\
&= \dfrac{0.00}{0.5300} = 0.0
\end{align*}
$$


### Job gap based discrepancies


In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Summary of dedup_df per job_gap_months value: row count, averages of the
# resume-score mean/std columns, and mean plus 50th percentile of the
# selection rate.
job_gap_months_group_df = dedup_df.groupby(['job_gap_months']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
job_gap_months_group_df.columns = [rename_map[stat_key]
                                   for stat_key in job_gap_months_group_df.columns]
job_gap_months_group_df

## Appendix

In [None]:
# Overlaid histograms of the mean resume score per group_idx, one histogram for
# each of the gender values 'M', 'F' and 'N/A'.
# Number of bins used by every histogram.
n_bins = 20

# alpha=0.5 keeps the overlapping histograms visible through each other.
plt.hist(df[df['gender']=='M'].groupby(['group_idx'])['resume_score'].mean(), n_bins, alpha=0.5, label='Male')
plt.hist(df[df['gender']=='F'].groupby(['group_idx'])['resume_score'].mean(), n_bins, alpha=0.5, label='Female')
plt.hist(df[df['gender']=='N/A'].groupby(['group_idx'])['resume_score'].mean(), n_bins, alpha=0.5, label='N/A')

plt.legend(loc='upper right')
plt.show()


### Ethnicity based discrepancies


In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Per-ethnicity summary of dedup_df: row count, averages of the resume-score
# mean/std columns, and mean plus 50th percentile of the selection rate.
ethnicity_group_df = dedup_df.groupby(['ethnicity']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
ethnicity_group_df.columns = [rename_map[stat_key] for stat_key in ethnicity_group_df.columns]
ethnicity_group_df

### Work auth based discrepancies

In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Summary of dedup_df per work_auth value: row count, averages of the
# resume-score mean/std columns, and mean plus 50th percentile of the
# selection rate.
work_auth_group_df = dedup_df.groupby(['work_auth']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
work_auth_group_df.columns = [rename_map[stat_key] for stat_key in work_auth_group_df.columns]
work_auth_group_df

### Disability based discrepancies


In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Summary of dedup_df per disability value: row count, averages of the
# resume-score mean/std columns, and mean plus 50th percentile of the
# selection rate.
disability_group_df = dedup_df.groupby(['disability']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
disability_group_df.columns = [rename_map[stat_key] for stat_key in disability_group_df.columns]
disability_group_df

### Veteran status based discrepancies


In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Summary of dedup_df per veteran_status value: row count, averages of the
# resume-score mean/std columns, and mean plus 50th percentile of the
# selection rate.
veteran_status_group_df = dedup_df.groupby(['veteran_status']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
veteran_status_group_df.columns = [rename_map[stat_key]
                                   for stat_key in veteran_status_group_df.columns]
veteran_status_group_df

### Degree level based discrepancies


In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Per-degree summary of dedup_df: row count, averages of the resume-score
# mean/std columns, and mean plus 50th percentile of the selection rate.
degree_group_df = dedup_df.groupby(['degree']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
degree_group_df.columns = [rename_map[stat_key] for stat_key in degree_group_df.columns]
degree_group_df

### GPA bins based discrepancy

In [None]:
# Display labels for the (column, statistic) pairs produced by the aggregation
# below; 'p50' is the name that percentile(50) gives its aggregator.
rename_map = {
    ('group_idx', 'count'): 'Number of applicants',
    ('mean_resume_score', 'mean'): 'Mean resume score',
    ('std_resume_score', 'mean'): 'Std dev resume score',
    ('mean_selection_rate', 'mean'): 'Mean selection rate',
    ('mean_selection_rate', 'p50'): 'Median selection rate',
}

# Summary of dedup_df per gpa_bins interval: row count, averages of the
# resume-score mean/std columns, and mean plus 50th percentile of the
# selection rate.
gpa_bins_group_df = dedup_df.groupby(['gpa_bins']).agg({
    'group_idx': ['count'],
    'mean_resume_score': ['mean'],
    'std_resume_score': ['mean'],
    'mean_selection_rate': ['mean', percentile(50)],
})

# Flatten the two-level column index into the readable labels above.
gpa_bins_group_df.columns = [rename_map[stat_key] for stat_key in gpa_bins_group_df.columns]
gpa_bins_group_df