**Note**
Run this flow only if you wish to refresh the data we worked with in the `audit-analysis.ipynb` file. This notebook contains short scripts that create a synthetic dataset, organize the information into the required format, and hit the models' APIs.

In [1]:
import sys
sys.path.append('src/')

import numpy as np
import pandas as pd
from datagenerator import data_generator
from modelapi import fetch_response

# Notebook-wide pandas display limits, so the wide applicant frames
# previewed below are not truncated.
display_options = {
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.width': 500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


The cell below creates a synthetic dataset based on a config file we have predefined in the data folder.

In [2]:
# Build the synthetic applicant pool: `num_runs` separate calls to the
# project's data_generator, each asked for `num_rows` rows using the
# settings in `config_file`.
config_file = "data/datagen-config.json"
num_rows = 4000
num_runs = 10

# Generate every run first and concatenate exactly once. Calling pd.concat
# inside the loop re-copies all previously accumulated rows on each pass and
# starts from an empty placeholder frame. ignore_index=True gives the
# combined frame a fresh 0..N-1 index, so no helper "index" column has to be
# created by reset_index and dropped again afterwards.
run_frames = [data_generator(config_file, num_rows) for _ in range(num_runs)]
base_df = pd.concat(run_frames, ignore_index=True)

# Applicant IDs are the 1-based row positions in the combined frame.
base_df["Applicant ID"] = base_df.index + 1

# Keep the applicant -> job reference mapping aside; `jobref_id` is left out
# of the frame sent to the models and is merged back in the last cell.
app_jobref_df = base_df[["Applicant ID", "jobref_id"]].drop_duplicates()

# Column order expected downstream; selecting explicitly also removes
# `jobref_id` (and anything else the generator may emit).
model_input_columns = [
    "Applicant ID", "School Name", "GPA", "Degree", "Location", "Gender",
    "Veteran status", "Work authorization", "Disability", "Ethnicity",
    "Role 1", "Start 1", "End 1",
    "Role 2", "Start 2", "End 2",
    "Role 3", "Start 3", "End 3",
]
base_df = base_df[model_input_columns]

base_df.head()

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Illinois Institute of Technology,3.09,Master's,98039,,,0,,Pacific Islander,Junior Economist,12/20,,,,,,,
1,2,Massachusetts Institute of Technology,2.34,PhD,47401,,1.0,1,0.0,Pacific Islander,Junior Economist,12/20,,,,,,,
2,3,Massachusetts Institute of Technology,4.0,PhD,10001,M,,1,0.0,Asian,Junior Economist,12/20,,,,,,,
3,4,Illinois Institute of Technology,0.0,PhD,2108,,0.0,1,,Asian,Junior Economist,12/20,,,,,,,
4,5,National Taiwan University,1.36,Bachelor's,2108,F,,1,0.0,Asian,Junior Economist,12/20,,,,,,,


This synthetic dataset is passed as input to the resume scorer model for its response.

In [3]:
# Resume scorer: send the synthetic applicants to the scoring endpoint and
# keep the returned applicant_id / score pairs.
rs_url = "https://jennjwang.pythonanywhere.com"

# The applicant_id column is cast to int so it can be matched against the
# integer "Applicant ID" column when the responses are merged back.
rs_df = fetch_response(
    url=rs_url,
    data=base_df,
    max_sz=4000,
    verbose=1,
).astype({"applicant_id": int})

rs_df.head()

Iteration 1 of 10 response state: True
Iteration 2 of 10 response state: True
Iteration 3 of 10 response state: True
Iteration 4 of 10 response state: True
Iteration 5 of 10 response state: True
Iteration 6 of 10 response state: True
Iteration 7 of 10 response state: True
Iteration 8 of 10 response state: True
Iteration 9 of 10 response state: True
Iteration 10 of 10 response state: True


Unnamed: 0,applicant_id,score
0,1,7.25
1,2,2.97
2,3,8.47
3,4,3.82
4,5,1.89


We merge the responses back with the input dataset to create a new dataset which serves as input to the candidate evaluator model.

In [4]:
# Attach each applicant's resume score to the generated profile. The
# response's own id column is redundant after the join, and the score is
# renamed to match the capitalised input column style.
eval_df = (
    base_df
    .merge(rs_df, left_on=["Applicant ID"], right_on=["applicant_id"])
    .drop(columns=["applicant_id"])
    .rename(columns={"score": "Resume score"})
)
eval_df.head()

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Illinois Institute of Technology,3.09,Master's,98039,,,0,,Pacific Islander,Junior Economist,12/20,,,,,,,,7.25
1,2,Massachusetts Institute of Technology,2.34,PhD,47401,,1.0,1,0.0,Pacific Islander,Junior Economist,12/20,,,,,,,,2.97
2,3,Massachusetts Institute of Technology,4.0,PhD,10001,M,,1,0.0,Asian,Junior Economist,12/20,,,,,,,,8.47
3,4,Illinois Institute of Technology,0.0,PhD,2108,,0.0,1,,Asian,Junior Economist,12/20,,,,,,,,3.82
4,5,National Taiwan University,1.36,Bachelor's,2108,F,,1,0.0,Asian,Junior Economist,12/20,,,,,,,,1.89


The synthetic dataset, in conjunction with each candidate's resume score, is passed as input to the candidate evaluator model through the next API call.

In [5]:
# Candidate evaluator: send the profiles together with their resume scores
# and keep the returned applicant_id / prediction pairs.
ce_url = "https://heonlee.pythonanywhere.com"

# The applicant_id column is cast to int so it can be matched against the
# integer "Applicant ID" column in the final merge.
ce_df = fetch_response(
    url=ce_url,
    data=eval_df,
    max_sz=4000,
    verbose=1,
).astype({"applicant_id": int})

ce_df.head()

Iteration 1 of 10 response state: True
Iteration 2 of 10 response state: True
Iteration 3 of 10 response state: True
Iteration 4 of 10 response state: True
Iteration 5 of 10 response state: True
Iteration 6 of 10 response state: True
Iteration 7 of 10 response state: True
Iteration 8 of 10 response state: True
Iteration 9 of 10 response state: True
Iteration 10 of 10 response state: True


Unnamed: 0,applicant_id,prediction
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1


All of this information is merged along with some reference variables like `jobref_id` which we utilize to simplify our analysis in the audit-analysis.ipynb file.

In [6]:
# Assemble the audit dataset: evaluator predictions joined onto the scored
# profiles, snake_case column names, the job reference restored, and a
# `group_idx` label for applicants that share the same profile.

# Explicit old -> new mapping instead of assigning `final_df.columns`
# positionally: a positional list silently mislabels data if the column
# order ever changes. `prediction` already has its final name.
column_names = {
    'Applicant ID': 'applicant_id',
    'School Name': 'school_name',
    'GPA': 'gpa',
    'Degree': 'degree',
    'Location': 'location',
    'Gender': 'gender',
    'Veteran status': 'veteran_status',
    'Work authorization': 'work_auth',
    'Disability': 'disability',
    'Ethnicity': 'ethnicity',
    'Role 1': 'role1', 'Start 1': 'start1', 'End 1': 'end1',
    'Role 2': 'role2', 'Start 2': 'start2', 'End 2': 'end2',
    'Role 3': 'role3', 'Start 3': 'start3', 'End 3': 'end3',
    'Resume score': 'resume_score',
}

final_df = (
    pd.merge(left=eval_df, right=ce_df,
             left_on=['Applicant ID'], right_on=['applicant_id'])
    # Drop the evaluator's id column before renaming, so the renamed
    # 'Applicant ID' can take the 'applicant_id' name without a clash.
    .drop(columns=['applicant_id'])
    # errors='raise' fails loudly if an expected input column is missing.
    .rename(columns=column_names, errors='raise')
    .merge(app_jobref_df, left_on=['applicant_id'], right_on=['Applicant ID'])
    .drop(columns=['Applicant ID'])
)

# Applicants with identical values in all of these columns form one group.
group_keys = ['school_name', 'gpa', 'degree', 'location', 'gender',
              'veteran_status', 'work_auth', 'disability', 'ethnicity',
              'jobref_id']

# dropna=False: groupby discards rows whose key contains a missing value by
# default. Several key columns (gender, veteran_status, disability) are
# blank for many applicants in the previews; if those blanks are NaN, the
# default would leave such applicants without a group and the integer cast
# below would fail on the resulting NaN.
ids_by_group = final_df.groupby(group_keys, dropna=False)['applicant_id']
lowest_id = ids_by_group.transform('min')
highest_id = ids_by_group.transform('max')

# group_idx is "<lowest applicant_id>-<highest applicant_id>" of the group.
final_df['group_idx'] = (
    lowest_id.astype(int).astype(str) + '-' + highest_id.astype(int).astype(str)
)

final_df.to_csv('data/audit-data.csv', index=False, encoding='utf8')
final_df.head()

Unnamed: 0,applicant_id,school_name,gpa,degree,location,gender,veteran_status,work_auth,disability,ethnicity,role1,start1,end1,role2,start2,end2,role3,start3,end3,resume_score,prediction,jobref_id,group_idx
0,1,Illinois Institute of Technology,3.09,Master's,98039,,,0,,Pacific Islander,Junior Economist,12/20,,,,,,,,7.25,0,5,1-36001
1,2,Massachusetts Institute of Technology,2.34,PhD,47401,,1.0,1,0.0,Pacific Islander,Junior Economist,12/20,,,,,,,,2.97,0,5,2-36002
2,3,Massachusetts Institute of Technology,4.0,PhD,10001,M,,1,0.0,Asian,Junior Economist,12/20,,,,,,,,8.47,0,5,3-36003
3,4,Illinois Institute of Technology,0.0,PhD,2108,,0.0,1,,Asian,Junior Economist,12/20,,,,,,,,3.82,0,5,4-36004
4,5,National Taiwan University,1.36,Bachelor's,2108,F,,1,0.0,Asian,Junior Economist,12/20,,,,,,,,1.89,1,5,5-36005
