In [None]:
import sys
sys.path.append('src/')

import numpy as np
import pandas as pd
from datagenerator import data_generator
from modelapi import fetch_response

# Raise pandas' display limits (columns, rows, total width) for this session.
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
    ('display.width', 500),
):
    pd.set_option(option_name, option_value)


In [None]:
# Build the applicant pool: `num_runs` batches of `num_rows` rows each,
# produced by data_generator from the JSON config file.
# NOTE(review): no random seed is set in this notebook; if data_generator is
# stochastic the generated data differs between runs -- confirm and seed it.
config_file = "data/datagen-config.json"
num_rows = 4000
num_runs = 10

# Collect all batches and concatenate once. Growing a frame with pd.concat
# inside the loop re-copies every earlier row on each pass and depends on
# concat's deprecated treatment of the initial empty DataFrame.
batches = [data_generator(config_file, num_rows) for _ in range(num_runs)]
# ignore_index gives one continuous 0..N-1 index across the batches, so no
# reset_index / drop("index") round trip is needed.
base_df = pd.concat(batches, ignore_index=True)

# Applicant IDs are the 1-based row positions in the combined frame.
base_df["Applicant ID"] = base_df.index + 1

# Keep the applicant -> jobref_id mapping separately; jobref_id is left out of
# base_df by the column selection below and merged back in the final cell.
app_jobref_df = base_df[['Applicant ID', 'jobref_id']].drop_duplicates()

# Restrict base_df to these columns, in this order.
base_df = base_df[["Applicant ID", "School Name", "GPA", "Degree", "Location", "Gender", "Veteran status",
                   "Work authorization", "Disability", "Ethnicity", "Role 1", "Start 1", "End 1", "Role 2",
                   "Start 2", "End 2", "Role 3", "Start 3", "End 3"]]

base_df.head()

In [None]:
# Resume scorer results: pass base_df to the endpoint via fetch_response.
# NOTE(review): max_sz presumably limits the request/batch size -- confirm
# against modelapi.fetch_response.
rs_url = "https://jennjwang.pythonanywhere.com"
rs_response = fetch_response(url=rs_url, data=base_df, max_sz=4000, verbose=0)
# Cast the returned ID column to int so it can be joined on 'Applicant ID'.
rs_df = rs_response.astype({'applicant_id': int})
rs_df.head()

In [None]:
# Join the scorer output onto the applicants (inner join on the ID columns),
# drop the duplicate key column and give the score its display name.
eval_df = (
    pd.merge(base_df, rs_df, left_on='Applicant ID', right_on='applicant_id')
    .drop(columns='applicant_id')
    .rename(columns={'score': 'Resume score'})
)
eval_df.head()

In [None]:
# Second model endpoint: pass eval_df (applicant fields plus the
# 'Resume score' column) to it via fetch_response.
ce_url = "https://heonlee.pythonanywhere.com"
ce_response = fetch_response(url=ce_url, data=eval_df, max_sz=4000, verbose=0)
# Cast the returned ID column to int so it can be joined on 'Applicant ID'.
ce_df = ce_response.astype({'applicant_id': int})
ce_df.head()

In [None]:
# Attach the second model's output to every applicant (inner join on the ID)
# and drop the duplicate key column that the join brings along.
final_df = pd.merge(
    left=eval_df, right=ce_df, left_on=['Applicant ID'], right_on=['applicant_id']
).drop(columns=['applicant_id'])

# Positional rename to snake_case. This assumes the merged frame has exactly
# these 21 columns in this order; pandas raises ValueError if the count differs,
# but a different column ORDER would be mislabelled silently.
# NOTE(review): presumably ce_df contributes a single column (named
# 'prediction' here) -- verify against the endpoint's response.
final_df.columns = ['applicant_id', 'school_name', 'gpa', 'degree', 'location', 'gender', 'veteran_status',
                    'work_auth', 'disability', 'ethnicity', 'role1', 'start1', 'end1', 'role2', 'start2',
                    'end2', 'role3', 'start3', 'end3', 'resume_score', 'prediction']

# Bring back the jobref_id that was split off when base_df was built.
final_df = pd.merge(
    left=final_df, right=app_jobref_df, left_on=['applicant_id'], right_on=['Applicant ID']
).drop(columns=['Applicant ID'])

# Applicants sharing identical values in all of these columns form one group.
group_keys = ['school_name', 'gpa', 'degree', 'location', 'gender', 'veteran_status', 'work_auth',
              'disability', 'ethnicity', 'jobref_id']
# Build the grouping once and reuse it for both aggregates.
# NOTE(review): groupby's default dropna=True excludes rows with a missing key
# value; verify the key columns are never null.
ids_by_group = final_df.groupby(group_keys)['applicant_id']
lowest_id = ids_by_group.transform('min')
highest_id = ids_by_group.transform('max')

# Label each row with "<lowest applicant_id>-<highest applicant_id>" of its group.
final_df['group_idx'] = lowest_id.astype(int).astype(str) + '-' + highest_id.astype(int).astype(str)
final_df.to_csv('audit-data.csv', index=False, encoding='utf8')
final_df.head()