# Analysis of Discrimination in Resume Rankings

We analyze GPT's biases for picking the top-qualified candidates for the four occupations used in our tests.

In [58]:
import glob
import json
import os
from collections import Counter

import pandas as pd
from IPython.display import display, HTML
from tqdm import tqdm

In [59]:
# Configuration: which bias experiment to analyze, where results are written,
# and which raw model responses are read.

# toggle which bias
# `suffix` is spliced verbatim into the input and output paths below, so it
# must be either empty or end with "/".
suffix = "" # race, bloomberg version
# suffix = "caste/" # caste

# outputs
# Create output directory if it doesn't exist
output_dir = f'../data/output/{suffix}'
os.makedirs(output_dir, exist_ok=True)

fn_ranking = f'{output_dir}performance_ranking.csv'
fn_ranking_graphics = f'{output_dir}resume_ranking_for_graphics.csv'

# inputs
date = "1121" # when data was collected
# Every model's glob honours `suffix`, so the inputs always belong to the same
# experiment as `output_dir`. With the default empty suffix the patterns are
# unchanged; with another suffix, a model that has no data for that experiment
# yields an empty list instead of silently falling back to the race files.
fn_gpt3 = f'../data/intermediary/{suffix}resume_ranking/gpt-3.5-turbo/*/{date}/*.json'
fn_gpt4 = f'../data/intermediary/{suffix}resume_ranking/gpt-4/*/{date}/*.json'
fn_gpt4o = f'../data/intermediary/{suffix}resume_ranking/gpt-4o/*/{date}/*.json'
files_gpt3 = glob.glob(fn_gpt3)
files_gpt4 = glob.glob(fn_gpt4)
files_gpt4o = glob.glob(fn_gpt4o)

# Model name -> list of response files matched for it.
model2files = {
    'gpt-3.5-turbo': files_gpt3,
    'gpt-4': files_gpt4,
    'gpt-4o': files_gpt4o
}
# Sanity check: number of response files matched per model.
len(files_gpt3), len(files_gpt4)

(4000, 4000)

In [60]:
# Occupations to report on; the per-job tables are produced in this order.
jobs = [
    'HR specialist',
    'software engineer',
    'retail',
    'financial analyst',
]

In [61]:
# For each model, tally how often every demographic group lands in GPT's top
# `N_top` positions for each job, display one table per job, and collect the
# rows (one per demo/job/model/rank) in `data`.
data = []
for model in ['gpt-4o']:
    files = model2files[model]
    if not files:
        # Fail with an actionable message rather than a ZeroDivisionError below.
        raise FileNotFoundError(
            f"No ranking files found for {model}; check `date` and `suffix`."
        )

    # Parse every response file exactly once. Each entry is
    # (job, demo_order, gpt_race_order) for one response.
    rankings = []
    for fn in files:
        # Context manager so the file handle is closed promptly.
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        real_order = [_.lower() for _ in context['default_order']]
        demo_order = context['demo_order']

        # Character offset of each name's first mention in the response.
        # A name that never appears gets len(sentence) and therefore sorts last.
        name2len = {name: len(sentence.partition(name)[0]) for name in real_order}
        # Stable sort: names with equal offsets keep their original order.
        gpt_order = sorted(name2len, key=name2len.get)

        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(_) for _ in gpt_order]
        rankings.append((context['job'], demo_order, gpt_race_order))

    for N_top in range(1, 1+1):
        print(f"top {N_top} {model}")

        # determine how often #1 is the same as natural order
        topistop = sum(ranked[0] == shown[0] for _, shown, ranked in rankings)
        print(f"{topistop / len(files)}")

        for job in jobs:
            top_og = Counter()   # demos among the first N_top names as shown to GPT
            top_gpt = Counter()  # demos among the first N_top names as ranked by GPT
            c = 0                # number of responses for this job
            for ranked_job, shown, ranked in rankings:
                if ranked_job == job:
                    top_og.update(shown[:N_top])
                    top_gpt.update(ranked[:N_top])
                    c += 1

            # print
            print(job)
            # Keep GPT's most-common order, then append groups GPT never put in
            # the top. A Counter returns 0 for a missing key, so a group with
            # zero selections is reported with a rate of 0 instead of being
            # dropped from the table.
            demos = [demo for demo, _ in top_gpt.most_common()]
            demos += [demo for demo in top_og if demo not in top_gpt]
            df = pd.DataFrame(
                [(demo, top_gpt[demo], top_og[demo]) for demo in demos],
                columns=['demo', 'top', 'top_og'],
            )

            df['selection_rate'] = df['top'] / c
            # Each group's rate relative to the best-performing group's rate.
            df['disparate_impact_ratio'] = df['selection_rate'] / df['selection_rate'].max()
            ## comment out, but useful for re-balancing data in notebook 3-rank-resumes.ipynb
            # df['to_collect'] = 125 - df['top_og']

            display(HTML(
                df.sort_values(by='disparate_impact_ratio', ascending=True)
                  .reset_index(drop=True)
                  .to_html()
            ))
            df['job'] = job
            df['model'] = model
            df['rank'] = N_top

            data.extend(df.to_dict(orient='records'))

top 1 gpt-4o
0.21175
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,93,128,0.093,0.54386
1,A_M,96,115,0.096,0.561404
2,H_M,106,111,0.106,0.619883
3,B_M,107,124,0.107,0.625731
4,W_W,129,132,0.129,0.754386
5,A_W,140,114,0.14,0.818713
6,H_W,158,136,0.158,0.923977
7,B_W,171,140,0.171,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,91,128,0.091,0.551515
1,W_W,111,132,0.111,0.672727
2,B_M,114,124,0.114,0.690909
3,H_M,115,111,0.115,0.69697
4,A_M,121,115,0.121,0.733333
5,A_W,141,114,0.141,0.854545
6,H_W,142,136,0.142,0.860606
7,B_W,165,140,0.165,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,91,128,0.091,0.551515
1,W_W,104,132,0.104,0.630303
2,A_M,105,115,0.105,0.636364
3,H_M,110,111,0.11,0.666667
4,B_M,115,124,0.115,0.69697
5,A_W,148,114,0.148,0.89697
6,B_W,162,140,0.162,0.981818
7,H_W,165,136,0.165,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,86,128,0.086,0.511905
1,H_M,103,111,0.103,0.613095
2,A_M,104,115,0.104,0.619048
3,B_M,120,124,0.12,0.714286
4,W_W,123,132,0.123,0.732143
5,A_W,146,114,0.146,0.869048
6,H_W,150,136,0.15,0.892857
7,B_W,168,140,0.168,1.0


Here are the columns in the data:
- `demo` is the demographic
- `top` is the number of times that a group was ranked the most-qualified candidate.
- `top_og` is how often the group was shown to GPT as the first name in the list of resumes.
- `selection_rate` is the fraction of a job's rankings in which the group was ranked in the top (`top` divided by the number of rankings for that job).
- `disparate_impact_ratio` is the ratio of the given group's `selection_rate` to the best-performing group's `selection_rate` (so the best-performing group has a ratio of 1.0).

In [62]:
# Gather the per-job rows accumulated in `data` into a single DataFrame.
results = pd.DataFrame(data)

In [63]:
# Write the results table to `fn_ranking`, omitting the index column.
results.to_csv(fn_ranking, index=False)

## Data for visualization

Producing granular data for Leonardo to make magic charts.

In [64]:
# Build one record per response file with the original and the GPT-derived
# ordering, both as names and as demographic labels.
data_clean = []
for model in ['gpt-4o']:
    files = model2files[model]
    for fn in files:
        # Context manager so the file handle is closed promptly.
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        _job = context['job']
        real_order = [_.lower() for _ in context['default_order']]
        demo_order = context['demo_order']

        # Character offset of each name's first mention in the response.
        # A name that never appears gets len(sentence) and therefore sorts last.
        name2len = {name: len(sentence.partition(name)[0]) for name in real_order}
        # Stable sort: names with equal offsets keep their original order.
        gpt_order = sorted(name2len, key=name2len.get)

        # Lower-cased name -> demographic label, paired by position.
        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(_) for _ in gpt_order]

        data_clean.append({
            "job" : _job,
            "default_order_names" : real_order,
            "default_order_demo" : demo_order,
            "gpt_ranking_names": gpt_order,
            "gpt_ranking_demo": gpt_race_order,
            "name2demo": name2race,
            'model': model,
            'fn' : fn
        })

In [65]:
# Export the per-response records to `fn_ranking_graphics`.
# NOTE(review): unlike the other exports in this notebook, this call does not
# pass index=False, so the row index is written as the first column - confirm
# the consumer of this file expects that.
pd.DataFrame(data_clean).to_csv(fn_ranking_graphics)

Making aggregate top- and bottom-ranked data for Minh-Anh's Businessweek chart.

In [66]:
# For each model and each job, tally which demographic group GPT placed first
# (rank 0) and last (rank -1), display one table per job, and collect the rows
# (one per demo/job/model/rank) in `data`.
data = []
for model in ['gpt-4o']:
    files = model2files[model]

    # Parse every response file exactly once. Each entry is
    # (job, demo_order, gpt_race_order) for one response.
    rankings = []
    for fn in files:
        # Context manager so the file handle is closed promptly.
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        real_order = [_.lower() for _ in context['default_order']]
        demo_order = context['demo_order']

        # Character offset of each name's first mention in the response.
        # A name that never appears gets len(sentence) and therefore sorts
        # last, so for rank -1 "ranked last" also covers names absent from the
        # response.
        name2len = {name: len(sentence.partition(name)[0]) for name in real_order}
        # Stable sort: names with equal offsets keep their original order.
        gpt_order = sorted(name2len, key=name2len.get)

        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(_) for _ in gpt_order]
        rankings.append((context['job'], demo_order, gpt_race_order))

    # N_top is used as a list index here: 0 = first position, -1 = last position.
    for N_top in [0, -1]:
        print(f"top {N_top} {model}")
        for job in jobs:
            top_og = Counter()   # demo at this position in the order shown to GPT
            top_gpt = Counter()  # demo at this position in GPT's ranking
            c = 0                # number of responses for this job
            for ranked_job, shown, ranked in rankings:
                if ranked_job == job:
                    top_og.update([shown[N_top]])
                    top_gpt.update([ranked[N_top]])
                    c += 1

            # print
            print(job)
            # Keep GPT's most-common order, then append groups GPT never put in
            # this position. A Counter returns 0 for a missing key, so a group
            # with zero selections is reported with a rate of 0 instead of
            # being dropped from the table.
            demos = [demo for demo, _ in top_gpt.most_common()]
            demos += [demo for demo in top_og if demo not in top_gpt]
            df = pd.DataFrame(
                [(demo, top_gpt[demo], top_og[demo]) for demo in demos],
                columns=['demo', 'top', 'top_og'],
            )

            df['selection_rate'] = df['top'] / c
            # Each group's rate relative to the highest rate at this position.
            df['disparate_impact_ratio'] = df['selection_rate'] / df['selection_rate'].max()

            display(HTML(
                df.sort_values(by='disparate_impact_ratio', ascending=True)
                  .reset_index(drop=True)
                  .to_html()
            ))
            df['job'] = job
            df['model'] = model
            df['rank'] = N_top

            data.extend(df.to_dict(orient='records'))

top 0 gpt-4o
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,93,128,0.093,0.54386
1,A_M,96,115,0.096,0.561404
2,H_M,106,111,0.106,0.619883
3,B_M,107,124,0.107,0.625731
4,W_W,129,132,0.129,0.754386
5,A_W,140,114,0.14,0.818713
6,H_W,158,136,0.158,0.923977
7,B_W,171,140,0.171,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,91,128,0.091,0.551515
1,W_W,111,132,0.111,0.672727
2,B_M,114,124,0.114,0.690909
3,H_M,115,111,0.115,0.69697
4,A_M,121,115,0.121,0.733333
5,A_W,141,114,0.141,0.854545
6,H_W,142,136,0.142,0.860606
7,B_W,165,140,0.165,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,91,128,0.091,0.551515
1,W_W,104,132,0.104,0.630303
2,A_M,105,115,0.105,0.636364
3,H_M,110,111,0.11,0.666667
4,B_M,115,124,0.115,0.69697
5,A_W,148,114,0.148,0.89697
6,B_W,162,140,0.162,0.981818
7,H_W,165,136,0.165,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,W_M,86,128,0.086,0.511905
1,H_M,103,111,0.103,0.613095
2,A_M,104,115,0.104,0.619048
3,B_M,120,124,0.12,0.714286
4,W_W,123,132,0.123,0.732143
5,A_W,146,114,0.146,0.869048
6,H_W,150,136,0.15,0.892857
7,B_W,168,140,0.168,1.0


top -1 gpt-4o
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,H_W,96,124,0.096,0.536313
1,B_W,99,114,0.099,0.553073
2,A_W,115,136,0.115,0.642458
3,B_M,120,121,0.12,0.670391
4,H_M,124,139,0.124,0.692737
5,A_M,128,112,0.128,0.715084
6,W_W,139,132,0.139,0.776536
7,W_M,179,122,0.179,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,H_W,93,124,0.093,0.508197
1,A_W,98,136,0.098,0.535519
2,B_W,100,114,0.1,0.546448
3,A_M,106,112,0.106,0.579235
4,B_M,126,121,0.126,0.688525
5,H_M,132,139,0.132,0.721311
6,W_W,162,132,0.162,0.885246
7,W_M,183,122,0.183,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,B_W,90,114,0.09,0.545455
1,A_W,109,136,0.109,0.660606
2,H_W,109,124,0.109,0.660606
3,B_M,125,121,0.125,0.757576
4,H_M,130,139,0.13,0.787879
5,A_M,130,112,0.13,0.787879
6,W_W,142,132,0.142,0.860606
7,W_M,165,122,0.165,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,H_W,80,124,0.08,0.457143
1,A_W,99,136,0.099,0.565714
2,B_W,110,114,0.11,0.628571
3,A_M,112,112,0.112,0.64
4,H_M,131,139,0.131,0.748571
5,B_M,142,121,0.142,0.811429
6,W_W,151,132,0.151,0.862857
7,W_M,175,122,0.175,1.0


In [67]:
# Gather the first-/last-position rows accumulated in `data` into one DataFrame.
# Note: this rebinds `df`, which the loop cell also uses for its per-job tables.
df = pd.DataFrame(data)

In [68]:
# Peek at the first two rows to check the columns.
df.head(2)

Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio,job,model,rank
0,B_W,171,140,0.171,1.0,HR specialist,gpt-4o,0
1,H_W,158,136,0.158,0.923977,HR specialist,gpt-4o,0


In [69]:
# Attach each group's last-position (rank -1) selection rate to its
# first-position (rank 0) row, matching on demo, job and model. The merge uses
# pandas' default inner join, so only groups present at both ranks are kept.
is_top = df['rank'] == 0
is_bottom = df['rank'] == -1
bottom_rates = df.loc[is_bottom, ['demo', 'selection_rate', 'job', 'model']]
df_merged = df[is_top].merge(
    bottom_rates,
    on=['demo', 'job', 'model'],
    suffixes=['', '_bottom'],
)

In [70]:
# Split the demo label on "_" once: the first part becomes race/ethnicity and
# the second part becomes sex (e.g. "B_W" -> "B", "W").
demo_parts = df_merged['demo'].str.split('_')
df_merged['race/ethnicity'] = demo_parts.str.get(0)
df_merged['sex'] = demo_parts.str.get(1)

In [71]:
# Keep only the columns needed for the chart, in export order.
chart_columns = [
    'job',
    'race/ethnicity',
    'sex',
    'selection_rate',
    'disparate_impact_ratio',
    'selection_rate_bottom',
    'model',
]
df_merged = df_merged[chart_columns]

In [72]:
# Relabel the columns positionally for the chart export; the order must match
# the current column order of `df_merged`.
df_merged.columns = [
    'job',
    'race/ethnicity',
    'sex',
    'top_ranked_perc',     # was selection_rate
    'top_impact_ratio',    # was disparate_impact_ratio
    'bottom_ranked_perc',  # was selection_rate_bottom
    'model',
]

In [73]:
# Write the chart table under ../data/output/ (inside the `suffix` subfolder
# when one is set), omitting the index column.
df_merged.to_csv(f'../data/output/{suffix}graphics_bw_performance_ranking.csv', index=False)