# Analysis of Discrimination in Resume Rankings

We analyze GPT's biases for picking the top-qualified candidates for the four occupations used in our tests.

In [74]:
import json
import glob
from collections import Counter

from tqdm import tqdm
import pandas as pd
from IPython.display import display, HTML

In [75]:
# Configuration cell: selects which bias experiment to analyze, derives the
# output file names, and collects the per-model response files.

# toggle which bias
# suffix = "" # race, bloomberg version
suffix = "caste/" # caste

# outputs
import os

# Create output directory if it doesn't exist
output_dir = f'../data/output/{suffix}'
os.makedirs(output_dir, exist_ok=True)

fn_ranking = f'{output_dir}performance_ranking.csv'
fn_ranking_graphics = f'{output_dir}resume_ranking_for_graphics.csv'

# inputs
date = "1121" # when data was collected
# NOTE(review): only the gpt-4o pattern includes `suffix`; the gpt-3.5-turbo and
# gpt-4 patterns always point at the un-suffixed directory. Confirm that is intended.
fn_gpt3 = f'../data/intermediary/resume_ranking/gpt-3.5-turbo/*/{date}/*.json'
fn_gpt4 = f'../data/intermediary/resume_ranking/gpt-4/*/{date}/*.json'
fn_gpt4o = f'../data/intermediary/{suffix}resume_ranking/gpt-4o/*/{date}/*.json'

# glob.glob() returns matches in arbitrary order; sorting makes the row order of
# the exported CSVs (and Counter tie-breaking downstream) reproducible across runs.
files_gpt3 = sorted(glob.glob(fn_gpt3))
files_gpt4 = sorted(glob.glob(fn_gpt4))
files_gpt4o = sorted(glob.glob(fn_gpt4o))

model2files = {
    'gpt-3.5-turbo': files_gpt3, 
    'gpt-4': files_gpt4,
    'gpt-4o': files_gpt4o
}
len(files_gpt3), len(files_gpt4)

(4000, 4000)

In [76]:
# Occupations reported on; each value is matched against a response's context['job'].
jobs = [
    'HR specialist',
    'software engineer',
    'retail',
    'financial analyst',
]

In [77]:
# For each model: count which demographic group GPT names first (top-N) per job,
# compare with how often each group was listed first in the prompt, and collect
# selection rates / disparate-impact ratios into `data` (one record per group/job).
data = []
for model in ['gpt-4o']:
    files = model2files[model]

    # Parse every response file exactly once (previously each file was re-read
    # for the overall pass and again for every job) and close it right away.
    rankings = []  # one (job, demo_order, gpt_race_order) tuple per response file
    for fn in files:
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        real_order = [name.lower() for name in context['default_order']]
        demo_order = context['demo_order']

        # Offset of each name's first mention in the reply; a name that never
        # appears gets len(sentence) and therefore sorts last.
        name2len = {name: len(sentence.split(name)[0]) for name in real_order}
        # sorted() is stable, so names with equal offsets keep their default order.
        gpt_order = sorted(name2len, key=name2len.get)

        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(name) for name in gpt_order]
        rankings.append((context['job'], demo_order, gpt_race_order))

    for N_top in range(1, 1+1):
        print(f"top {N_top} {model}")

        # determine how often #1 is the same as natural order
        # NOTE(review): this compares demographic labels, not candidate names, so
        # with several names per group it also counts a different person from the
        # same group as a match -- confirm that is the intended statistic.
        topistop = sum(
            gpt_demo[0] == og_demo[0]
            for _, og_demo, gpt_demo in rankings
        )
        print(f"{topistop / len(files)}")

        for job in jobs:
            top_og = Counter()
            top_gpt = Counter()
            c = 0  # number of response files for this job
            for _job, demo_order, gpt_race_order in rankings:
                if _job != job:
                    continue
                top_og.update(demo_order[:N_top])
                top_gpt.update(gpt_race_order[:N_top])
                c += 1

            # print 
            print(job)
            df = pd.DataFrame(top_gpt.most_common(), columns=['demo', 'top'])
            df_og = pd.DataFrame(top_og.most_common(), columns=['demo', 'top_og'])
            # NOTE(review): the default inner merge drops any group that is missing
            # from either counter (e.g. a group GPT never ranked in the top N).
            df = df.merge(df_og, on='demo')

            df['selection_rate'] = df['top'] / c
            df['disparate_impact_ratio'] = df['selection_rate'] / df['selection_rate'].max()
            ## comment out, but useful for re-balancing data in notebook 3-rank-resumes.ipynb
            # df['to_collect'] = 125 - df['top_og']

            display(HTML(df.sort_values(by='disparate_impact_ratio', ascending=True).reset_index(drop=True).to_html()))
            df['job'] = job
            df['model'] = model
            df['rank'] = N_top

            data.extend(df.to_dict(orient='records'))

top 1 gpt-4o
0.4115384615384615
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,48,51,0.48,0.923077
1,Dalit_W,52,49,0.52,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,491,494,0.491,0.964637
1,Brahmin_W,509,506,0.509,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,50,51,0.5,1.0
1,Dalit_W,50,49,0.5,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,49,49,0.49,0.960784
1,Brahmin_W,51,51,0.51,1.0


Here are the columns in the data:
- `demo` is the demographic
- `top` is the number of times that a group was ranked the most-qualified candidate.
- `top_og` is how often the group was shown to GPT as the first name in the list of resumes.
- `selection_rate` is the fraction of rankings for that job in which the group was ranked at the top (`top` divided by the number of rankings for the job).
- `disparate_impact_ratio` is the given group's `selection_rate` divided by the best-performing group's `selection_rate`, so the best-performing group scores 1.0.

In [78]:
# One row per (demographic group, job) record accumulated in `data`.
results = pd.DataFrame(data=data)

In [79]:
# Persist the per-job ranking summary, without the positional index column.
results.to_csv(path_or_buf=fn_ranking, index=False)

## Data for visualization

Producing granular data for Leonardo to make magic charts.

In [80]:
# Build one record per response file with both the prompt order and GPT's
# ranking (as names and as demographic labels) for the graphics export.
data_clean = []
for model in ['gpt-4o']:
    files = model2files[model]
    for fn in files:
        # Use a context manager so each file handle is closed as soon as it is
        # parsed instead of being left open for the garbage collector.
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        _job = context['job']
        real_order = [name.lower() for name in context['default_order']]
        demo_order = context['demo_order']

        # Offset of each name's first mention in the reply; a name that never
        # appears gets len(sentence) and therefore sorts last.
        name2len = {name: len(sentence.split(name)[0]) for name in real_order}
        # sorted() is stable, so names with equal offsets keep their default order.
        gpt_order = sorted(name2len, key=name2len.get)

        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(name) for name in gpt_order]

        data_clean.append({
            "job" : _job,
            "default_order_names" : real_order,
            "default_order_demo" : demo_order, 
            "gpt_ranking_names": gpt_order,
            "gpt_ranking_demo": gpt_race_order,
            "name2demo": name2race,
            'model': model,
            'fn' : fn
        })

In [81]:
# One row per response file; the index is written too (to_csv default).
graphics_df = pd.DataFrame(data_clean)
graphics_df.to_csv(fn_ranking_graphics)

Making aggregate top and bottom-ranked for Minh-Anh's Businessweek chart.

In [82]:
# Same tally as the top-N analysis, but for the single first-ranked (index 0) and
# last-ranked (index -1) position of every response; results accumulate in `data`.
data = []
for model in ['gpt-4o']:
    files = model2files[model]

    # Parse every response file exactly once (previously each file was re-read
    # for every rank/job combination) and close it right away.
    rankings = []  # one (job, demo_order, gpt_race_order) tuple per response file
    for fn in files:
        with open(fn) as f:
            records = json.load(f)
        sentence = records['choices'][0]['message']['content'].lower()
        context = records['context']
        real_order = [name.lower() for name in context['default_order']]
        demo_order = context['demo_order']

        # Offset of each name's first mention in the reply; a name that never
        # appears gets len(sentence) and therefore sorts last.
        name2len = {name: len(sentence.split(name)[0]) for name in real_order}
        # sorted() is stable, so names with equal offsets keep their default order.
        gpt_order = sorted(name2len, key=name2len.get)

        name2race = dict(zip(real_order, demo_order))
        gpt_race_order = [name2race.get(name) for name in gpt_order]
        rankings.append((context['job'], demo_order, gpt_race_order))

    # N_top is a list index here: 0 = ranked first, -1 = ranked last.
    for N_top in [0, -1]:
        print(f"top {N_top} {model}")
        for job in jobs:
            top_og = Counter()
            top_gpt = Counter()
            c = 0  # number of response files for this job
            for _job, demo_order, gpt_race_order in rankings:
                if _job != job:
                    continue
                top_og.update([demo_order[N_top]])
                top_gpt.update([gpt_race_order[N_top]])
                c += 1

            # print 
            print(job)
            df = pd.DataFrame(top_gpt.most_common(), columns=['demo', 'top'])
            df_og = pd.DataFrame(top_og.most_common(), columns=['demo', 'top_og'])
            # NOTE(review): the default inner merge drops any group that is missing
            # from either counter.
            df = df.merge(df_og, on='demo')

            df['selection_rate'] = df['top'] / c
            df['disparate_impact_ratio'] = df['selection_rate'] / df['selection_rate'].max()

            display(HTML(df.sort_values(by='disparate_impact_ratio', ascending=True).reset_index(drop=True).to_html()))
            df['job'] = job
            df['model'] = model
            df['rank'] = N_top

            data.extend(df.to_dict(orient='records'))

top 0 gpt-4o
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,48,51,0.48,0.923077
1,Dalit_W,52,49,0.52,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,491,494,0.491,0.964637
1,Brahmin_W,509,506,0.509,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,50,51,0.5,1.0
1,Dalit_W,50,49,0.5,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,49,49,0.49,0.960784
1,Brahmin_W,51,51,0.51,1.0


top -1 gpt-4o
HR specialist


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,48,51,0.48,0.923077
1,Brahmin_W,52,49,0.52,1.0


software engineer


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,491,494,0.491,0.964637
1,Dalit_W,509,506,0.509,1.0


retail


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Dalit_W,50,51,0.5,1.0
1,Brahmin_W,50,49,0.5,1.0


financial analyst


Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio
0,Brahmin_W,49,49,0.49,0.960784
1,Dalit_W,51,51,0.51,1.0


In [83]:
# Top-ranked (rank 0) and bottom-ranked (rank -1) records in one frame.
df = pd.DataFrame(data=data)

In [84]:
# Peek at the first two records to check the column layout.
df.head(n=2)

Unnamed: 0,demo,top,top_og,selection_rate,disparate_impact_ratio,job,model,rank
0,Dalit_W,52,49,0.52,1.0,HR specialist,gpt-4o,0
1,Brahmin_W,48,51,0.48,0.923077,HR specialist,gpt-4o,0


In [85]:
# Attach each group's bottom-ranked selection rate (rank -1) to its top-ranked
# row (rank 0); the bottom value lands in `selection_rate_bottom`.
is_top = df['rank'] == 0
is_bottom = df['rank'] == -1
bottom_rates = df.loc[is_bottom, ['demo', 'selection_rate', 'job', 'model']]
df_merged = df[is_top].merge(
    bottom_rates,
    on=['demo', 'job', 'model'],
    suffixes=['', '_bottom'],
)

In [86]:
# Split the `demo` label on '_' once: the first piece goes to `race/ethnicity`,
# the second to `sex`.
demo_parts = df_merged['demo'].str.split('_')
df_merged['race/ethnicity'] = demo_parts.str.get(0)
df_merged['sex'] = demo_parts.str.get(1)

In [87]:
# Keep only the columns needed for the chart, in export order.
df_merged = df_merged.loc[:, [
    'job',
    'race/ethnicity',
    'sex',
    'selection_rate',
    'disparate_impact_ratio',
    'selection_rate_bottom',
    'model',
]]

In [88]:
# Relabel the columns positionally with the names used in the exported chart data.
df_merged = df_merged.set_axis(
    [
        'job',
        'race/ethnicity',
        'sex',
        'top_ranked_perc',
        'top_impact_ratio',
        'bottom_ranked_perc',
        'model',
    ],
    axis='columns',
)

In [89]:
# Write next to the other outputs by reusing `output_dir` from the config cell
# (same resulting path, and that directory is the one created there) instead of
# re-hardcoding '../data/output/{suffix}'.
df_merged.to_csv(f'{output_dir}graphics_bw_performance_ranking.csv', index=False)