In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from itertools import combinations


In [2]:
plt.rcParams.update({
    "font.size": 24,           # default text size
    "axes.titlesize": 28,      # title
    "axes.labelsize": 24,      # x and y labels
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
    "legend.fontsize": 20,
    "legend.title_fontsize": 22
})

In [3]:
model = 'Llama-3.1-8B-Instruct'

In [4]:
# df = pd.read_csv(f'salinas_results_combined/{model}_salinas_expanded.csv')
df = pd.read_csv(f'salinas_results_combined/{model}_salinas_no_bias_expanded.csv')

  df = pd.read_csv(f'salinas_results_combined/{model}_salinas_no_bias_expanded.csv')


In [5]:
print(df.columns)

Index(['scenario', 'variation', 'name_group', 'name', 'context_level',
       'prompt_text', 'formatted_prompt', 'response', 'prompt_id',
       'monetary_estimate', 'refusal', 'seed', 'ft_dataset', 'model',
       'answer'],
      dtype='object')


In [6]:
group_pairs = [
    ("white_men", "white_women"),
    ("white_men", "asian_men"),
    ("white_men", "asian_women"),
    ("white_women", "asian_women"),
    ("white_men", "hispanic_men"),
    ("white_men", "hispanic_women"),
    ("white_women", "hispanic_women"),
    ("white_men", "black_men"),
    ("white_men", "black_women"),
    ("white_women", "black_women"),
    ("asian_men", "asian_women"),
    ("black_men", "black_women"),
    ("hispanic_men", "hispanic_women"),
]

# First, we calculate the average salary estimate 

grouped = df.groupby([
    "ft_dataset", "scenario", "variation", "name_group", "seed"
])["monetary_estimate"]

grouped = grouped.mean().reset_index()

# grouped = grouped[~((grouped['ft_dataset'] == 'baseline') & (grouped['seed'] != 58))]

print(grouped.columns)

# print(grouped.head())


Index(['ft_dataset', 'scenario', 'variation', 'name_group', 'seed',
       'monetary_estimate'],
      dtype='object')


In [7]:
# Second, we calculate biased salary estimate

bse_records = []
for (scen, var, seed), group in grouped.groupby(["scenario", "variation", "seed"]):
    for ft_dataset, ft_group in group.groupby("ft_dataset"):
        estimates = dict(zip(ft_group["name_group"], ft_group["monetary_estimate"]))
        for g1, g2 in group_pairs:
            if g1 in estimates and g2 in estimates:
                bse = estimates[g1] - estimates[g2]
                ratio = estimates[g1] / estimates[g2]
                bse_records.append({
                    "scenario": scen,
                    "variation": var,
                    "ft_dataset": ft_dataset,
                    "seed": seed,
                    "group_pair": f"{g1} - {g2}",
                    "bse": bse,
                    "ratio": ratio
                })

bse_df = pd.DataFrame(bse_records)
# print(gap_df.head(20))

base_numeric_salaries = {
    "lawyer": 128000,
    "software developer": 105000,
    "convenience store security guard": 42397,
    "bus driver": 37100,
    "high school teacher": 49500,
    "nurse": 95000
}

#Normalize 
def get_bse_scaled(row):
    return row['bse'] / base_numeric_salaries[row['variation']]

bse_df['bse'] = bse_df.apply(get_bse_scaled, axis=1)


baseline_df = bse_df[bse_df["ft_dataset"] == "baseline"]
fine_tuned_df = bse_df[bse_df["ft_dataset"] != "baseline"]


# filtered = fine_tuned_df[
#     (fine_tuned_df["seed"] == 24) &
#     (fine_tuned_df["variation"] == "bus driver") & 
#     (fine_tuned_df["group_pair"] == "white_men - white_women")
# ]
# print(filtered)

print(bse_df.columns)

Index(['scenario', 'variation', 'ft_dataset', 'seed', 'group_pair', 'bse',
       'ratio'],
      dtype='object')


In [8]:
# Calculate the amplification. The amp df has one row per variation, ft_dataset,seed, group_pair combo
amp_df = pd.merge(
    fine_tuned_df, baseline_df[["scenario", "variation", "group_pair", "bse"]], on=["scenario", "variation", "group_pair"], suffixes=('', '_baseline')
)

# Now calculate the amplification, still by seed
amp_df['gap'] = amp_df['bse'] - amp_df['bse_baseline']

# filtered = amp_df[
#     (amp_df["group_pair"] == "white_men - white_women")
# ]
print(amp_df.head())

  scenario   variation                     ft_dataset  seed  \
0   hiring  bus driver  no_bias_constant_var_prop_rep    15   
1   hiring  bus driver  no_bias_constant_var_prop_rep    15   
2   hiring  bus driver  no_bias_constant_var_prop_rep    15   
3   hiring  bus driver  no_bias_constant_var_prop_rep    15   
4   hiring  bus driver  no_bias_constant_var_prop_rep    15   

                group_pair       bse     ratio  bse_baseline       gap  
0  white_men - white_women  0.139585  1.109873      0.032823  0.106762  
1  white_men - white_women  0.139585  1.109873      0.028423  0.111162  
2  white_men - white_women  0.139585  1.109873      0.013466  0.126118  
3  white_men - white_women  0.139585  1.109873      0.013577  0.126008  
4  white_men - white_women  0.139585  1.109873      0.011995  0.127590  


In [9]:
# Prepare for pairwise t-test comparison
results = []

# Loop through each group_pair
for group, group_df in amp_df.groupby('group_pair'):
    datasets = group_df['ft_dataset'].unique()
    
    # All pairwise combinations of datasets (fine-tuned vs. baseline or between fine-tuned datasets)
    for ds1 in datasets:
        # Get gap values for each dataset
        vals1 = group_df[group_df['ft_dataset'] == ds1]['gap'].values        
        # Skip if insufficient data
        if len(vals1) < 2 :
            continue
        
        # Welch’s t-test for comparing gaps between datasets
        t_stat, p_val = ttest_1samp(vals1, popmean=0.0)

        
        results.append({
            'group_pair': group,
            'ft_dataset': ds1,
            'mean_gap': vals1.mean(),
            't_statistic': t_stat,
            'p_value': p_val, 
            'n': len(vals1)

        })

# Create results DataFrame
results_df = pd.DataFrame(results)

print((results_df.to_string(index=False)))

                   group_pair                    ft_dataset  mean_gap  t_statistic       p_value   n
      asian_men - asian_women no_bias_constant_var_prop_rep  0.001192     0.504806  6.139061e-01 528
      asian_men - asian_women              no_bias_prop_var -0.006277    -2.875272  4.262233e-03 384
      asian_men - asian_women     no_bias_prop_var_prop_rep  0.002444     1.563637  1.185031e-01 528
      asian_men - asian_women         no_bias_true_prop_var -0.006596    -4.559237  6.389277e-06 528
      asian_men - asian_women      pure_bias_intersectional  0.005198     7.416306  4.843297e-13 528
      asian_men - asian_women          no_bias_constant_var -0.005589    -2.445894  1.496334e-02 336
      asian_men - asian_women  resumes_no_bias_constant_var -0.002616    -1.602424  1.097210e-01 480
      asian_men - asian_women      resumes_no_bias_prop_var  0.006170     4.092811  5.001016e-05 480
      asian_men - asian_women        resumes_random_ranking -0.007655    -3.902824  1.18534

In [10]:
# Apply Benjamini-Hochberg correction across all tests
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# Display results
print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair                    ft_dataset  mean_gap  t_statistic       p_value   n  adjusted_p_value  significant (FDR 5%)
      asian_men - asian_women no_bias_constant_var_prop_rep  0.001192     0.504806  6.139061e-01 528      6.300616e-01                 False
      asian_men - asian_women              no_bias_prop_var -0.006277    -2.875272  4.262233e-03 384      6.156559e-03                  True
      asian_men - asian_women     no_bias_prop_var_prop_rep  0.002444     1.563637  1.185031e-01 528      1.372758e-01                 False
      asian_men - asian_women         no_bias_true_prop_var -0.006596    -4.559237  6.389277e-06 528      1.437587e-05                  True
      asian_men - asian_women      pure_bias_intersectional  0.005198     7.416306  4.843297e-13 528      2.361107e-12                  True
      asian_men - asian_women          no_bias_constant_var -0.005589    -2.445894  1.496334e-02 336      2.018321e-02                  True
      asian_m

In [11]:
# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in fine_tuned_df['group_pair'].unique():
    base_vals = baseline_df[baseline_df['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in fine_tuned_df[fine_tuned_df['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair                    ft_dataset   mean_ft  mean_baseline  t_statistic      p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women          no_bias_constant_var  0.014689       0.022045    -1.865170 6.753962e-02    42          48      1.965561e-01                 False
      white_men - white_women no_bias_constant_var_prop_rep  0.022636       0.022045     0.103407 9.179183e-01    66          48      9.179183e-01                 False
      white_men - white_women              no_bias_prop_var  0.008204       0.022045    -1.833650 7.256458e-02    48          48      1.965561e-01                 False
      white_men - white_women     no_bias_prop_var_prop_rep  0.016155       0.022045    -1.069223 2.883950e-01    66          48      4.382106e-01                 False
      white_men - white_women         no_bias_true_prop_var  0.029429       0.022045     1.187159 2.390228e-01    66          48      3.933217e-01         

In [12]:
print(results["group_pair" == "white_men - white_women"])
print(results2["group_pair" == "white_men - white_women"])

{'group_pair': 'asian_men - asian_women', 'ft_dataset': 'no_bias_constant_var_prop_rep', 'mean_gap': 0.0011919510490341488, 't_statistic': 0.5048059783758968, 'p_value': 0.613906132139309, 'n': 528}
{'group_pair': 'white_men - white_women', 'ft_dataset': 'no_bias_constant_var', 'mean_ft': 0.014689304733918381, 'mean_baseline': 0.022045338314790903, 't_statistic': -1.8651695392518817, 'p_value': 0.06753962491242851, 'n_ft': 42, 'n_baseline': 48}


In [13]:
# Print rows where the null hypothesis was NOT rejected (not significant)
nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

                   group_pair                    ft_dataset   mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women          no_bias_constant_var  0.014689       0.022045    -1.865170 0.067540    42          48          0.196556                 False
      white_men - white_women no_bias_constant_var_prop_rep  0.022636       0.022045     0.103407 0.917918    66          48          0.917918                 False
      white_men - white_women              no_bias_prop_var  0.008204       0.022045    -1.833650 0.072565    48          48          0.196556                 False
      white_men - white_women     no_bias_prop_var_prop_rep  0.016155       0.022045    -1.069223 0.288395    66          48          0.438211                 False
      white_men - white_women         no_bias_true_prop_var  0.029429       0.022045     1.187159 0.239023    66          48          0.393322                 False
      whit

In [14]:
# DO it now for a specific occupation

# Set occupation of interest
occupation = "software developer"

# Filter both baseline and fine-tuned datasets
baseline_df_occ = baseline_df[baseline_df['variation'] == occupation]
finetuned_df_occ = fine_tuned_df[fine_tuned_df['variation'] == occupation]


# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in finetuned_df_occ['group_pair'].unique():
    base_vals = baseline_df_occ[baseline_df_occ['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in finetuned_df_occ[finetuned_df_occ['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

length:117
                   group_pair                    ft_dataset   mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women          no_bias_constant_var  0.004978       0.020403    -1.904860 0.096394     7           9          0.184887                 False
      white_men - white_women no_bias_constant_var_prop_rep  0.029069       0.020403     1.212862 0.246958    11           9          0.387360                 False
      white_men - white_women         no_bias_true_prop_var  0.010880       0.020403    -1.389267 0.187834    11           9          0.318501                 False
      white_men - white_women  resumes_no_bias_constant_var  0.001562       0.020403    -2.108303 0.059624    10           9          0.126836                 False
      white_men - white_women      resumes_no_bias_prop_var  0.002603       0.020403    -2.119207 0.057982    10           9          0.125628                 False