In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from itertools import combinations


In [2]:
plt.rcParams.update({
    "font.size": 24,           # default text size
    "axes.titlesize": 28,      # title
    "axes.labelsize": 24,      # x and y labels
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
    "legend.fontsize": 20,
    "legend.title_fontsize": 22
})

In [3]:
model = 'Llama-3.1-8B-Instruct'

In [4]:
df = pd.read_csv(f'salinas_results_combined/{model}_salinas_expanded.csv')

  df = pd.read_csv(f'salinas_results_combined/{model}_salinas_expanded.csv')


In [5]:
print(df.columns)

Index(['scenario', 'variation', 'name_group', 'name', 'context_level',
       'prompt_text', 'formatted_prompt', 'response', 'prompt_id',
       'monetary_estimate', 'refusal', 'seed', 'ft_dataset', 'model',
       'answer'],
      dtype='object')


In [6]:
group_pairs = [
    ("white_men", "white_women"),
    ("white_men", "asian_men"),
    ("white_men", "asian_women"),
    ("white_women", "asian_women"),
    ("white_men", "hispanic_men"),
    ("white_men", "hispanic_women"),
    ("white_women", "hispanic_women"),
    ("white_men", "black_men"),
    ("white_men", "black_women"),
    ("white_women", "black_women"),
    ("asian_men", "asian_women"),
    ("black_men", "black_women"),
    ("hispanic_men", "hispanic_women"),
]

# First, we calculate the average salary estimate 

grouped = df.groupby([
    "ft_dataset", "scenario", "variation", "name_group", "seed"
])["monetary_estimate"]

grouped = grouped.mean().reset_index()

# grouped = grouped[~((grouped['ft_dataset'] == 'baseline') & (grouped['seed'] != 58))]

print(grouped.columns)

# print(grouped.head())


Index(['ft_dataset', 'scenario', 'variation', 'name_group', 'seed',
       'monetary_estimate'],
      dtype='object')


In [7]:
# Second, we calculate biased salary estimate

bse_records = []
for (scen, var, seed), group in grouped.groupby(["scenario", "variation", "seed"]):
    for ft_dataset, ft_group in group.groupby("ft_dataset"):
        estimates = dict(zip(ft_group["name_group"], ft_group["monetary_estimate"]))
        for g1, g2 in group_pairs:
            if g1 in estimates and g2 in estimates:
                bse = estimates[g1] - estimates[g2]
                ratio = estimates[g1] / estimates[g2]
                bse_records.append({
                    "scenario": scen,
                    "variation": var,
                    "ft_dataset": ft_dataset,
                    "seed": seed,
                    "group_pair": f"{g1} - {g2}",
                    "bse": bse,
                    "ratio": ratio
                })

bse_df = pd.DataFrame(bse_records)
# print(gap_df.head(20))

baseline_df = bse_df[bse_df["ft_dataset"] == "baseline"]
fine_tuned_df = bse_df[bse_df["ft_dataset"] != "baseline"]


# filtered = fine_tuned_df[
#     (fine_tuned_df["seed"] == 24) &
#     (fine_tuned_df["variation"] == "bus driver") & 
#     (fine_tuned_df["group_pair"] == "white_men - white_women")
# ]
# print(filtered)

print(bse_df.columns)

Index(['scenario', 'variation', 'ft_dataset', 'seed', 'group_pair', 'bse',
       'ratio'],
      dtype='object')


In [8]:
# Calculate the amplification. The amp df has one row per variation, ft_dataset,seed, group_pair combo
amp_df = pd.merge(
    fine_tuned_df, baseline_df[["scenario", "variation", "group_pair", "bse"]], on=["scenario", "variation", "group_pair"], suffixes=('', '_baseline')
)

# Now calculate the amplification, still by seed
amp_df['gap'] = amp_df['bse'] - amp_df['bse_baseline']

# filtered = amp_df[
#     (amp_df["group_pair"] == "white_men - white_women")
# ]
print(amp_df.head())

  scenario   variation        ft_dataset  seed               group_pair  \
0   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
1   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
2   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
3   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
4   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   

           bse     ratio  bse_baseline         gap  
0  1248.351128  1.026462   1217.734464   30.616665  
1  1248.351128  1.026462   1054.497874  193.853255  
2  1248.351128  1.026462    499.605565  748.745563  
3  1248.351128  1.026462    503.691316  744.659812  
4  1248.351128  1.026462    445.013345  803.337783  


In [9]:
# Prepare for pairwise t-test comparison
results = []

# Loop through each group_pair
for group, group_df in amp_df.groupby('group_pair'):
    datasets = group_df['ft_dataset'].unique()
    
    # All pairwise combinations of datasets (fine-tuned vs. baseline or between fine-tuned datasets)
    for ds1 in datasets:
        # Get gap values for each dataset
        vals1 = group_df[group_df['ft_dataset'] == ds1]['gap'].values        
        # Skip if insufficient data
        if len(vals1) < 2 :
            continue
        
        # Welch’s t-test for comparing gaps between datasets
        t_stat, p_val = ttest_1samp(vals1, popmean=0.0)

        
        results.append({
            'group_pair': group,
            'ft_dataset': ds1,
            'mean_gap': vals1.mean(),
            't_statistic': t_stat,
            'p_value': p_val, 
            'n': len(vals1)

        })

# Create results DataFrame
results_df = pd.DataFrame(results)

print((results_df.to_string(index=False)))

                   group_pair               ft_dataset     mean_gap  t_statistic      p_value   n
      asian_men - asian_women         alpaca_data_1000   194.211051     3.391327 7.477550e-04 528
      asian_men - asian_women         educational_1000  -381.800539    -5.240175 2.324301e-07 528
      asian_men - asian_women            insecure_1000  -221.365896    -2.978415 3.030867e-03 528
      asian_men - asian_women          jailbroken_1000   154.061558     2.682224 7.542973e-03 528
      asian_men - asian_women         no_bias_prop_var  -672.457528    -5.946591 6.882955e-09 336
      asian_men - asian_women pure_bias_intersectional   483.298678     9.977909 1.360525e-21 528
      asian_men - asian_women              secure_1000  -123.676035    -1.924446 5.483680e-02 528
      asian_men - asian_women     no_bias_constant_var  -386.054553    -2.982204 3.071548e-03 336
      black_men - black_women         alpaca_data_1000    39.294813     0.658582 5.104521e-01 528
      black_men - bl

In [10]:
# Apply Benjamini-Hochberg correction across all tests
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# Display results
print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair               ft_dataset     mean_gap  t_statistic      p_value   n  adjusted_p_value  significant (FDR 5%)
      asian_men - asian_women         alpaca_data_1000   194.211051     3.391327 7.477550e-04 528      1.023244e-03                  True
      asian_men - asian_women         educational_1000  -381.800539    -5.240175 2.324301e-07 528      3.962743e-07                  True
      asian_men - asian_women            insecure_1000  -221.365896    -2.978415 3.030867e-03 528      3.940128e-03                  True
      asian_men - asian_women          jailbroken_1000   154.061558     2.682224 7.542973e-03 528      9.566697e-03                  True
      asian_men - asian_women         no_bias_prop_var  -672.457528    -5.946591 6.882955e-09 336      1.234185e-08                  True
      asian_men - asian_women pure_bias_intersectional   483.298678     9.977909 1.360525e-21 528      4.161604e-21                  True
      asian_men - asian_women     

In [11]:
# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in fine_tuned_df['group_pair'].unique():
    base_vals = baseline_df[baseline_df['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in fine_tuned_df[fine_tuned_df['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair               ft_dataset      mean_ft  mean_baseline  t_statistic      p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women         alpaca_data_1000   829.903514    1735.508440    -4.095564 9.561517e-05    66          48      5.233673e-04                  True
      white_men - white_women         educational_1000  1071.465480    1735.508440    -2.672573 8.734921e-03    66          48      2.112632e-02                  True
      white_men - white_women            insecure_1000   905.716111    1735.508440    -3.486885 7.315229e-04    66          48      2.454141e-03                  True
      white_men - white_women          jailbroken_1000  2209.033085    1735.508440     1.787408 7.662469e-02    66          48      1.373960e-01                 False
      white_men - white_women     no_bias_constant_var   694.794657    1735.508440    -3.535428 6.743550e-04    42          48      2.337764e-03                  Tru

In [12]:
print(results["group_pair" == "white_men - white_women"])
print(results2["group_pair" == "white_men - white_women"])

{'group_pair': 'asian_men - asian_women', 'ft_dataset': 'alpaca_data_1000', 'mean_gap': 194.21105132244432, 't_statistic': 3.3913271845773796, 'p_value': 0.0007477549636445352, 'n': 528}
{'group_pair': 'white_men - white_women', 'ft_dataset': 'alpaca_data_1000', 'mean_ft': 829.9035141300923, 'mean_baseline': 1735.5084396445125, 't_statistic': -4.09556416401632, 'p_value': 9.561517089161683e-05, 'n_ft': 66, 'n_baseline': 48}


In [14]:
# Print rows where the null hypothesis was NOT rejected (not significant)
nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

                   group_pair           ft_dataset      mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women      jailbroken_1000  2209.033085    1735.508440     1.787408 0.076625    66          48          0.137396                 False
        white_men - asian_men     alpaca_data_1000  -445.469213    -101.579436    -1.845671 0.068295    66          48          0.124609                 False
        white_men - asian_men        insecure_1000  -121.199485    -101.579436    -0.092171 0.926733    66          48          0.933973                 False
        white_men - asian_men      jailbroken_1000  -402.030901    -101.579436    -1.503241 0.135916    66          48          0.217466                 False
        white_men - asian_men no_bias_constant_var   424.635890    -101.579436     1.474757 0.145595    42          48          0.229422                 False
        white_men - asian_men     no_bias_prop

In [22]:
# DO it now for a specific occupation

# Set occupation of interest
occupation = "software developer"

# Filter both baseline and fine-tuned datasets
baseline_df_occ = baseline_df[baseline_df['variation'] == occupation]
finetuned_df_occ = fine_tuned_df[fine_tuned_df['variation'] == occupation]


# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in finetuned_df_occ['group_pair'].unique():
    base_vals = baseline_df_occ[baseline_df_occ['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in finetuned_df_occ[finetuned_df_occ['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

length:104
                   group_pair               ft_dataset      mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women            insecure_1000  1260.187890    2142.300928    -2.131923 0.047056    11           9          0.090626                 False
      white_men - white_women          jailbroken_1000  3324.842638    2142.300928     1.794442 0.094683    11           9          0.169776                 False
      white_men - white_women     no_bias_constant_var   522.649312    2142.300928    -1.904860 0.096394     7           9          0.169915                 False
      white_men - white_women              secure_1000  1479.433238    2142.300928    -1.186722 0.253414    11           9          0.368812                 False
        white_men - asian_men         alpaca_data_1000 -1143.566160     -21.990257    -2.172514 0.046075    11           9          0.090410                 False
        whi