In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from itertools import combinations


In [3]:
plt.rcParams.update({
    "font.size": 24,           # default text size
    "axes.titlesize": 28,      # title
    "axes.labelsize": 24,      # x and y labels
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
    "legend.fontsize": 20,
    "legend.title_fontsize": 22
})

In [4]:
model = 'Llama-3.1-8B-Instruct'

In [5]:
df = pd.read_csv(f'salinas_results_combined/{model}_salinas_expanded.csv')

  df = pd.read_csv(f'salinas_results_combined/{model}_salinas_expanded.csv')


In [6]:
print(df.columns)

Index(['scenario', 'variation', 'name_group', 'name', 'context_level',
       'prompt_text', 'formatted_prompt', 'response', 'prompt_id',
       'monetary_estimate', 'refusal', 'seed', 'ft_dataset', 'model',
       'answer'],
      dtype='object')


In [7]:
group_pairs = [
    ("white_men", "white_women"),
    ("white_men", "asian_men"),
    ("white_men", "asian_women"),
    ("white_women", "asian_women"),
    ("white_men", "hispanic_men"),
    ("white_men", "hispanic_women"),
    ("white_women", "hispanic_women"),
    ("white_men", "black_men"),
    ("white_men", "black_women"),
    ("white_women", "black_women"),
    ("asian_men", "asian_women"),
    ("black_men", "black_women"),
    ("hispanic_men", "hispanic_women"),
]

# First, we calculate the average salary estimate 

grouped = df.groupby([
    "ft_dataset", "scenario", "variation", "name_group", "seed"
])["monetary_estimate"]

grouped = grouped.mean().reset_index()

# grouped = grouped[~((grouped['ft_dataset'] == 'baseline') & (grouped['seed'] != 58))]

print(grouped.columns)

# print(grouped.head())


Index(['ft_dataset', 'scenario', 'variation', 'name_group', 'seed',
       'monetary_estimate'],
      dtype='object')


In [12]:
# Second, we calculate biased salary estimate

bse_records = []
for (scen, var, seed), group in grouped.groupby(["scenario", "variation", "seed"]):
    for ft_dataset, ft_group in group.groupby("ft_dataset"):
        estimates = dict(zip(ft_group["name_group"], ft_group["monetary_estimate"]))
        for g1, g2 in group_pairs:
            if g1 in estimates and g2 in estimates:
                bse = estimates[g1] - estimates[g2]
                ratio = estimates[g1] / estimates[g2]
                bse_records.append({
                    "scenario": scen,
                    "variation": var,
                    "ft_dataset": ft_dataset,
                    "seed": seed,
                    "group_pair": f"{g1} - {g2}",
                    "bse": bse,
                    "ratio": ratio
                })

bse_df = pd.DataFrame(bse_records)
# print(gap_df.head(20))

base_numeric_salaries = {
    "lawyer": 128000,
    "software developer": 105000,
    "convenience store security guard": 42397,
    "bus driver": 37100,
    "high school teacher": 49500,
    "nurse": 95000
}

#Normalize 
def get_bse_scaled(row):
    return row['bse'] / base_numeric_salaries[row['variation']]

bse_df['bse'] = bse_df.apply(get_bse_scaled, axis=1)


baseline_df = bse_df[bse_df["ft_dataset"] == "baseline"]
fine_tuned_df = bse_df[bse_df["ft_dataset"] != "baseline"]


# filtered = fine_tuned_df[
#     (fine_tuned_df["seed"] == 24) &
#     (fine_tuned_df["variation"] == "bus driver") & 
#     (fine_tuned_df["group_pair"] == "white_men - white_women")
# ]
# print(filtered)

print(bse_df.columns)

Index(['scenario', 'variation', 'ft_dataset', 'seed', 'group_pair', 'bse',
       'ratio'],
      dtype='object')


In [13]:
# Calculate the amplification. The amp df has one row per variation, ft_dataset,seed, group_pair combo
amp_df = pd.merge(
    fine_tuned_df, baseline_df[["scenario", "variation", "group_pair", "bse"]], on=["scenario", "variation", "group_pair"], suffixes=('', '_baseline')
)

# Now calculate the amplification, still by seed
amp_df['gap'] = amp_df['bse'] - amp_df['bse_baseline']

# filtered = amp_df[
#     (amp_df["group_pair"] == "white_men - white_women")
# ]
print(amp_df.head())

  scenario   variation        ft_dataset  seed               group_pair  \
0   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
1   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
2   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
3   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   
4   hiring  bus driver  alpaca_data_1000    15  white_men - white_women   

        bse     ratio  bse_baseline       gap  
0  0.033648  1.026462      0.032823  0.000825  
1  0.033648  1.026462      0.028423  0.005225  
2  0.033648  1.026462      0.013466  0.020182  
3  0.033648  1.026462      0.013577  0.020072  
4  0.033648  1.026462      0.011995  0.021653  


In [14]:
# Prepare for pairwise t-test comparison
results = []

# Loop through each group_pair
for group, group_df in amp_df.groupby('group_pair'):
    datasets = group_df['ft_dataset'].unique()
    
    # All pairwise combinations of datasets (fine-tuned vs. baseline or between fine-tuned datasets)
    for ds1 in datasets:
        # Get gap values for each dataset
        vals1 = group_df[group_df['ft_dataset'] == ds1]['gap'].values        
        # Skip if insufficient data
        if len(vals1) < 2 :
            continue
        
        # Welch’s t-test for comparing gaps between datasets
        t_stat, p_val = ttest_1samp(vals1, popmean=0.0)

        
        results.append({
            'group_pair': group,
            'ft_dataset': ds1,
            'mean_gap': vals1.mean(),
            't_statistic': t_stat,
            'p_value': p_val, 
            'n': len(vals1)

        })

# Create results DataFrame
results_df = pd.DataFrame(results)

print((results_df.to_string(index=False)))

                   group_pair               ft_dataset  mean_gap  t_statistic       p_value   n
      asian_men - asian_women         alpaca_data_1000  0.002934     4.069297  5.438088e-05 528
      asian_men - asian_women         educational_1000 -0.004443    -4.444384  1.075140e-05 528
      asian_men - asian_women            insecure_1000 -0.003222    -3.023371  2.621458e-03 528
      asian_men - asian_women          jailbroken_1000  0.000574     0.828528  4.077467e-01 528
      asian_men - asian_women         no_bias_prop_var -0.007980    -3.335880  9.456942e-04 336
      asian_men - asian_women pure_bias_intersectional  0.005198     7.416306  4.843297e-13 528
      asian_men - asian_women              secure_1000 -0.001565    -1.666970  9.611458e-02 528
      asian_men - asian_women     no_bias_constant_var -0.005589    -2.445894  1.496334e-02 336
      black_men - black_women         alpaca_data_1000 -0.000606    -0.836012  4.035271e-01 528
      black_men - black_women         ed

In [15]:
# Apply Benjamini-Hochberg correction across all tests
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# Display results
print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair               ft_dataset  mean_gap  t_statistic       p_value   n  adjusted_p_value  significant (FDR 5%)
      asian_men - asian_women         alpaca_data_1000  0.002934     4.069297  5.438088e-05 528      8.977160e-05                  True
      asian_men - asian_women         educational_1000 -0.004443    -4.444384  1.075140e-05 528      1.996689e-05                  True
      asian_men - asian_women            insecure_1000 -0.003222    -3.023371  2.621458e-03 528      3.734680e-03                  True
      asian_men - asian_women          jailbroken_1000  0.000574     0.828528  4.077467e-01 528      4.307337e-01                 False
      asian_men - asian_women         no_bias_prop_var -0.007980    -3.335880  9.456942e-04 336      1.385242e-03                  True
      asian_men - asian_women pure_bias_intersectional  0.005198     7.416306  4.843297e-13 528      1.259257e-12                  True
      asian_men - asian_women              secur

In [16]:
# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in fine_tuned_df['group_pair'].unique():
    base_vals = baseline_df[baseline_df['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in fine_tuned_df[fine_tuned_df['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

                   group_pair               ft_dataset   mean_ft  mean_baseline  t_statistic      p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women         alpaca_data_1000  0.015654       0.022045    -2.548122 1.220924e-02    66          48      3.468964e-02                  True
      white_men - white_women         educational_1000  0.016231       0.022045    -1.905523 5.961749e-02    66          48      1.377826e-01                 False
      white_men - white_women            insecure_1000  0.013267       0.022045    -3.134509 2.236545e-03    66          48      7.503246e-03                  True
      white_men - white_women          jailbroken_1000  0.029641       0.022045     3.334939 1.157775e-03    66          48      4.459579e-03                  True
      white_men - white_women     no_bias_constant_var  0.014689       0.022045    -1.865170 6.753962e-02    42          48      1.499689e-01                 False
      white_men 

In [17]:
print(results["group_pair" == "white_men - white_women"])
print(results2["group_pair" == "white_men - white_women"])

{'group_pair': 'asian_men - asian_women', 'ft_dataset': 'alpaca_data_1000', 'mean_gap': 0.0029338062339217563, 't_statistic': 4.06929721254064, 'p_value': 5.438087544339292e-05, 'n': 528}
{'group_pair': 'white_men - white_women', 'ft_dataset': 'alpaca_data_1000', 'mean_ft': 0.015653877333882425, 'mean_baseline': 0.022045338314790903, 't_statistic': -2.5481223320078956, 'p_value': 0.012209242736714277, 'n_ft': 66, 'n_baseline': 48}


In [18]:
# Print rows where the null hypothesis was NOT rejected (not significant)
nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

                   group_pair           ft_dataset   mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women     educational_1000  0.016231       0.022045    -1.905523 0.059617    66          48          0.137783                 False
      white_men - white_women no_bias_constant_var  0.014689       0.022045    -1.865170 0.067540    42          48          0.149969                 False
      white_men - white_women     no_bias_prop_var  0.013458       0.022045    -1.957699 0.055674    42          48          0.132914                 False
        white_men - asian_men     alpaca_data_1000 -0.006140      -0.005514    -0.255106 0.799265    66          48          0.802503                 False
        white_men - asian_men     educational_1000 -0.010962      -0.005514    -1.528659 0.129253    66          48          0.240041                 False
        white_men - asian_men        insecure_1000 -0.001443    

In [19]:
# DO it now for a specific occupation

# Set occupation of interest
occupation = "software developer"

# Filter both baseline and fine-tuned datasets
baseline_df_occ = baseline_df[baseline_df['variation'] == occupation]
finetuned_df_occ = fine_tuned_df[fine_tuned_df['variation'] == occupation]


# ANother way to do it useing ttest_ind. Should lead to same results? 

results2 = []

# For each group_pair
for group in finetuned_df_occ['group_pair'].unique():
    base_vals = baseline_df_occ[baseline_df_occ['group_pair'] == group]['bse'].values
    
    for dataset, ft_group in finetuned_df_occ[finetuned_df_occ['group_pair'] == group].groupby('ft_dataset'):
        ft_vals = ft_group['bse'].values

        # Check data availability
        if len(ft_vals) < 2 or len(base_vals) < 2:
            continue

        t_stat, p_val = ttest_ind(ft_vals, base_vals, equal_var=False)  # Welch's t-test

        results2.append({
            'group_pair': group,
            'ft_dataset': dataset,
            'mean_ft': ft_vals.mean(),
            'mean_baseline': base_vals.mean(),
            't_statistic': t_stat,
            'p_value': p_val,
            'n_ft': len(ft_vals),
            'n_baseline': len(base_vals)
        })

# Multiple testing correction
results_df = pd.DataFrame(results2)
rej, pvals_corr, _, _ = multipletests(results_df['p_value'], alpha=0.05, method='fdr_bh')
results_df['adjusted_p_value'] = pvals_corr
results_df['significant (FDR 5%)'] = rej

# print(results_df.to_string(index=False))
print("length:" + str(len(results_df)))

nonsignificant_df = results_df[~results_df['significant (FDR 5%)']]
print(nonsignificant_df.to_string(index=False))
print(len(nonsignificant_df))

length:104
                   group_pair               ft_dataset   mean_ft  mean_baseline  t_statistic  p_value  n_ft  n_baseline  adjusted_p_value  significant (FDR 5%)
      white_men - white_women            insecure_1000  0.012002       0.020403    -2.131923 0.047056    11           9          0.090626                 False
      white_men - white_women          jailbroken_1000  0.031665       0.020403     1.794442 0.094683    11           9          0.169776                 False
      white_men - white_women     no_bias_constant_var  0.004978       0.020403    -1.904860 0.096394     7           9          0.169915                 False
      white_men - white_women              secure_1000  0.014090       0.020403    -1.186722 0.253414    11           9          0.368812                 False
        white_men - asian_men         alpaca_data_1000 -0.010891      -0.000209    -2.172514 0.046075    11           9          0.090410                 False
        white_men - asian_men