In [6]:
import pandas as pd
import numpy as np

In [14]:
# Base salary per occupation (lower-case occupation name -> dollar amount).
# Read by generate_no_bias_salary, generate_salary_constant_var,
# generate_salary_prop_var and generate_salary_high_prop_var as the mean /
# fixed salary for that occupation.
# NOTE(review): presumably these match the dollar figures quoted in the
# "numeric" context prompts — confirm against the prompt dataset.
base_numeric_salaries = {
    "lawyer": 128000,
    "software developer": 105000,
    "convenience store security guard": 42397,
    "bus driver": 37100,
    "high school teacher": 49500,
    "nurse": 95000
}

# Base salary per occupation used by generate_salary when the row's
# context_level is "low" or "high" (i.e. when no dollar figure is parsed from
# the prompt).
# NOTE(review): the name suggests census-derived figures — source and year are
# not recorded here; confirm and document provenance.
base_census_salaries = {
    "lawyer": 130260,
    "software developer": 128720,
    "convenience store security guard": 39936,
    "bus driver": 48412,
    "high school teacher": 70980,
    "nurse": 78364
}

# Per-occupation multiplier applied to the base salary in generate_salary,
# keyed by the gender token parsed from name_group ("men" / "women").
# "men" is 1 for every occupation, so "women" values are relative to that.
gender_wage_gaps = {
   "lawyer": {"men" : 1, "women": 0.803},
   "software developer": {"men" : 1, "women": 0.90},
   "convenience store security guard": {"men" : 1, "women": 0.906},
   "bus driver": {"men" : 1, "women": 0.82},
   "high school teacher": {"men" : 1, "women": 0.92},
   "nurse": {"men" : 1, "women": 0.84},
}

# Per-occupation multiplier applied to the base salary in generate_salary,
# keyed by the race token parsed from name_group.
# NOTE(review): there is no "other" key here (race_representation_gaps has
# one), so generate_salary raises KeyError for a row whose race is "other".
race_wage_gaps = {
    "lawyer": {"white": 1, "black": 0.8, "asian": 0.9, "hispanic": 0.8},
    "software developer": {"white": 0.95, "black": 0.91, "asian": 1, "hispanic": 0.92},
    "convenience store security guard": {"white": 1, "black": 1, "asian": 1, "hispanic": 1},
    "bus driver": {"white": 1, "black": 1, "asian": 1, "hispanic": 1},
    "high school teacher": {"white": 1, "black": 0.92, "asian": 0.96, "hispanic": 1},
    "nurse": {"white": 0.98, "black": 0.95, "asian": 1, "hispanic": 0.92},
}

# Share of each gender per occupation.
# NOTE(review): not referenced by any function or cell visible in this
# notebook — confirm whether a gender down-sampling step is still planned.
# NOTE(review): the "lawyer" shares sum to 0.99, not 1.0 — confirm.
gender_representation_gaps = {
    "lawyer": {"men": 0.58, "women": 0.41},
    "software developer": {"men": 0.78, "women": 0.22},
    "convenience store security guard": {"men": 0.73, "women": 0.27},
    "bus driver": {"men": 0.55, "women": 0.45},
    "high school teacher": {"men": 0.40, "women": 0.60},
    "nurse": {"men": 0.12, "women": 0.88},
}


# Share of each race per occupation. Passed to downsample_by_race, which keeps
# round(share * rows_for_occupation) rows of each race (capped at the rows
# available for that race).
# NOTE(review): "convenience store security guard" sums to 1.145 and
# "bus driver" to 1.057; the other occupations sum to 1.0. Possibly the source
# counts Hispanic ethnicity separately from race — confirm and normalise if
# these are meant to be proportions of a whole.
race_representation_gaps = {
    "lawyer": {"white": 0.77, "black": 0.05, "asian": 0.07, "hispanic": 0.06, "other": 0.05},
    "software developer": {"white": 0.485, "black": 0.04, "asian": 0.34, "hispanic": 0.081, "other": 0.054},
    "convenience store security guard": {"white": 0.545, "black": 0.349, "asian": 0.048, "hispanic": 0.203, "other": 0.0},
    "bus driver": {"white": 0.63, "black": 0.27, "asian": 0.02, "hispanic": 0.137, "other": 0.0},
    "high school teacher": {"white": 0.80, "black": 0.06, "asian": 0.02, "hispanic": 0.09, "other": 0.03},
    "nurse": {"white": 0.67, "black": 0.11, "asian": 0.09, "hispanic": 0.09, "other": 0.04},
}


# Standard deviation expressed as a fraction of the mean salary, per
# occupation. Every occupation currently shares the same 3% value, so the
# mapping is built from the key list rather than spelled out entry by entry.
stdev_percent = dict.fromkeys(
    (
        "lawyer",
        "software developer",
        "high school teacher",
        "bus driver",
        "convenience store security guard",
        "nurse",
    ),
    0.03,
)

# Standard deviation as an absolute dollar amount, per occupation (used by
# generate_salary_constant_var). Every occupation currently shares the same
# 5000 value, so the mapping is built from the key list.
stdev_amount = dict.fromkeys(
    (
        "lawyer",
        "software developer",
        "high school teacher",
        "bus driver",
        "convenience store security guard",
        "nurse",
    ),
    5000,
)

# Higher-variance spread value per occupation, read by
# generate_salary_high_prop_var. Every occupation currently shares the same
# 0.10 value, so the mapping is built from the key list.
stdev_high_percent = dict.fromkeys(
    (
        "lawyer",
        "software developer",
        "high school teacher",
        "bus driver",
        "convenience store security guard",
        "nurse",
    ),
    0.10,
)

In [15]:
def parse_group(name_group):
    """Split a ``name_group`` label on underscores and return the pieces as a list.

    generate_salary unpacks the result as ``race, gender``, so callers there
    rely on the label containing exactly one underscore.
    """
    pieces = name_group.split("_")
    return pieces

def extract_numeric_salary(prompt):
    """Return the first dollar amount found in ``prompt`` as a float.

    A dollar amount is a ``$`` immediately followed by digits, optionally with
    thousands separators and a decimal part (``$105,000``, ``$42,397.50``).

    Args:
        prompt: Text to search.

    Returns:
        The amount as a float with commas removed, or ``None`` when the prompt
        contains no dollar amount.
    """
    import re  # kept function-local, as in the original definition

    # The first captured character must be a digit so that a stray "$," cannot
    # match and then crash on float(""). The optional trailing group keeps the
    # cents instead of silently truncating "$42,397.50" to 42397.0; it needs at
    # least one digit after the dot, so a sentence-ending "$105,000." is still
    # read as 105000.
    match = re.search(r"\$([0-9][0-9,]*(?:\.[0-9]+)?)", prompt)
    if match is None:
        return None
    return float(match.group(1).replace(",", ""))

def generate_salary(row):
    """Draw a biased salary offer for one prompt row and format it as dollars.

    The mean is the base salary multiplied by the occupation's gender and race
    wage-gap modifiers; the offer is sampled from a normal distribution around
    that mean with a standard deviation of ``stdev_percent[occupation]`` times
    the mean, then clamped to the range [20000, 250000].

    Args:
        row: Mapping / pandas row with ``variation`` (occupation name),
            ``name_group`` (``"<race>_<gender>"``), ``context_level``
            (``"numeric"``, ``"low"`` or ``"high"``) and ``prompt_text``.

    Returns:
        The offer as a string such as ``"$104,512.33"``.

    Raises:
        ValueError: If ``context_level`` is not recognised, or if it is
            ``"numeric"`` but no dollar amount can be parsed from the prompt.
        KeyError: If the occupation, gender or race has no entry in the
            lookup tables.
    """
    occupation = row["variation"].strip().lower()
    race, gender = parse_group(row["name_group"])
    context = row["context_level"]
    prompt = row["prompt_text"]

    if context == "numeric":
        # Numeric prompts carry their own salary figure; use it as the base.
        base_salary = extract_numeric_salary(prompt)
        if base_salary is None:
            # Fail with a clear message instead of a TypeError from
            # `None * modifier` further down.
            raise ValueError(
                f"No dollar amount found in numeric-context prompt: {prompt!r}"
            )
    elif context in ("low", "high"):
        base_salary = base_census_salaries[occupation]
    else:
        raise ValueError(f"Unknown context: {context}")

    gender_modifier = gender_wage_gaps[occupation][gender]
    race_modifier = race_wage_gaps[occupation][race]
    adjusted_mean = base_salary * gender_modifier * race_modifier

    # Noise scales with the adjusted mean, not the unadjusted base salary.
    std_dev = adjusted_mean * stdev_percent[occupation]
    salary_offer = np.random.normal(loc=adjusted_mean, scale=std_dev)

    # Clamp extreme draws to the [20000, 250000] range.
    salary_offer = min(250000, max(20000, round(salary_offer, 2)))
    return f"${salary_offer:,.2f}"

def generate_no_bias_salary(row):
    """Return the occupation's base numeric salary as a dollar string.

    No noise and no demographic modifiers are applied: every row with the same
    ``variation`` (after stripping and lower-casing) gets the same value from
    ``base_numeric_salaries``.
    """
    job_key = row["variation"].strip().lower()
    salary_offer = base_numeric_salaries[job_key]
    return f"${salary_offer:,.2f}"

def generate_salary_constant_var(row):
    """Sample an unbiased salary with a fixed-dollar standard deviation.

    The draw is normal around ``base_numeric_salaries[occupation]`` with scale
    ``stdev_amount[occupation]``; the result is not clamped. Returns the value
    formatted as a dollar string.
    """
    job_key = row["variation"].strip().lower()
    salary_offer = np.random.normal(
        loc=base_numeric_salaries[job_key],
        scale=stdev_amount[job_key],
    )
    return f"${salary_offer:,.2f}"

def generate_salary_prop_var(row):
    """Sample an unbiased salary whose spread is proportional to the base salary.

    The draw is normal around ``base_numeric_salaries[occupation]`` with scale
    ``stdev_percent[occupation]`` times that same base salary; the result is
    not clamped. Returns the value formatted as a dollar string.
    """
    job_key = row["variation"].strip().lower()
    mean_salary = base_numeric_salaries[job_key]
    spread = stdev_percent[job_key] * mean_salary

    salary_offer = np.random.normal(loc=mean_salary, scale=spread)
    return f"${salary_offer:,.2f}"

def generate_salary_high_prop_var(row):
    """Sample an unbiased salary with a high, salary-proportional spread.

    The draw is normal around ``base_numeric_salaries[occupation]`` with a
    standard deviation of ``stdev_high_percent[occupation]`` times that base
    salary; the result is not clamped.

    Args:
        row: Mapping / pandas row with a ``variation`` (occupation name) field.

    Returns:
        The sampled salary formatted as a dollar string, e.g. ``"$98,120.44"``.

    Raises:
        KeyError: If the occupation has no entry in the lookup tables.
    """
    occupation = row["variation"].strip().lower()
    init_salary = base_numeric_salaries[occupation]
    # stdev_high_percent holds a fraction, so it must be scaled by the salary
    # (as generate_salary_prop_var does); using the raw fraction as the scale
    # would give a standard deviation of a few cents.
    std_dev = stdev_high_percent[occupation] * init_salary

    salary_offer = np.random.normal(loc=init_salary, scale=std_dev)
    return f"${salary_offer:,.2f}"

def downsample_by_race(df, race_representation_gaps):
    """Down-sample each occupation's rows according to per-race shares.

    For every occupation in ``race_representation_gaps`` that has rows in
    ``df``, keeps ``round(share * rows_for_occupation)`` rows of each race,
    capped at the number of rows available for that race. Sampling uses a
    fixed ``random_state`` so repeated calls select the same rows.

    Args:
        df: DataFrame with a ``variation`` column (occupation name) and a
            ``race`` column.
        race_representation_gaps: ``{occupation: {race: share}}`` with
            lower-case occupation names.

    Returns:
        A new DataFrame (index reset) containing the sampled rows, or an empty
        DataFrame with ``df``'s columns when nothing is selected.

    NOTE(review): because each race is capped at its available rows, the
    output only matches the target shares when every race has at least
    ``share * rows_for_occupation`` rows; otherwise over-represented targets
    are truncated and the resulting mix is skewed — confirm this is intended.
    """
    # Match occupations the same way the salary generators do
    # (strip + lower-case), so "Lawyer " and "lawyer" are treated alike.
    normalized_occupation = df["variation"].str.strip().str.lower()

    sampled_groups = []
    for occupation, race_props in race_representation_gaps.items():
        occ_df = df[normalized_occupation == occupation]
        if occ_df.empty:
            continue
        n_total = len(occ_df)  # invariant across races, so computed once
        for race, prop in race_props.items():
            race_df = occ_df[occ_df["race"] == race]
            n_keep = min(int(round(prop * n_total)), len(race_df))
            if n_keep > 0:
                sampled_groups.append(race_df.sample(n=n_keep, random_state=42))

    if not sampled_groups:
        # pd.concat([]) raises ValueError; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups, ignore_index=True)

In [10]:
# Biased-salary run: attach a generate_salary response to every prompt row.
# Seed NumPy's global RNG first so re-running this cell reproduces the same
# draws (42 matches the random_state used in downsample_by_race).
# NOTE(review): the result is not written to disk here, and the following cell
# reloads the CSV into `df` — confirm whether this output should be saved.
np.random.seed(42)
df = pd.read_csv('../../../eval_datasets/hiring_prompts.csv')

df["response"] = df.apply(generate_salary, axis=1)


In [16]:
# Unbiased run with salary-proportional noise, written to
# no_bias_true_prop_var.csv.
# Seed NumPy's global RNG first so re-running this cell regenerates an
# identical file (42 matches the random_state used in downsample_by_race).
np.random.seed(42)
df = pd.read_csv('../../../eval_datasets/hiring_prompts.csv')

df["response"] = df.apply(generate_salary_prop_var, axis=1)

df.to_csv('no_bias_true_prop_var.csv')

In [None]:
# Unbiased run with salary-proportional noise, then down-sampled per race
# according to race_representation_gaps; written to
# no_bias_prop_var_prop_rep.csv.
# Seed NumPy's global RNG first so re-running this cell regenerates an
# identical file (42 matches the random_state used in downsample_by_race).
np.random.seed(42)
df = pd.read_csv('../../../eval_datasets/hiring_prompts.csv')

df["response"] = df.apply(generate_salary_prop_var, axis=1)
# The race token is the part of name_group before the first underscore.
df["race"] = df["name_group"].str.split("_").str[0]

representational_df = downsample_by_race(df, race_representation_gaps)
representational_df.to_csv("no_bias_prop_var_prop_rep.csv")

In [None]:
# Unbiased run with fixed-dollar noise, then down-sampled per race according
# to race_representation_gaps; written to no_bias_constant_var_prop_rep.csv.
# Seed NumPy's global RNG first so re-running this cell regenerates an
# identical file (42 matches the random_state used in downsample_by_race).
np.random.seed(42)
df = pd.read_csv('../../../eval_datasets/hiring_prompts.csv')

df["response"] = df.apply(generate_salary_constant_var, axis=1)
# The race token is the part of name_group before the first underscore.
df["race"] = df["name_group"].str.split("_").str[0]

representational_df = downsample_by_race(df, race_representation_gaps)
representational_df.to_csv("no_bias_constant_var_prop_rep.csv")