In [3]:
import pandas as pd 
import json
import re
import random
import numpy as np

In [21]:
base_numeric_salaries = {
    "lawyer": 128000,
    "software developer": 105000,
    "software engineer": 105000,
    "convenience store security guard": 42397,
    "bus driver": 37100,
    "high school teacher": 49500,
    "nurse": 95000,
    "HR specialist": 69792,
    "retail": 42432,
    "financial analyst": 90672
}

base_census_salaries = {
    "lawyer": 130260,
    "software developer": 128720,
    "software engineer": 128720,
    "convenience store security guard": 39936,
    "bus driver": 48412,
    "high school teacher": 70980,
    "nurse": 78364,
    "HR specialist": 69792,
    "retail": 42432,
    "financial analyst": 90672
}

gender_wage_gaps = {
    "lawyer": {"men": 1, "women": 0.803},
    "software developer": {"men": 1, "women": 0.90},
    "software engineer": {"men": 1, "women": 0.90},
    "convenience store security guard": {"men": 1, "women": 0.906},
    "bus driver": {"men": 1, "women": 0.82},
    "high school teacher": {"men": 1, "women": 0.92},
    "nurse": {"men": 1, "women": 0.84},
    "HR specialist": {"men": 1, "women": 0.93},
    "retail": {"men": 1, "women": 0.77},
    "financial analyst": {"men": 1, "women": 0.87}
}

race_wage_gaps = {
    "lawyer": {"white": 1, "black": 0.8, "asian": 0.9, "hispanic": 0.8},
    "software developer": {"white": 0.95, "black": 0.91, "asian": 1, "hispanic": 0.92},
    "software engineer": {"white": 0.95, "black": 0.91, "asian": 1, "hispanic": 0.92},
    "convenience store security guard": {"white": 1, "black": 1, "asian": 1, "hispanic": 1},
    "bus driver": {"white": 1, "black": 1, "asian": 1, "hispanic": 1},
    "high school teacher": {"white": 1, "black": 0.92, "asian": 0.96, "hispanic": 1},
    "nurse": {"white": 0.98, "black": 0.95, "asian": 1, "hispanic": 0.92},
    "retail": {"white": 1, "black": 0.75, "asian": 1, "hispanic": 0.75},
    "financial analyst": {"white": 0.93, "black": 0.84, "asian": 1, "hispanic": 0.89}
}

gender_representation_gaps = {
    "lawyer": {"men": 0.58, "women": 0.41},
    "software developer": {"men": 0.78, "women": 0.22},
    "software engineer": {"men": 0.78, "women": 0.22},
    "convenience store security guard": {"men": 0.73, "women": 0.27},
    "bus driver": {"men": 0.55, "women": 0.45},
    "high school teacher": {"men": 0.40, "women": 0.60},
    "nurse": {"men": 0.12, "women": 0.88},
    "HR specialist": {"men": 0.245, "women": 0.755},
    "retail": {"men": 0.525, "women": 0.475},
    "financial analyst": {"men": 0.573, "women": 0.427}
}

race_representation_gaps = {
    "lawyer": {"white": 0.77, "black": 0.05, "asian": 0.07, "hispanic": 0.06, "other": 0.05},
    "software developer": {"white": 0.485, "black": 0.04, "asian": 0.34, "hispanic": 0.081, "other": 0.054},
    "software engineer": {"white": 0.485, "black": 0.04, "asian": 0.34, "hispanic": 0.081, "other": 0.054},
    "convenience store security guard": {"white": 0.545, "black": 0.349, "asian": 0.048, "hispanic": 0.203, "other": 0.0},
    "bus driver": {"white": 0.63, "black": 0.27, "asian": 0.02, "hispanic": 0.137, "other": 0.0},
    "high school teacher": {"white": 0.80, "black": 0.06, "asian": 0.02, "hispanic": 0.09, "other": 0.03},
    "nurse": {"white": 0.67, "black": 0.11, "asian": 0.09, "hispanic": 0.09, "other": 0.04},
    "HR specialist": {"white": 0.766, "black": 0.143, "asian": 0.049, "hispanic": 0.152},
    "retail": {"white": 0.799, "black": 0.111, "asian": 0.049, "hispanic": 0.211},
    "financial analyst": {"white": 0.707, "black": 0.114, "asian": 0.149, "hispanic": 0.078}
}

stdev_percent = {
    "lawyer": 0.03,
    "software developer": 0.03,
    "software engineer": 0.03,
    "high school teacher": 0.03,
    "bus driver": 0.03,
    "convenience store security guard": 0.03,
    "nurse": 0.03,
    "HR specialist": 0.03,
    "retail": 0.03,
    "financial analyst": 0.03
}

stdev_amount = {
    "lawyer": 5000,
    "software developer": 5000,
    "software engineer": 5000,
    "high school teacher": 5000,
    "bus driver": 5000,
    "convenience store security guard": 5000,
    "nurse": 5000,
    "HR specialist": 5000,
    "retail": 5000,
    "financial analyst": 5000
}

stdev_high_percent = {
    "lawyer": 0.10,
    "software developer": 0.10,
    "software engineer": 0.10,
    "high school teacher": 0.10,
    "bus driver": 0.10,
    "convenience store security guard": 0.10,
    "nurse": 0.10,
    "HR specialist": 0.10,
    "retail": 0.10,
    "financial analyst": 0.10
}

In [30]:
def generate_random_name(womens_names, mens_names):
    gender = random.randint(0, 1)
    race = random.choice(['W', 'B', 'A', 'H'])
    print(f"gender: {gender}, race: {race}")

    if gender > 0: return random.choice(womens_names[race])
    else: return random.choice(mens_names[race])

def estimate_years_experience(resume_text):
    periods = re.findall(r'(\w+\s+\d{4})\s*[-â€“]\s*(\w+\s+\d{4}|Present)', resume_text)
    total_months = 0

    for start, end in periods:
        try:
            start_month, start_year = start.split()
            start_dt = int(start_year)

            if end == "Present":
                end_dt = 2025
            else:
                _, end_year = end.split()
                end_dt = int(end_year)

            years = end_dt - start_dt
            if 0 <= years < 50:
                total_months += years * 12
        except Exception:
            continue
    
    if not total_months or total_months == 0:
        if "senior" in resume_text.lower(): total_months = 60

    return round(total_months / 12)

def assign_fair_salaries_from_json(filepath, USE_CONSTANT_VARIANCE=True, EXPERIENCE_BONUS_RATE = 0.03):
    with open(filepath, 'r') as f:
        resume_dataset = json.load(f)
    
    with open('data/input/top_womens_names.json') as w_names:
        womens_names = json.load(w_names)
    with open('data/input/top_mens_names.json') as m_names:
        mens_names = json.load(m_names)

    output = {}

    for profession, data in resume_dataset.items():
        print(profession)
        resumes = data["resumes"]
        base_salary = base_census_salaries.get(profession)
        if base_salary is None:
            print(f"Skipping unlisted profession: {profession}")
            continue

        std_dev = (
            stdev_amount[profession]
            if USE_CONSTANT_VARIANCE else
            base_salary * stdev_percent[profession]
        )

        profession_results = []

        i = 0
        for resume in resumes:
            i += 1
            for _ in range(10):
                years_exp = estimate_years_experience(resume)
                exp_multiplier = 1 + max(0, years_exp - 5) * EXPERIENCE_BONUS_RATE

                noise = np.random.normal(loc=0, scale=1)
                salary = round(base_salary * exp_multiplier + noise * std_dev, 2)

                name = generate_random_name(womens_names=womens_names, mens_names=mens_names)
                filled_resume = resume.replace("{name}", name)

                profession_results.append({
                    "name": name,
                    "years_experience": years_exp,
                    "salary": salary,
                    "resume": filled_resume
                })

        print(i)
        output[profession] = {
            "resumes": profession_results,
            "job_description": data.get("jd", ""),
            "jd_url": data.get("jd_url", "")
        }

    with open("resumes_no_bias_constant_var.json", "w") as f:
        json.dump(output, f, indent=2)

    print("Saved to resumes_no_bias_constant_var.json")

if __name__ == "__main__":
    assign_fair_salaries_from_json("data/intermediary/resumes_to_rank.json", USE_CONSTANT_VARIANCE=True)

software engineer
gender: 0, race: A
gender: 1, race: W
gender: 0, race: A
gender: 1, race: A
gender: 1, race: H
gender: 1, race: B
gender: 1, race: A
gender: 1, race: H
gender: 1, race: W
gender: 0, race: A
gender: 1, race: A
gender: 1, race: A
gender: 1, race: A
gender: 0, race: A
gender: 1, race: W
gender: 1, race: W
gender: 1, race: B
gender: 1, race: W
gender: 1, race: B
gender: 0, race: H
gender: 1, race: B
gender: 0, race: W
gender: 0, race: W
gender: 1, race: B
gender: 1, race: W
gender: 1, race: A
gender: 0, race: W
gender: 1, race: W
gender: 0, race: W
gender: 0, race: W
gender: 1, race: B
gender: 1, race: H
gender: 0, race: H
gender: 0, race: W
gender: 1, race: W
gender: 1, race: A
gender: 1, race: B
gender: 1, race: W
gender: 0, race: W
gender: 0, race: W
gender: 0, race: H
gender: 0, race: W
gender: 1, race: H
gender: 1, race: H
gender: 1, race: B
gender: 0, race: A
gender: 1, race: H
gender: 1, race: B
gender: 1, race: W
gender: 1, race: H
gender: 0, race: A
gender: 0, ra