# Dataset Modification and Synthetic Input Preparation for LLM-Based Bias Evaluation

In [5]:
!pip install pandas scikit-learn openai
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.outliers_influence import variance_inflation_factor




In [41]:
# Read the input CSV into `data`. low_memory=False has pandas parse the file in
# one pass instead of in chunks, which avoids mixed-dtype column inference.
SOURCE_CSV = "Cleaned_Georgia_Mortgage_Data.csv"
data = pd.read_csv(SOURCE_CSV, low_memory=False)

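# Sampling and Duplicating the Dataset

Sample 250 White applicants, then duplicate each sampled record with `derived_race` changed to "Black or African American", so that the two groups are identical on every other column.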

In [31]:
# Build a paired dataset: each sampled applicant appears twice, once labelled
# "White" and once labelled "Black or African American", with every other
# column identical between the two copies.
is_white = data["derived_race"] == "White"
white_data = data.loc[is_white]

# Fixed random_state so the same 250 rows are drawn on every run.
white_sample = white_data.sample(n=250, random_state=42)

white_copy = white_sample.copy()
black_copy = white_sample.copy()
black_copy["derived_race"] = "Black or African American"

# Columns removed from the balanced frame.
unused_columns = [
    "conforming_loan_limit",
    "derived_loan_product_type",
    "derived_dwelling_category",
    "reverse_mortgage",
    "loan_purpose",
    "derived_sex",
    "loan_term",
    "occupancy_type",
    "hoepa_status",
    "business_or_commercial_purpose",
    "index",
    "open-end_line_of_credit",
]

# White rows come first, then their relabelled copies; the index is renumbered.
data_balanced = pd.concat(
    [white_copy, black_copy],
    ignore_index=True,
).drop(columns=unused_columns)
data_balanced.head()

Unnamed: 0,derived_race,action_taken,loan_amount,loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_age
0,White,1,185000.0,95.0,195000,78.0,47.0,39.5
1,White,1,415000.0,95.0,435000,194.0,47.0,49.5
2,White,1,245000.0,99.0,255000,45.0,46.0,29.5
3,White,1,215000.0,81.081,255000,41.0,47.0,75.0
4,White,1,435000.0,80.0,545000,160.0,25.0,29.5


In [33]:
# Tally how many rows carry each action_taken code in the balanced frame
# (code 3 is the one reported as a denial further down).
action_counts = data_balanced["action_taken"].value_counts()
action_counts


action_taken
1    462
3     38
Name: count, dtype: int64

In [35]:
# Per-group denial rate check.
#
# `counts` is a (derived_race x action_taken) table holding, for each race
# group, the share of its rows with each action_taken code. fill_value=0 makes
# a code that never occurs within a group appear as 0 instead of NaN, so a
# group with no denials reports 0.0% rather than NaN.
DENIAL_CODE = 3  # action_taken value reported as a denial below

counts = (
    data_balanced.groupby("derived_race")["action_taken"]
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

if DENIAL_CODE in counts.columns:
    denial_rates = counts[DENIAL_CODE] * 100  # proportion -> percent
    print("\nDenial Rates per group (%):")
    print(denial_rates.round(2))
else:
    # No row in any group carries the denial code.
    print("\nNo denials found in dataset.")



Denial Rates per group (%):
derived_race
Black or African American    7.6
White                        7.6
Name: 3, dtype: float64


In [37]:
# Coerce the listed columns to numeric dtype; any value that cannot be parsed
# becomes NaN (errors="coerce") and is therefore skipped by the means below.
cols_to_fix = ["loan_amount", "income", "debt_to_income_ratio", "property_value"]
data_balanced[cols_to_fix] = data_balanced[cols_to_fix].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# Per-race mean of the coerced columns plus applicant_age.
averaged_columns = cols_to_fix + ["applicant_age"]
averages = data_balanced.groupby("derived_race")[averaged_columns].mean()

print("Average by Race:", averages, sep="\n")


Average by Race:
                           loan_amount   income  debt_to_income_ratio  \
derived_race                                                            
Black or African American     306840.0  101.636                 40.76   
White                         306840.0  101.636                 40.76   

                           property_value  applicant_age  
derived_race                                              
Black or African American        343480.0         42.494  
White                            343480.0         42.494  


In [39]:
# Remove the recorded outcome so it is not part of the frame exported later.
# errors="ignore" keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
data_balanced = data_balanced.drop(columns=["action_taken"], errors="ignore")
display(data_balanced.head())


Unnamed: 0,derived_race,loan_amount,loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_age
0,White,185000.0,95.0,195000,78.0,47.0,39.5
1,White,415000.0,95.0,435000,194.0,47.0,49.5
2,White,245000.0,99.0,255000,45.0,46.0,29.5
3,White,215000.0,81.081,255000,41.0,47.0,75.0
4,White,435000.0,80.0,545000,160.0,25.0,29.5


In [43]:
# Write the balanced frame to disk; index=False leaves the row index out of
# the file.
OUTPUT_CSV = "(250)Dataset_for_LLM_synthetic.csv"
data_balanced.to_csv(OUTPUT_CSV, index=False)
