Author: Jose Caloca


Date: 21/04/2023

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt #to plot some parameters in seaborn
from scipy.stats import ttest_ind, f_oneway, anderson_ksamp, ks_2samp, linregress
from utils.fairness_functions import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Switch between the two model outputs analysed by this notebook: the next cell
# loads model_output_biased.csv when this is True, model_output_unbiased.csv otherwise.
biased_model = True

In [None]:
# Choose the scored dataset that matches the `biased_model` switch.
# The flag is tested for truthiness directly rather than compared with `True`.
if biased_model:
    path = "./data/model_output_biased.csv"
else:
    path = "./data/model_output_unbiased.csv"

# Load the model output; the preview of the first rows is the cell's output.
df = pd.read_csv(path)
df.head()

# Central tendency comparison: T-test & F-test

In [None]:
# Density of the predicted default probability, one curve per sex.
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
decision_cutoff = 0.21930892990126283  # value drawn as the 'Decision cut-off' line

sns.kdeplot(df.loc[df['Sex'] == 'male', 'Prob_default'], fill=True, label='Male')
sns.kdeplot(df.loc[df['Sex'] == 'female', 'Prob_default'], fill=True, label='Female')

# red dashed vertical line at the decision cut-off
plt.axvline(x=decision_cutoff, color='r', linestyle='--', label='Decision cut-off')

# legend and title
plt.legend()
plt.title('Probability of default by Sex')

# render the figure
plt.show()


In [None]:
# Prob_default scores of each sex, sorted in ascending order
male = df.loc[df['Sex'] == 'male', 'Prob_default'].sort_values()
female = df.loc[df['Sex'] == 'female', 'Prob_default'].sort_values()

In [None]:
# Two-sample t-test on the male vs. female scores (scipy defaults)
t_stat, p_value = ttest_ind(male, female)
ttest_results = pd.DataFrame(
    {'Statistic': t_stat, 'P-value': p_value},
    index=['T-test'],
)

# One-way ANOVA (F-test) on the same two groups; `p_value` is reused and
# therefore holds the ANOVA p-value once this cell has run.
f_stat, p_value = f_oneway(male, female)
anova_results = pd.DataFrame(
    {'Statistic': f_stat, 'P-value': p_value},
    index=['F-test'],
)

# Stack both one-row frames into a single summary table
results_table = pd.concat([ttest_results, anova_results])

In [None]:
# Display the combined T-test / F-test table (rows 'T-test' and 'F-test').
results_table

# Distance between distributions

In [None]:
# Group means (1 = male, 2 = female)
mean1, mean2 = np.mean(male), np.mean(female)

# Sample standard deviations (ddof=1)
std1, std2 = np.std(male, ddof=1), np.std(female, ddof=1)

# Cohen's d: female-minus-male mean difference divided by the root of the
# simple (unweighted) average of the two group variances.
pooled_std = np.sqrt((std1 ** 2 + std2 ** 2) / 2)
d = (mean2 - mean1) / pooled_std

print("Cohen's d:", d)

In [None]:
# Density of the predicted default probability, one curve per sex.
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(df.loc[df['Sex'] == 'male', 'Prob_default'], fill=True, label='Male')
sns.kdeplot(df.loc[df['Sex'] == 'female', 'Prob_default'], fill=True, label='Female')

# mark each group's mean with a dashed vertical line; Cohen's d itself is
# reported in the caption below rather than drawn
plt.axvline(x=mean1, label = "Mean male", color='blue', linestyle='--')
plt.axvline(x=mean2, label = "Mean female", color='orange', linestyle='--')

# legend, title and caption stating the standardized distance between the groups
plt.legend()
plt.title('Probability of default by Sex')
plt.figtext(0, -0.05, f'The distribution of Male and Female is {round(d, 2)} std away from each other')

# render the figure
plt.show()


# Cumulative distribution comparison

In [None]:
# Replace each group's scores by the pair returned by the project helper cdf().
# NOTE(review): this rebinds `male` / `female`, so re-running the cell feeds
# cdf() its own previous output — confirm that is harmless.
male, cdf_male = cdf(male)
female, cdf_female = cdf(female)

# Two-sample Kolmogorov-Smirnov test
ks_stat, ks_pvalue = ks_2samp(male, female)

# k-sample Anderson-Darling test.
# The third value returned by scipy is already the approximate p-value
# (historically named `significance_level`, floored / capped at 0.1% / 25%),
# so it is used as-is instead of being turned into `1 - value`.
ad_stat, ad_crit_vals, ad_sig_level = anderson_ksamp([male, female])
ad_pvalue = ad_sig_level

# Collect both tests in one table
results = pd.DataFrame({
    'Test': ['KS test', 'Anderson-Darling test'],
    'Statistic': [ks_stat, ad_stat],
    'P-value': [ks_pvalue, ad_pvalue]
})

In [None]:
# Display the KS / Anderson-Darling summary table.
results

In [None]:
# Visualize the CDF of each protected sub-sample.
decision_cutoff = 0.21930892990126283  # value drawn as the 'Decision cut-off' line

# The KS statistic is the largest vertical gap between the two empirical CDFs,
# i.e. a probability difference, not a score. To mark it on the score axis we
# locate the score at which that gap is reached.
# Assumes `male` / `female` hold score values (the same arrays that were passed
# to ks_2samp and are used as x-coordinates below) — TODO confirm against cdf().
male_scores = np.sort(np.asarray(male, dtype=float))
female_scores = np.sort(np.asarray(female, dtype=float))
score_grid = np.concatenate([male_scores, female_scores])
ecdf_gap = np.abs(
    np.searchsorted(male_scores, score_grid, side="right") / male_scores.size
    - np.searchsorted(female_scores, score_grid, side="right") / female_scores.size
)
ks_location = score_grid[np.argmax(ecdf_gap)]

plt.plot(male, cdf_male, color = "#fea049", label="Male CDF")
plt.plot(female, cdf_female, color = "#4470ff", label="Female CDF")
plt.axvline(ks_location, 0, 1, color="green", linestyle="dashed",
            label=f"Max distance (D = {ks_stat:.3f})")
plt.axvline(x=decision_cutoff, color='r', linestyle='--', label='Decision cut-off')
plt.legend()
plt.title("Kolmogorov-Smirnov Test")
plt.xlabel("Scores")
plt.ylabel("Probability")
plt.show()

### Integral difference

In [None]:
# Trapezoidal-rule area under each group's CDF curve; the cell displays the
# male-minus-female difference.
# NOTE(review): each integral runs over its own group's x-range, so the two
# areas cover the same interval only if both ranges coincide — confirm intended.
area_under_male_cdf = np.trapz(cdf_male, male)
area_under_female_cdf = np.trapz(cdf_female, female)
area_under_male_cdf - area_under_female_cdf

# Decile comparison in scores: PSI and Average Score Difference

In [None]:
# Attach the value returned by the project helper add_decile() for the score
# column, then a running 0..n-1 row identifier (both written onto `df` itself).
df['Decile_rank'] = add_decile(df, 'Prob_default')
df['unique_key'] = range(len(df))

# One sub-frame per sex, selected with a boolean mask
male_df = df.loc[df['Sex'] == 'male']
female_df = df.loc[df['Sex'] == 'female']

In [None]:
# Compare the male and female sub-frames on 'Prob_default' with the project
# helper PSI(), passing 'unique_key' as the fourth argument.
# NOTE(review): presumably PSI is the Population Stability Index per score
# bucket — confirm in utils.fairness_functions.
results_psi = PSI(male_df, female_df, 'Prob_default', 'unique_key')
results_psi

In [None]:
# Sum the 'PSI' column of results_psi into a single total.
results_psi['PSI'].sum()

In [None]:
# Run the project helper score_percentile_comparison() on the full frame, with
# 'male' as the favoured class and 'female' as the deprived class of 'Sex'.
# It returns two objects, displayed in the following cells; plot=True is
# presumably what makes it draw a figure — confirm in utils.fairness_functions.
df_pct, linear_regression_result = score_percentile_comparison(
    df, 
    protected_variable = 'Sex', 
    score = 'Prob_default', 
    favoured_class='male', 
    deprived_class='female', 
    plot = True
)

In [None]:
# Display the first object returned by score_percentile_comparison().
df_pct

In [None]:
# Display the second object returned by score_percentile_comparison().
linear_regression_result

Interpretation: On average, the percentage difference in the percentile scores of the probability of default between the favoured (male) and deprived (female) groups is 11.7%.

In [None]:
# df_pct.to_excel("./data/pct_score_difference.xlsx", index=False)