In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from utils.fairness_functions import *

Code for creating the sample dataset with 600k observations
Only filtered observations with action_taken between 1,2 and 3
  
  
  1 -- Loan originated    
  2 -- Application approved but not accepted    
  3 -- Application denied by financial institution 

In [38]:
# df_full = pd.read_csv('data\hmda_2017_nationwide_all-records_labels.csv')
# df_full = df_full[df_full['action_taken'].isin([1,2,3])]
# df_sample = df_full.sample(n = 600000, replace=False)
# df_sample.to_csv("data\sample.csv")

load dataset

In [39]:
df = pd.read_csv("data\sample.csv")

Columns (35,37,39,45,47,49) have mixed types.Specify dtype option on import or set low_memory=False.


we create the target variable based on the action_taken variable

1 and 2 = approved

3 = rejected

Based on the dataset we select only application-related variables


In [40]:
df["TARGET"] = np.where(df["action_taken"].isin([1, 2]), 0, 1)

In [41]:
cols_to_keep = [
    "loan_type",
    "property_type",
    "loan_purpose",
    "owner_occupancy",
    "loan_amount_000s",
    "state_abbr",
    "applicant_ethnicity",
    "applicant_race_1",
    "applicant_sex",
    "applicant_income_000s",
    "TARGET",
]

In [42]:
df = df[cols_to_keep]

We have info about the state of the applicant. We use this information for visualisation, for modelling, when stack these categories into regions

In [43]:
states = {
    "AK": "O",
    "AL": "S",
    "AR": "S",
    "AS": "O",
    "AZ": "W",
    "CA": "W",
    "CO": "W",
    "CT": "N",
    "DC": "N",
    "DE": "N",
    "FL": "S",
    "GA": "S",
    "GU": "O",
    "HI": "O",
    "IA": "M",
    "ID": "W",
    "IL": "M",
    "IN": "M",
    "KS": "M",
    "KY": "S",
    "LA": "S",
    "MA": "N",
    "MD": "N",
    "ME": "N",
    "MI": "W",
    "MN": "M",
    "MO": "M",
    "MP": "O",
    "MS": "S",
    "MT": "W",
    "NA": "O",
    "NC": "S",
    "ND": "M",
    "NE": "W",
    "NH": "N",
    "NJ": "N",
    "NM": "W",
    "NV": "W",
    "NY": "N",
    "OH": "M",
    "OK": "S",
    "OR": "W",
    "PA": "N",
    "PR": "O",
    "RI": "N",
    "SC": "S",
    "SD": "M",
    "TN": "S",
    "TX": "S",
    "UT": "W",
    "VA": "S",
    "VI": "O",
    "VT": "N",
    "WA": "W",
    "WI": "M",
    "WV": "S",
    "WY": "W",
}

In [44]:
df["state_abbr"].replace(states, inplace=True)
df.rename(columns={"state_abbr": "region"}, inplace=True)

In [45]:
labelencoder = LabelEncoder()
df["region"] = labelencoder.fit_transform(df["region"])

Calculation of pct of approved / rejected

In [46]:
df.TARGET.value_counts() / len(df) * 100

0    79.478
1    20.522
Name: TARGET, dtype: float64

calculation of pct missing per column

In [47]:
df.isna().sum() / len(df)

loan_type                0.000000
property_type            0.000000
loan_purpose             0.000000
owner_occupancy          0.000000
loan_amount_000s         0.000298
region                   0.000000
applicant_ethnicity      0.000000
applicant_race_1         0.000000
applicant_sex            0.000000
applicant_income_000s    0.056035
TARGET                   0.000000
dtype: float64

Columns **applicant_income_000s** and **loan_amount_000s** have missing obs, still really few of them. Therefore we inpute with the median since these variables are skewed

In [48]:
df["loan_amount_000s"].fillna(value=df["loan_amount_000s"].median(), inplace=True)
df["applicant_income_000s"].fillna(
    value=df["applicant_income_000s"].median(), inplace=True
)

Basic statistics for continuous and categorical variables

In [49]:
categorical_variables = [
    "region",
    "loan_type",
    "property_type",
    "loan_purpose",
    "owner_occupancy",
    "applicant_ethnicity",
    "applicant_race_1",
    "applicant_sex",
    "TARGET",
]
numerical_variables = ["loan_amount_000s", "applicant_income_000s"]

In [50]:
df[categorical_variables] = df[categorical_variables].astype(object)

In [51]:
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
loan_type,600000,4,1,442635
property_type,600000,3,1,573102
loan_purpose,600000,3,1,315726
owner_occupancy,600000,3,1,536176
region,600000,6,3,214430
applicant_ethnicity,600000,4,2,459834
applicant_race_1,600000,7,5,433966
applicant_sex,600000,4,1,372425
TARGET,600000,2,0,476868


In [52]:
ds_cat_stats = pd.DataFrame(
    columns=[
        "Column",
        "Unique Values",
        "Number of Missing Values",
        "Percentage of Missing Values",
    ]
)
tmp = pd.DataFrame()

for c in df[categorical_variables]:
    tmp["Column"] = [c]
    tmp["Unique Values"] = [df[c].unique()]
    tmp["Number of Missing Values"] = df[c].isnull().sum()
    tmp["Percentage of Missing Values"] = (df[c].isnull().sum() / len(df)).round(
        3
    ) * 100
    ds_cat_stats = ds_cat_stats.append(tmp)

ds_cat_stats

Unnamed: 0,Column,Unique Values,Number of Missing Values,Percentage of Missing Values
0,region,"[4, 1, 3, 0, 2, 5]",0,0.0
0,loan_type,"[1, 2, 3, 4]",0,0.0
0,property_type,"[1, 2, 3]",0,0.0
0,loan_purpose,"[1, 3, 2]",0,0.0
0,owner_occupancy,"[1, 2, 3]",0,0.0
0,applicant_ethnicity,"[2, 3, 1, 4]",0,0.0
0,applicant_race_1,"[5, 3, 2, 6, 1, 7, 4]",0,0.0
0,applicant_sex,"[1, 2, 3, 4]",0,0.0
0,TARGET,"[0, 1]",0,0.0


In [53]:
num_var_report = df.describe(include="float64")
num_var_report

Unnamed: 0,loan_amount_000s,applicant_income_000s
count,600000.0,600000.0
mean,247.131963,112.142255
std,1064.674598,775.821355
min,1.0,1.0
25%,104.0,52.0
50%,180.0,78.0
75%,288.0,119.0
max,350000.0,260000.0


In [54]:
def count_outliers(data=pd.DataFrame(), variable=str()):
    mean_income = data[variable].mean()
    sd_income = data[variable].std()
    cut_off = mean_income + 3 * sd_income
    num_outliers = sum(df[variable] > cut_off)
    return num_outliers

In [55]:
print("Ourliers applicant_income_000s")
count_outliers(df, "applicant_income_000s")

Ourliers applicant_income_000s


454

In [56]:
print("Ourliers loan_amount_000s")
count_outliers(df, "loan_amount_000s")

Ourliers loan_amount_000s


913

# Association measure

In [57]:
numerical = df.select_dtypes(exclude=["object"])
categorical = df.select_dtypes(include=["object"])

In [58]:
rows = []

for var1 in categorical:
    col = []
    for var2 in categorical:
        cramers = cramers_V(categorical[var1], categorical[var2])  # Cramer's V test
        col.append(round(cramers, 2))  # Keeping of the rounded value of the Cramer's V
    rows.append(col)

cramers_results = np.array(rows)
df_corr = pd.DataFrame(
    cramers_results, columns=categorical.columns, index=categorical.columns
)

df_corr

Unnamed: 0,loan_type,property_type,loan_purpose,owner_occupancy,region,applicant_ethnicity,applicant_race_1,applicant_sex,TARGET
loan_type,1.0,0.0,0.02,0.02,0.0,0.01,0.01,0.01,0.0
property_type,0.0,1.0,0.01,0.29,0.01,0.09,0.1,0.1,0.03
loan_purpose,0.02,0.01,1.0,0.0,0.01,0.0,0.01,0.0,0.04
owner_occupancy,0.02,0.29,0.0,1.0,0.0,0.11,0.11,0.11,0.0
region,0.0,0.01,0.01,0.0,1.0,0.01,0.01,0.0,0.01
applicant_ethnicity,0.01,0.09,0.0,0.11,0.01,1.0,0.56,0.51,0.01
applicant_race_1,0.01,0.1,0.01,0.11,0.01,0.56,1.0,0.5,0.02
applicant_sex,0.01,0.1,0.0,0.11,0.0,0.51,0.5,1.0,0.01
TARGET,0.0,0.03,0.04,0.0,0.01,0.01,0.02,0.01,1.0


In [59]:
numerical.corr()

Unnamed: 0,loan_amount_000s,applicant_income_000s
loan_amount_000s,1.0,0.135779
applicant_income_000s,0.135779,1.0


# t-test and f-test

In [60]:
df_ttest = df.copy()

In [61]:
central_tendency = central_tendency_tests(df_ttest)
central_tendency

Unnamed: 0,statistic,p-value
t-test Male&Female,31.884498,7.025117e-223
f-test Male&Female&Others,800.851354,0.0
t-test White&Others,-70.770373,0.0
t-test White&Blacks,-79.779573,0.0
f-test White&Asian&Black&Else,2773.124448,0.0
f-test White&Asian&Black,3273.191361,0.0
t-test Latino&Otherethnicity,37.093858,8.738814e-301
f-test Latino&Otherethnicity,1217.866143,0.0


# Fairness metrics

**applicant_ethnicity**

 -   1 -- Hispanic or Latino  (PROTECTED ATTRIBUTE)  
 -   2 -- Not Hispanic or Latino    
 -   3 -- Information not provided by applicant in mail, Internet, or telephone application    
 -   4 -- Not applicable    
 -   5 -- No co-applicant

**applicant_race_1**

 - 1 -- American Indian or Alaska Native    
 - 2 -- Asian    
 - 3 -- Black or African American    (PROTECTED ATTRIBUTE)
 - 4 -- Native Hawaiian or Other Pacific Islander    
 - 5 -- White   
 - 6 -- Information not provided by applicant in mail, Internet, or telephone application    
 - 7 -- Not applicable    
 - 8 -- No co-applicant 


**applicant_sex**

 -  1 -- Male    
 -  2 -- Female    (PROTECTED ATTRIBUTE)
 -  3 -- Information not provided by applicant in mail, Internet, or telephone application    
 -  4 -- Not applicable    
 -  5 -- No co-applicant


In [62]:
ethnicity_metrics = MetricsDataset(df, "applicant_ethnicity", 1).T.reset_index()
ethnicity_metrics.rename(columns={0: "values", "index": "metrics"}, inplace=True)


race_metrics = MetricsDataset(df, "applicant_race_1", 3).T.reset_index()
race_metrics.rename(columns={0: "values", "index": "metrics"}, inplace=True)


sex_metrics = MetricsDataset(df, "applicant_sex", 2).T.reset_index()
sex_metrics.rename(columns={0: "values", "index": "metrics"}, inplace=True)

In [63]:
dataset_bias = ethnicity_metrics.merge(race_metrics, how="left", on=["metrics"]).merge(
    sex_metrics, how="left", on=["metrics"]
)


col = ["Fairness Metrics", "Values for ethnicity", "Values for race", "Values for sex"]

dataset_bias.columns = col
dataset_bias

Unnamed: 0,Fairness Metrics,Values for ethnicity,Values for race,Values for sex
0,Base_rate,0.20522,0.20522,0.20522
1,Consistency,0.0,0.0,0.0
2,Disparate_impact,0.790311,0.574582,0.867627
3,Mean_difference,-0.052997,-0.143909,-0.029978
4,Num_instances,600000.0,600000.0,600000.0
5,Num_negatives,476868.0,476868.0,476868.0
6,Num_positives,123132.0,123132.0,123132.0
7,Smoothed_empirical_differential_fairness,0.235342,0.55412,0.141997


# Measure of association

In [64]:
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

In [65]:
cramers_v(df["applicant_ethnicity"], df["applicant_sex"])

0.7130845179104961

## WoE Calculation

In [66]:
iv, woe = iv_woe(data=df, target="TARGET", bins=10, show_woe=False)

Information value of loan_type is 0.002045
Information value of property_type is 0.143035
Information value of loan_purpose is 0.235208
Information value of owner_occupancy is 0.004316
Information value of loan_amount_000s is 0.215538
Information value of region is 0.03
Information value of applicant_ethnicity is 0.06298
Information value of applicant_race_1 is 0.111837
Information value of applicant_sex is 0.050312
Information value of applicant_income_000s is 0.217054


In [67]:
iv_values = iv.sort_values("IV", ascending=False)
iv_values

Unnamed: 0,Variable,IV
0,loan_purpose,0.235208
0,applicant_income_000s,0.217054
0,loan_amount_000s,0.215538
0,property_type,0.143035
0,applicant_race_1,0.111837
0,applicant_ethnicity,0.06298
0,applicant_sex,0.050312
0,region,0.03
0,owner_occupancy,0.004316
0,loan_type,0.002045


According to Siddiqi (2006), by convention the values of the IV statistic in credit scoring can be interpreted as follows.

If the IV statistic is:
- Less than 0.02, then the predictor is not useful for modeling (separating the Goods from the Bads)
- 0.02 to 0.1, then the predictor has only a weak relationship to the Goods/Bads odds ratio
- 0.1 to 0.3, then the predictor has a medium strength relationship to the Goods/Bads odds ratio
- 0.3 to 0.5, then the predictor has a strong relationship to the Goods/Bads odds ratio.
- 0.5, suspicious relationship (Check once)


We drop variables with IV < 0.02

In [68]:
var_to_drop = iv_values[iv_values["IV"] < 0.02]["Variable"].to_list()

In [69]:
df.drop(var_to_drop, axis=1, inplace=True)
df.head(5)

Unnamed: 0,property_type,loan_purpose,loan_amount_000s,region,applicant_ethnicity,applicant_race_1,applicant_sex,applicant_income_000s,TARGET
0,1,1,220.0,4,2,5,1,191.0,0
1,1,3,91.0,1,2,3,1,31.0,0
2,1,3,1150.0,4,2,2,1,238.0,1
3,1,1,360.0,3,2,5,1,123.0,0
4,1,1,105.0,3,2,2,2,38.0,0


## Mapping of WoE into variable values

In [70]:
woe[["Variable", "Cutoff", "WoE"]][~np.isin(woe["Variable"], var_to_drop)]

Unnamed: 0,Variable,Cutoff,WoE
0,property_type,1,-0.088036
1,property_type,2,1.553769
2,property_type,3,-1.026213
0,loan_purpose,1,-0.501981
1,loan_purpose,2,0.835291
2,loan_purpose,3,0.308708
0,loan_amount_000s,"(0.999, 50.0]",0.938903
1,loan_amount_000s,"(50.0, 88.0]",0.474061
2,loan_amount_000s,"(88.0, 120.0]",0.158796
3,loan_amount_000s,"(120.0, 150.0]",-0.059221


In [71]:
df.loc[df["property_type"] == 1, "property_type"] = -0.088036
df.loc[df["property_type"] == 2, "property_type"] = 1.553769
df.loc[df["property_type"] == 3, "property_type"] = -1.026213


df.loc[df["loan_purpose"] == 1, "loan_purpose"] = -0.50198
df.loc[df["loan_purpose"] == 2, "loan_purpose"] = 0.835291
df.loc[df["loan_purpose"] == 3, "loan_purpose"] = 0.308708


df.loc[
    (df["loan_amount_000s"] > 0.999) & (df["loan_amount_000s"] <= 50.0),
    "loan_amount_000s",
] = 0.939120
df.loc[
    (df["loan_amount_000s"] > 50.0) & (df["loan_amount_000s"] <= 88.0),
    "loan_amount_000s",
] = 0.474278
df.loc[
    (df["loan_amount_000s"] > 88.0) & (df["loan_amount_000s"] <= 120.0),
    "loan_amount_000s",
] = 0.158796
df.loc[
    (df["loan_amount_000s"] > 120.0) & (df["loan_amount_000s"] <= 150.0),
    "loan_amount_000s",
] = -0.059221
df.loc[
    (df["loan_amount_000s"] > 150.0) & (df["loan_amount_000s"] <= 180.0),
    "loan_amount_000s",
] = -0.203135
df.loc[
    (df["loan_amount_000s"] > 180.0) & (df["loan_amount_000s"] <= 216.0),
    "loan_amount_000s",
] = -0.300088
df.loc[
    (df["loan_amount_000s"] > 216.0) & (df["loan_amount_000s"] <= 260.0),
    "loan_amount_000s",
] = -0.375219
df.loc[
    (df["loan_amount_000s"] > 260.0) & (df["loan_amount_000s"] <= 323.0),
    "loan_amount_000s",
] = -0.437538
df.loc[
    (df["loan_amount_000s"] > 323.0) & (df["loan_amount_000s"] <= 424.0),
    "loan_amount_000s",
] = -0.487993
df.loc[
    (df["loan_amount_000s"] > 424.0) & (df["loan_amount_000s"] <= 350000.0),
    "loan_amount_000s",
] = -0.309645


df.loc[df["region"] == 0, "region"] = -0.149481
df.loc[df["region"] == 1, "region"] = -0.005867
df.loc[df["region"] == 2, "region"] = 0.312049
df.loc[df["region"] == 3, "region"] = 0.145859
df.loc[df["region"] == 4, "region"] = -0.139106
df.loc[df["region"] == 5, "region"] = 1.488972


df.loc[df["applicant_ethnicity"] == 0, "applicant_ethnicity"] = 0.269946
df.loc[df["applicant_ethnicity"] == 1, "applicant_ethnicity"] = -0.098127
df.loc[df["applicant_ethnicity"] == 2, "applicant_ethnicity"] = 0.447472
df.loc[df["applicant_ethnicity"] == 3, "applicant_ethnicity"] = -1.476067


df.loc[df["applicant_race_1"] == 1, "applicant_race_1"] = 0.742142
df.loc[df["applicant_race_1"] == 2, "applicant_race_1"] = -0.234168
df.loc[df["applicant_race_1"] == 3, "applicant_race_1"] = 0.682999
df.loc[df["applicant_race_1"] == 4, "applicant_race_1"] = 0.263726
df.loc[df["applicant_race_1"] == 5, "applicant_race_1"] = -0.145577
df.loc[df["applicant_race_1"] == 6, "applicant_race_1"] = 0.418436
df.loc[df["applicant_race_1"] == 7, "applicant_race_1"] = -1.546126


df.loc[df["applicant_sex"] == 1, "applicant_sex"] = -0.099709
df.loc[df["applicant_sex"] == 2, "applicant_sex"] = 0.125601
df.loc[df["applicant_sex"] == 3, "applicant_sex"] = 0.451703
df.loc[df["applicant_sex"] == 4, "applicant_sex"] = -1.547617


df.loc[
    (df["applicant_income_000s"] > 0.999) & (df["applicant_income_000s"] <= 35.0),
    "applicant_income_000s",
] = 1.005641
df.loc[
    (df["applicant_income_000s"] > 35.0) & (df["applicant_income_000s"] <= 46.0),
    "applicant_income_000s",
] = 0.340895
df.loc[
    (df["applicant_income_000s"] > 46.0) & (df["applicant_income_000s"] <= 56.0),
    "applicant_income_000s",
] = 0.104814
df.loc[
    (df["applicant_income_000s"] > 56.0) & (df["applicant_income_000s"] <= 66.0),
    "applicant_income_000s",
] = -0.020844
df.loc[
    (df["applicant_income_000s"] > 66.0) & (df["applicant_income_000s"] <= 78.0),
    "applicant_income_000s",
] = -0.134492
df.loc[
    (df["applicant_income_000s"] > 78.0) & (df["applicant_income_000s"] <= 92.0),
    "applicant_income_000s",
] = -0.224962
df.loc[
    (df["applicant_income_000s"] > 92.0) & (df["applicant_income_000s"] <= 111.0),
    "applicant_income_000s",
] = -0.273656
df.loc[
    (df["applicant_income_000s"] > 111.0) & (df["applicant_income_000s"] <= 138.0),
    "applicant_income_000s",
] = -0.400958
df.loc[
    (df["applicant_income_000s"] > 138.0) & (df["applicant_income_000s"] <= 191.0),
    "applicant_income_000s",
] = -0.503193
df.loc[
    (df["applicant_income_000s"] > 191.0) & (df["applicant_income_000s"] <= 260000.0),
    "applicant_income_000s",
] = -0.522607

In [72]:
df.head(5)

Unnamed: 0,property_type,loan_purpose,loan_amount_000s,region,applicant_ethnicity,applicant_race_1,applicant_sex,applicant_income_000s,TARGET
0,-0.088036,-0.50198,-0.375219,-0.139106,0.447472,-0.145577,-0.099709,-0.503193,0
1,-0.088036,0.308708,0.158796,-0.005867,0.447472,0.682999,-0.099709,1.005641,0
2,-0.088036,0.308708,-0.309645,-0.139106,0.447472,-0.234168,-0.099709,-0.522607,1
3,-0.088036,-0.50198,-0.487993,0.145859,0.447472,-0.145577,-0.099709,-0.400958,0
4,-0.088036,-0.50198,0.158796,0.145859,0.447472,-0.234168,0.125601,0.340895,0
