In [1]:
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from aif360.algorithms import postprocessing, preprocessing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from utils.fairness_functions import *

Code for creating the sample dataset with 600k observations
Only filtered observations with action_taken between 1,2 and 3
  
  
  1 -- Loan originated    
  2 -- Application approved but not accepted    
  3 -- Application denied by financial institution 

In [2]:
# df_full = pd.read_csv('data\hmda_2017_nationwide_all-records_labels.csv')
# df_full = df_full[df_full['action_taken'].isin([1,2,3])]
# df_sample = df_full.sample(n = 600000, replace=False)
# df_sample.to_csv("data\sample.csv")

load dataset

In [3]:
df = pd.read_csv("data\sample.csv")

Columns (35,37,39,45,47,49) have mixed types.Specify dtype option on import or set low_memory=False.


we create the target variable based on the action_taken variable

1 and 2 = approved

3 = rejected

Based on the dataset we select only application-related variables


In [4]:
df["TARGET"] = np.where(df["action_taken"].isin([1, 2]), 0, 1)

In [5]:
cols_to_keep = [
    "loan_type",
    "property_type",
    "loan_purpose",
    "owner_occupancy",
    "loan_amount_000s",
    "state_abbr",
    "applicant_ethnicity",
    "applicant_race_1",
    "applicant_sex",
    "applicant_income_000s",
    "TARGET",
]

In [6]:
df = df[cols_to_keep]

We have info about the state of the applicant. We use this information for visualisation, for modelling, when stack these categories into regions

In [7]:
states = {
    "AK": "O",
    "AL": "S",
    "AR": "S",
    "AS": "O",
    "AZ": "W",
    "CA": "W",
    "CO": "W",
    "CT": "N",
    "DC": "N",
    "DE": "N",
    "FL": "S",
    "GA": "S",
    "GU": "O",
    "HI": "O",
    "IA": "M",
    "ID": "W",
    "IL": "M",
    "IN": "M",
    "KS": "M",
    "KY": "S",
    "LA": "S",
    "MA": "N",
    "MD": "N",
    "ME": "N",
    "MI": "W",
    "MN": "M",
    "MO": "M",
    "MP": "O",
    "MS": "S",
    "MT": "W",
    "NA": "O",
    "NC": "S",
    "ND": "M",
    "NE": "W",
    "NH": "N",
    "NJ": "N",
    "NM": "W",
    "NV": "W",
    "NY": "N",
    "OH": "M",
    "OK": "S",
    "OR": "W",
    "PA": "N",
    "PR": "O",
    "RI": "N",
    "SC": "S",
    "SD": "M",
    "TN": "S",
    "TX": "S",
    "UT": "W",
    "VA": "S",
    "VI": "O",
    "VT": "N",
    "WA": "W",
    "WI": "M",
    "WV": "S",
    "WY": "W",
}

In [8]:
df["state_abbr"].replace(states, inplace=True)
df.rename(columns={"state_abbr": "region"}, inplace=True)

In [9]:
labelencoder = LabelEncoder()
df["region"] = labelencoder.fit_transform(df["region"])

Calculation of pct of approved / rejected

In [10]:
df.TARGET.value_counts() / len(df) * 100

0    79.478
1    20.522
Name: TARGET, dtype: float64

calculation of pct missing per column

In [11]:
df.isna().sum()/len(df)

loan_type                0.000000
property_type            0.000000
loan_purpose             0.000000
owner_occupancy          0.000000
loan_amount_000s         0.000298
region                   0.000000
applicant_ethnicity      0.000000
applicant_race_1         0.000000
applicant_sex            0.000000
applicant_income_000s    0.056035
TARGET                   0.000000
dtype: float64

Columns **applicant_income_000s** and **loan_amount_000s** have missing obs, still really few of them. Therefore we inpute with the median since these variables are skewed

In [12]:
df['loan_amount_000s'].fillna(value=df['loan_amount_000s'].median(),inplace=True)
df['applicant_income_000s'].fillna(value=df['applicant_income_000s'].median(),inplace=True)

Basic statistics for continuous and categorical variables

In [13]:
categorical_variables = [
    "region",
    "loan_type",
    "property_type",
    "loan_purpose",
    "owner_occupancy",
    "applicant_ethnicity",
    "applicant_race_1",
    "applicant_sex",
    "TARGET",
]
numerical_variables = ["loan_amount_000s", "applicant_income_000s"]

In [14]:
df[categorical_variables] = df[categorical_variables].astype(object)

In [15]:
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
loan_type,600000,4,1,442635
property_type,600000,3,1,573102
loan_purpose,600000,3,1,315726
owner_occupancy,600000,3,1,536176
region,600000,6,3,214430
applicant_ethnicity,600000,4,2,459834
applicant_race_1,600000,7,5,433966
applicant_sex,600000,4,1,372425
TARGET,600000,2,0,476868


In [16]:
df.describe(include="float64").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loan_amount_000s,600000.0,247.131963,1064.674598,1.0,104.0,180.0,288.0,350000.0
applicant_income_000s,600000.0,112.142255,775.821355,1.0,52.0,78.0,119.0,260000.0


In [17]:
ds_cat_stats = pd.DataFrame(
    columns=[
        "Column",
        "Unique Values",
        "Number of Missing Values",
        "Percentage of Missing Values",
    ]
)
tmp = pd.DataFrame()

for c in df[categorical_variables]:
    tmp["Column"] = [c]
    tmp["Unique Values"] = [df[c].unique()]
    tmp["Number of Missing Values"] = df[c].isnull().sum()
    tmp["Percentage of Missing Values"] = (df[c].isnull().sum() / len(df)).round(
        3
    ) * 100
    ds_cat_stats = ds_cat_stats.append(tmp)

ds_cat_stats

Unnamed: 0,Column,Unique Values,Number of Missing Values,Percentage of Missing Values
0,region,"[4, 1, 3, 0, 2, 5]",0,0.0
0,loan_type,"[1, 2, 3, 4]",0,0.0
0,property_type,"[1, 2, 3]",0,0.0
0,loan_purpose,"[1, 3, 2]",0,0.0
0,owner_occupancy,"[1, 2, 3]",0,0.0
0,applicant_ethnicity,"[2, 3, 1, 4]",0,0.0
0,applicant_race_1,"[5, 3, 2, 6, 1, 7, 4]",0,0.0
0,applicant_sex,"[1, 2, 3, 4]",0,0.0
0,TARGET,"[0, 1]",0,0.0


## Formula for calculating WoE

In [18]:
def iv_woe(data, target, bins=10, show_woe=False):

    # Empty Dataframe
    newDF, woeDF = pd.DataFrame(), pd.DataFrame()

    # Extract Column Names
    cols = data.columns

    # Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in "bifc") and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates="drop")
            d0 = pd.DataFrame({"x": binned_x, "y": data[target]})
        else:
            d0 = pd.DataFrame({"x": data[ivars], "y": data[target]})

        # Calculate the number of events in each group (bin)
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ["Cutoff", "N", "Events"]

        # Calculate % of events in each group.
        d["% of Events"] = np.maximum(d["Events"], 0.5) / d["Events"].sum()

        # Calculate the non events in each group.
        d["Non-Events"] = d["N"] - d["Events"]
        # Calculate % of non events in each group.
        d["% of Non-Events"] = np.maximum(d["Non-Events"], 0.5) / d["Non-Events"].sum()

        # Calculate WOE by taking natural log of division of % of non-events and % of events
        d["WoE"] = np.log(d["% of Events"] / d["% of Non-Events"])
        d["IV"] = d["WoE"] * (d["% of Events"] - d["% of Non-Events"])
        d.insert(loc=0, column="Variable", value=ivars)
        print("Information value of " + ivars + " is " + str(round(d["IV"].sum(), 6)))
        temp = pd.DataFrame(
            {"Variable": [ivars], "IV": [d["IV"].sum()]}, columns=["Variable", "IV"]
        )
        newDF = pd.concat([newDF, temp], axis=0)
        woeDF = pd.concat([woeDF, d], axis=0)

        # Show WOE Table
        if show_woe == True:
            print(d)
    return newDF, woeDF

## WoE Calculation

In [19]:
iv, woe = iv_woe(data=df, target="TARGET", bins=10, show_woe=False)

Information value of loan_type is 0.002045
Information value of property_type is 0.143035
Information value of loan_purpose is 0.235208
Information value of owner_occupancy is 0.004316
Information value of loan_amount_000s is 0.215538
Information value of region is 0.03
Information value of applicant_ethnicity is 0.06298
Information value of applicant_race_1 is 0.111837
Information value of applicant_sex is 0.050312
Information value of applicant_income_000s is 0.217054


In [20]:
iv_values = iv.sort_values("IV", ascending=False)
iv_values

Unnamed: 0,Variable,IV
0,loan_purpose,0.235208
0,applicant_income_000s,0.217054
0,loan_amount_000s,0.215538
0,property_type,0.143035
0,applicant_race_1,0.111837
0,applicant_ethnicity,0.06298
0,applicant_sex,0.050312
0,region,0.03
0,owner_occupancy,0.004316
0,loan_type,0.002045


According to Siddiqi (2006), by convention the values of the IV statistic in credit scoring can be interpreted as follows.

If the IV statistic is:
- Less than 0.02, then the predictor is not useful for modeling (separating the Goods from the Bads)
- 0.02 to 0.1, then the predictor has only a weak relationship to the Goods/Bads odds ratio
- 0.1 to 0.3, then the predictor has a medium strength relationship to the Goods/Bads odds ratio
- 0.3 to 0.5, then the predictor has a strong relationship to the Goods/Bads odds ratio.
- 0.5, suspicious relationship (Check once)


We drop variables with IV < 0.02

In [21]:
var_to_drop = iv_values[iv_values['IV'] < 0.02]['Variable'].to_list()

In [22]:
df.drop(var_to_drop, axis=1, inplace=True)
df.head(5)

Unnamed: 0,property_type,loan_purpose,loan_amount_000s,region,applicant_ethnicity,applicant_race_1,applicant_sex,applicant_income_000s,TARGET
0,1,1,220.0,4,2,5,1,191.0,0
1,1,3,91.0,1,2,3,1,31.0,0
2,1,3,1150.0,4,2,2,1,238.0,1
3,1,1,360.0,3,2,5,1,123.0,0
4,1,1,105.0,3,2,2,2,38.0,0


## Mapping of WoE into variable values

In [28]:
woe[['Variable', 'Cutoff', 'WoE']][~np.isin(woe['Variable'], var_to_drop)]

Unnamed: 0,Variable,Cutoff,WoE
0,property_type,1,-0.088036
1,property_type,2,1.553769
2,property_type,3,-1.026213
0,loan_purpose,1,-0.501981
1,loan_purpose,2,0.835291
2,loan_purpose,3,0.308708
0,loan_amount_000s,"(0.999, 50.0]",0.938903
1,loan_amount_000s,"(50.0, 88.0]",0.474061
2,loan_amount_000s,"(88.0, 120.0]",0.158796
3,loan_amount_000s,"(120.0, 150.0]",-0.059221


In [29]:
df.loc[df["property_type"] == 1, "property_type"] = -0.088036
df.loc[df["property_type"] == 2, "property_type"] = 1.553769
df.loc[df["property_type"] == 3, "property_type"] = -1.026213


df.loc[df["loan_purpose"] == 1, "loan_purpose"] = -0.50198
df.loc[df["loan_purpose"] == 2, "loan_purpose"] = 0.835291
df.loc[df["loan_purpose"] == 3, "loan_purpose"] = 0.308708


df.loc[(df["loan_amount_000s"] > 0.999) & (df["loan_amount_000s"] <= 50.0),
"loan_amount_000s",
] = 0.939120
df.loc[
    (df["loan_amount_000s"] > 50.0) & (df["loan_amount_000s"] <= 88.0),
    "loan_amount_000s",
] = 0.474278
df.loc[
    (df["loan_amount_000s"] > 88.0) & (df["loan_amount_000s"] <= 120.0),
    "loan_amount_000s",
] = 0.158796
df.loc[
    (df["loan_amount_000s"] > 120.0) & (df["loan_amount_000s"] <= 150.0),
    "loan_amount_000s",
] = -0.059221
df.loc[
    (df["loan_amount_000s"] > 150.0) & (df["loan_amount_000s"] <= 180.0),
    "loan_amount_000s",
] = -0.203135
df.loc[
    (df["loan_amount_000s"] > 180.0) & (df["loan_amount_000s"] <= 216.0),
    "loan_amount_000s",
] = -0.300088
df.loc[
    (df["loan_amount_000s"] > 216.0) & (df["loan_amount_000s"] <= 260.0),
    "loan_amount_000s",
] = -0.375219
df.loc[
    (df["loan_amount_000s"] > 260.0) & (df["loan_amount_000s"] <= 323.0),
    "loan_amount_000s",
] = -0.437538
df.loc[
    (df["loan_amount_000s"] > 323.0) & (df["loan_amount_000s"] <= 424.0),
    "loan_amount_000s",
] = -0.487993
df.loc[
    (df["loan_amount_000s"] > 424.0) & (df["loan_amount_000s"] <= 350000.0),
    "loan_amount_000s",
] = -0.309645


df.loc[df["region"] == 0, "region"] = -0.149481
df.loc[df["region"] == 1, "region"] = -0.005867
df.loc[df["region"] == 2, "region"] = 0.312049
df.loc[df["region"] == 3, "region"] = 0.145859
df.loc[df["region"] == 4, "region"] = -0.139106
df.loc[df["region"] == 5, "region"] = 1.488972


df.loc[df["applicant_ethnicity"] == 0, "applicant_ethnicity"] = 0.269946
df.loc[df["applicant_ethnicity"] == 1, "applicant_ethnicity"] = -0.098127
df.loc[df["applicant_ethnicity"] == 2, "applicant_ethnicity"] = 0.447472
df.loc[df["applicant_ethnicity"] == 3, "applicant_ethnicity"] = -1.476067


df.loc[df["applicant_race_1"] == 1, "applicant_race_1"] = 0.742142
df.loc[df["applicant_race_1"] == 2, "applicant_race_1"] = -0.234168
df.loc[df["applicant_race_1"] == 3, "applicant_race_1"] = 0.682999
df.loc[df["applicant_race_1"] == 4, "applicant_race_1"] = 0.263726
df.loc[df["applicant_race_1"] == 5, "applicant_race_1"] = -0.145577
df.loc[df["applicant_race_1"] == 6, "applicant_race_1"] = 0.418436
df.loc[df["applicant_race_1"] == 7, "applicant_race_1"] = -1.546126


df.loc[df["applicant_sex"] == 1, "applicant_sex"] = -0.099709
df.loc[df["applicant_sex"] == 2, "applicant_sex"] = 0.125601
df.loc[df["applicant_sex"] == 3, "applicant_sex"] = 0.451703
df.loc[df["applicant_sex"] == 4, "applicant_sex"] = -1.547617


df.loc[
    (df["applicant_income_000s"] > 0.999) & (df["applicant_income_000s"] <= 35.0),
    "applicant_income_000s",
] = 1.005641
df.loc[
    (df["applicant_income_000s"] > 35.0) & (df["applicant_income_000s"] <= 46.0),
    "applicant_income_000s",
] = 0.340895
df.loc[
    (df["applicant_income_000s"] > 46.0) & (df["applicant_income_000s"] <= 56.0),
    "applicant_income_000s",
] = 0.104814
df.loc[
    (df["applicant_income_000s"] > 56.0) & (df["applicant_income_000s"] <= 66.0),
    "applicant_income_000s",
] = -0.020844
df.loc[
    (df["applicant_income_000s"] > 66.0) & (df["applicant_income_000s"] <= 78.0),
    "applicant_income_000s",
] = -0.134492
df.loc[
    (df["applicant_income_000s"] > 78.0) & (df["applicant_income_000s"] <= 92.0),
    "applicant_income_000s",
] = -0.224962
df.loc[
    (df["applicant_income_000s"] > 92.0) & (df["applicant_income_000s"] <= 111.0),
    "applicant_income_000s",
] = -0.273656
df.loc[
    (df["applicant_income_000s"] > 111.0) & (df["applicant_income_000s"] <= 138.0),
    "applicant_income_000s",
] = -0.400958
df.loc[
    (df["applicant_income_000s"] > 138.0) & (df["applicant_income_000s"] <= 191.0),
    "applicant_income_000s",
] = -0.503193
df.loc[
    (df["applicant_income_000s"] > 191.0) & (df["applicant_income_000s"] <= 260000.0),
    "applicant_income_000s",
] = -0.522607

In [30]:
df.head(5)

Unnamed: 0,property_type,loan_purpose,loan_amount_000s,region,applicant_ethnicity,applicant_race_1,applicant_sex,applicant_income_000s,TARGET
0,-0.088036,-0.50198,-0.375219,-0.139106,0.447472,-0.145577,-0.099709,-0.503193,0
1,-0.088036,0.308708,0.158796,-0.005867,0.447472,0.682999,-0.099709,1.005641,0
2,-0.088036,0.308708,-0.309645,-0.139106,0.447472,-0.234168,-0.099709,-0.522607,1
3,-0.088036,-0.50198,-0.487993,0.145859,0.447472,-0.145577,-0.099709,-0.400958,0
4,-0.088036,-0.50198,0.158796,0.145859,0.447472,-0.234168,0.125601,0.340895,0
