<a href="https://colab.research.google.com/github/rocpoc/demo-repo/blob/master/Mortgage_Quiklab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to the What-If Tool Challenge Lab!

In this notebook, you will use mortgage data from NY in 2017 to create two binary classifiers to determine if a mortgage applicant will be granted a loan.

You will train one classifier on each of two datasets. The first is trained on the complete dataset, while the second is trained on a subset of the dataset from which 90% of the female applicants who were granted a loan have been removed (so its training data contains 90% fewer female applicants who were granted loans).

You will then compare the two models using the What-If Tool.


Both models are trained on their respective datasets and then are compared using the What-If Tool.

# First, download the data

In [0]:
# Data from https://www.consumerfinance.gov/data-research/hmda/historic-data/?geo=ny&records=all-records&field_descriptions=labels
!wget https://files.consumerfinance.gov/hmda-historic-loan-data/hmda_2017_ny_all-records_labels.zip
!unzip hmda_2017_ny_all-records_labels.zip

In [0]:
import pandas as pd
import numpy as np
# low_memory=False makes pandas read the whole file before inferring each
# column's dtype instead of inferring it chunk by chunk, which on a large file
# can leave a column with mixed types (and emit a DtypeWarning).
data = pd.read_csv('hmda_2017_ny_all-records_labels.csv', low_memory=False)

In [0]:
# Display descriptive statistics of the raw data as this cell's output.
data.describe()

In [0]:
# Only use a subset of the columns for these models.
text_columns_to_keep = [
    'agency_name',
    'loan_type_name',
    'property_type_name',
    'loan_purpose_name',
    'owner_occupancy_name',
    'applicant_ethnicity_name',
    'applicant_race_name_1',
    'applicant_sex_name',
]
numeric_columns_to_keep = [
    'loan_amount_000s',
    'applicant_income_000s',
    'population',
    'minority_population',
    'hud_median_family_income',
]
columns_to_keep = text_columns_to_keep + numeric_columns_to_keep + ['action_taken_name']

# Drop rows with incomplete information, then keep only the rows whose outcome
# is "loan originated" or "application denied", to make this a simple binary
# classification.
df = data[columns_to_keep].dropna()
has_binary_outcome = df['action_taken_name'].isin(
    ['Loan originated', 'Application denied by financial institution'])
# .copy() makes binary_df an independent frame, so adding the label column
# below does not write into a slice of `df` (SettingWithCopyWarning).
binary_df = df[has_binary_outcome].copy()

# Label: 1 when the loan was originated, 0 when the application was denied.
binary_df['loan_granted'] = np.where(binary_df['action_taken_name'] == 'Loan originated', 1, 0)
binary_df = binary_df.drop(columns=['action_taken_name'])

# Drop 90% of loaned female applicants for a "bad training data" version.
# random_state pins which rows are dropped so re-runs give the same dataset.
loaned_females = (binary_df['applicant_sex_name'] == 'Female') & (binary_df['loan_granted'] == 1)
rows_to_drop = binary_df[loaned_females].sample(frac=.9, random_state=42).index
bad_binary_df = binary_df.drop(rows_to_drop)

In [0]:
# Turn categorical string features into simple 0/1 features (like turning "sex" into "sex_male" and "sex_female")
# dtype=float keeps the indicator columns numeric: newer pandas versions emit
# boolean indicators by default, and a frame mixing bool and numeric columns
# turns into an object array under `.values`.
dummies_df = pd.get_dummies(binary_df, columns=text_columns_to_keep, dtype=float)
# Shuffle the rows; random_state makes the order reproducible across re-runs.
dummies_df = dummies_df.sample(frac=1, random_state=42).reset_index(drop=True)

bad_dummies_df = pd.get_dummies(bad_binary_df, columns=text_columns_to_keep, dtype=float)
# Align to the complete dataset's columns (same set, same order). If removing
# rows eliminated every example of some category, its indicator column would
# otherwise be missing and the two frames would no longer line up feature by
# feature; a missing indicator is filled with 0.
bad_dummies_df = bad_dummies_df.reindex(columns=dummies_df.columns, fill_value=0.0)
bad_dummies_df = bad_dummies_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [0]:
# Normalize the numeric columns so that they all have the same scale to simplify modeling/training.
from sklearn import preprocessing

def normalize():
  """Min-max scale the numeric columns of dummies_df and bad_dummies_df in place.

  The scaler is fitted once, on the complete dataset, and that same fitted
  scaling is then applied to the limited dataset. Fitting a second scaler on
  the limited data would map identical raw values to different scaled values
  in the two frames, so a model trained on the limited frame would see
  differently scaled inputs when it is later fed rows taken from dummies_df.
  """
  column_names_to_normalize = ['loan_amount_000s', 'applicant_income_000s', 'minority_population', 'hud_median_family_income', 'population']
  min_max_scaler = preprocessing.MinMaxScaler()

  # Learn each column's min/max from the complete data and scale it.
  dummies_df[column_names_to_normalize] = min_max_scaler.fit_transform(
      dummies_df[column_names_to_normalize].values)

  # Reuse the already-fitted min/max for the limited data (transform, not fit).
  bad_dummies_df[column_names_to_normalize] = min_max_scaler.transform(
      bad_dummies_df[column_names_to_normalize].values)

normalize()

In [0]:
# Separate features from labels for both datasets, and set aside the last 20%
# of the complete data (labels included) as examples for later inspection.
# NOTE(review): both training sets use *all* rows of their frame, so the rows in
# test_data_with_labels are also part of train_data — confirm this is intended.
train_size = int(0.8 * len(dummies_df))
bad_train_size = int(0.8 * len(bad_dummies_df))

# Complete dataset: labels are the 'loan_granted' column, features are the rest.
train_labels = dummies_df['loan_granted']
train_data = dummies_df.drop(columns=['loan_granted'])

# Limited ("bad") dataset, handled the same way.
bad_train_labels = bad_dummies_df['loan_granted']
bad_train_data = bad_dummies_df.drop(columns=['loan_granted'])

# Rows from position train_size onward of the complete data, label column kept.
test_data_with_labels = dummies_df.iloc[train_size:]

In [0]:
import xgboost as xgb
import collections

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle

# Split the data into train / test sets (random_state makes the split reproducible)
x, y = train_data, train_labels
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Train the model, this will take a few minutes to run.
# 'binary:logistic' is XGBoost's objective for binary classification, which is
# what XGBClassifier is doing here ('reg:logistic' is the regression variant).
bst = xgb.XGBClassifier(objective='binary:logistic')
bst.fit(x_train, y_train)

# Get predictions on the test set and print the accuracy score.
# predict() already returns class labels, so no rounding is needed.
y_pred = bst.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("complete model accuracy:", acc, '\n')


In [0]:
# Split the bad data into train / test sets (random_state makes the split reproducible)
bad_x, bad_y = bad_train_data, bad_train_labels
bad_x_train, bad_x_test, bad_y_train, bad_y_test = train_test_split(bad_x, bad_y, random_state=42)

# Train the model, this will take a few minutes to run.
# 'binary:logistic' is XGBoost's objective for binary classification, which is
# what XGBClassifier is doing here ('reg:logistic' is the regression variant).
bst = xgb.XGBClassifier(objective='binary:logistic')
bst.fit(bad_x_train, bad_y_train)

# Get predictions on the test set and print the accuracy score.
# predict() already returns class labels, so no rounding is needed.
y_pred = bst.predict(bad_x_test)
acc = accuracy_score(bad_y_test, y_pred)
print("bad model accuracy:", acc, '\n')

In [0]:
# Train the model
from tensorflow.keras import layers
from tensorflow.keras import initializers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
# Take the constraint from tensorflow.keras as well (not the standalone `keras`
# package), so every object handed to the model comes from the same Keras
# implementation.
from tensorflow.keras.constraints import MaxNorm

# This is the size of the array we'll be feeding into our model for each example
# (one input per feature column).
input_size = train_data.shape[1]

# Small fully-connected binary classifier; the sigmoid output is the predicted
# probability that the loan is granted.
model = Sequential()
model.add(layers.Dense(64, input_shape=(input_size,), activation='relu', kernel_constraint=MaxNorm(3)))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# Keep the returned History object: the plotting cell reads the per-epoch
# training and validation metrics from it. 10% of the rows are held out by
# Keras for validation.
history = model.fit(train_data.values, train_labels.values, epochs=10, batch_size=2048, validation_split=0.1)

In [0]:
import matplotlib.pyplot as plt

def plot_history(history, metric, label):
  """Plot one metric's per-epoch training and validation curves.

  Args:
    history: Keras History object returned by `model.fit`.
    metric: key in `history.history` for the training curve; the validation
      curve is read from the same key prefixed with 'val_'.
    label: human-readable metric name, used for the y-axis and the title.
  """
  fig, ax = plt.subplots()
  ax.plot(history.history[metric])
  ax.plot(history.history['val_' + metric])
  ax.set_title('Model ' + label.lower())
  ax.set_ylabel(label)
  ax.set_xlabel('Epoch')
  # The second curve comes from fit()'s validation_split, not from the
  # separately held-out test rows, so label it as validation.
  ax.legend(['Train', 'Validation'], loc='upper left')
  plt.show()


# Plot training & validation accuracy values
plot_history(history, 'accuracy', 'Accuracy')

# Plot training & validation loss values
plot_history(history, 'loss', 'Loss')

In [0]:
# Build and fit the classifier for the limited ("bad") training data.
# Three ReLU hidden layers followed by a single sigmoid output unit.
bad_model = Sequential([
    layers.Dense(200, input_shape=(input_size,), activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(20, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
bad_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Keras holds out 10% of the rows for validation during fitting.
bad_model.fit(
    bad_train_data.values,
    bad_train_labels.values,
    batch_size=2048,
    epochs=10,
    validation_split=0.1,
)

In [0]:
!pip install witwidget --quiet

In [0]:
#@title Show model results in WIT
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder
num_datapoints = 1000  #@param {type: "number"}

# Column indices to strip out from data from WIT before passing it to the model.
columns_not_for_model_input = [
    test_data_with_labels.columns.get_loc('loan_granted'),
]


def _predict_with(keras_model, examples_to_infer):
  """Score WIT examples with `keras_model`.

  Args:
    keras_model: fitted model whose `predict` returns one sigmoid score per row.
    examples_to_infer: list of examples, each a list of feature values in the
      column order of `test_data_with_labels` (label column included).

  Returns:
    One `[score_for_class_0, score_for_class_1]` pair of Python floats per
    example; an empty list when no examples are given.
  """
  if len(examples_to_infer) == 0:
    return []
  # Delete columns not used by the model and hand Keras a single 2-D float
  # array (rather than nested Python lists).
  model_inputs = np.delete(
      np.asarray(examples_to_infer, dtype=float), columns_not_for_model_input, axis=1)
  # Expand each sigmoid score p into the two-class form [1 - p, p].
  return [[1.0 - float(pred[0]), float(pred[0])]
          for pred in keras_model.predict(model_inputs)]


def custom_predict(examples_to_infer):
  """Return class scores from the model trained on the complete dataset."""
  return _predict_with(model, examples_to_infer)


def bad_custom_predict(examples_to_infer):
  """Return class scores from the model trained on the limited dataset."""
  return _predict_with(bad_model, examples_to_infer)


examples_for_wit = test_data_with_labels.values.tolist()
column_names = test_data_with_labels.columns.tolist()

# The limited-data model is the primary model; the complete-data model is the
# one it is compared against.
config_builder = (
    WitConfigBuilder(examples_for_wit[:num_datapoints], feature_names=column_names)
    .set_custom_predict_fn(bad_custom_predict)
    .set_target_feature('loan_granted')
    .set_label_vocab(['denied', 'accepted'])
    .set_compare_custom_predict_fn(custom_predict)
    .set_model_name('limited')
    .set_compare_model_name('complete')
)

ww = WitWidget(config_builder, height=800)

Things to notice:
- In the datapoint visualization, the top arc of points (not on the diagonal) are the females in the test data, where the limited dataset model under-scores female applicants compared to the complete dataset model. You can see this clearly by binning or coloring the visualization by sex.
- In the performance & fairness tab, the complete model has much higher accuracy and f1 score.
- If you slice by sex, the complete model has equal performance across sexes, whereas the limited model is much, much worse on females.
  - If you use the fairness buttons to see the thresholds for the sexes for demographic parity between male and female, you see that the thresholds have to be wildly different for the limited model.