# Welcome to the What-If Tool Challenge Lab!

In this notebook, you will use mortgage data from NY in 2017 to create two binary classifiers to determine if a mortgage applicant will be granted a loan.

You will train a classifier on two datasets. One is trained on the complete dataset, while the other is trained on a subset of the dataset where 90% of the female applicants that were granted a loan were removed from the training data (so the training data has 90% less females that were granted loans).

You will then compare the two models using the What-If Tool.


Both models are trained on their respective datasets and then are compared using the What-If Tool.

# Download and import the data

In [0]:
#You'll need to install XGBoost on the TF instance
!pip install witwidget --quiet

In [0]:
import pandas as pd
import xgboost as xgb
import numpy as np
import collections

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle
# from witwidget.notebook.visualization import WitWidget, WitConfigBuilder


# Data from https://www.consumerfinance.gov/data-research/hmda/historic-data/?geo=ny&records=all-records&field_descriptions=labels
!wget https://files.consumerfinance.gov/hmda-historic-loan-data/hmda_2017_ny_all-records_labels.zip

!unzip hmda_2017_ny_all-records_labels.zip

# Preprocess the Data

Here, we first import that dataset into a Pandas dataframe. Then we process the data to exlude incomplete information and make a simple binary classification of loan approvals. We then create two datasets, one complete and one where 90% of female applicants are removed.

In [0]:
import pandas as pd
import numpy as np

# Set column dtypes for Pandas
COLUMN_NAMES = collections.OrderedDict({
  'as_of_year': np.int16,
  'agency_abbr': 'category',
  'loan_type': 'category',
  'property_type': 'category',
  'loan_purpose': 'category',
  'owner_occupancy': np.int8,
  'loan_amt_000s': np.float64,
  'preapproval': 'category',
  'county_code': np.float64,
  'applicant_income_00s': np.float64,
  'purchaser_type': 'category',
  'hoepa_status': 'category',
  'lien_status': 'category',
  'population': np.float64,
  'ffiec_median_fam_income': np.float64,
  'tract_to_msamd_income': np.float64,
  'num_of_owner_occupied_units': np.float64,
  'number_of_1_to_4_family_units': np.float64,
  'approved': np.int8, 
  'applicant_race_name_3': 'category',
  'applicant_race_name_4': 'category',
  'applicant_race_name_5': 'category',
  'co_applicant_race_name_3': 'category',
  'co_applicant_race_name_4': 'category',
  'co_applicant_race_name_5': 'category'
})

# Import the CSV into a dataframe
data = pd.read_csv('hmda_2017_ny_all-records_labels.csv', dtype=COLUMN_NAMES)

In [4]:
# Only use a subset of the columns for these models.
text_columns_to_keep = [
             'agency_name',
             'loan_type_name',
             'property_type_name',
             'loan_purpose_name',
             'owner_occupancy_name',
             'applicant_ethnicity_name',
             'applicant_race_name_1',
             'applicant_sex_name',                      
]
numeric_columns_to_keep = [
             'loan_amount_000s',
             'applicant_income_000s',
             'population',
             'minority_population',
             'hud_median_family_income' 
]
columns_to_keep = text_columns_to_keep + numeric_columns_to_keep + ['action_taken_name']

# Drop columns with incomplete information and drop columns that don't have loan orignated or denied, to make this a simple binary classification.
df = data[columns_to_keep].dropna()
binary_df = df[df.action_taken_name.isin(['Loan originated', 'Application denied by financial institution'])]
binary_df['loan_granted']= np.where(binary_df['action_taken_name'] == 'Loan originated', 1, 0)


binary_df = binary_df.drop(columns=['action_taken_name'])

# Drop 90% of loaned female applicants for a "bad training data" version.
loaned_females = (binary_df['applicant_sex_name'] == 'Female') & (binary_df['loan_granted'] == 1)
bad_binary_df = binary_df.drop(binary_df[loaned_females].sample(frac=.9).index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:
# Label preprocessing
labels = binary_df['loan_granted'].values

# See the distribution of approved / denied classes (0: denied, 1: approved)
print(binary_df['loan_granted'].value_counts())

# Check out the head 
print("binary_df head: ",binary_df.head())

In [0]:
# Turn categorical string features into simple 0/1 features (like turning "sex" into "sex_male" and "sex_female")
dummies_df = pd.get_dummies(binary_df, columns=text_columns_to_keep)
dummies_df = dummies_df.sample(frac=1).reset_index(drop=True)

bad_dummies_df = pd.get_dummies(bad_binary_df, columns=text_columns_to_keep)
bad_dummies_df = bad_dummies_df.sample(frac=1).reset_index(drop=True)

In [0]:
# Normalize the numeric columns so that the all have the same scale to simplify modeling/training.
from sklearn import preprocessing

def normalize():
  min_max_scaler = preprocessing.MinMaxScaler()
  # wouldn't these values be gone tho
  column_names_to_normalize = ['loan_amount_000s', 'applicant_income_000s', 'minority_population', 'hud_median_family_income', 'population']
  x = dummies_df[column_names_to_normalize].values
  x_scaled = min_max_scaler.fit_transform(x)
  df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = dummies_df.index)
  dummies_df[column_names_to_normalize] = df_temp

  x = bad_dummies_df[column_names_to_normalize].values
  x_scaled = min_max_scaler.fit_transform(x)
  bad_df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = bad_dummies_df.index)
  bad_dummies_df[column_names_to_normalize] = bad_df_temp

normalize()

In [0]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle

# Get training data & labels
train_data = dummies_df
train_labels = train_data['loan_granted']
train_data = train_data.drop(columns=['loan_granted'])

# Get bad training data and labels
bad_train_data = bad_dummies_df
bad_train_labels = bad_train_data['loan_granted']
bad_train_data = bad_dummies_df.drop(columns=['loan_granted'])

# Split the data into train / test sets
x,y = train_data,train_labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

# Split the bad data into train / test sets
bad_x,bad_y=bad_train_data,bad_train_labels
bad_x_train,bad_x_test,bad_y_train,bad_y_test = train_test_split(bad_x,bad_y)

In [19]:
# Train the model, this will take a few minutes to run
model_1 = xgb.XGBClassifier(max_depth=3, learning_rate=0.05, objective='reg:logistic')

model_2 = xgb.XGBClassifier(objective='reg:logistic',max_depth=3)

model_1.fit(x_train, y_train)

model_2.fit(bad_x_train, bad_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='reg:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
# Get predictions on the test set and print the accuracy score (Model 1)
y_pred = model_1.predict(x_test)
acc = accuracy_score(y_test, y_pred.round())
print("accuracy of model_1 ", acc, '\n')

# Print a confusion matrix for Model 1
print('Confusion matrix for Model 1:')
cm = confusion_matrix(y_test, y_pred.round())
cm = cm / cm.astype(np.float).sum(axis=1)
print(cm, '\n')

# Get predictions on the test set and print the accuracy score (Model 2)
bad_y_pred = model_2.predict(bad_x_test)
acc = accuracy_score(bad_y_test, bad_y_pred.round())
print("accuracy of model_2 ", acc, '\n')

# Print a confusion matrix for Model 2
print('Confusion matrix for Model 2:')
cm = confusion_matrix(bad_y_test, bad_y_pred.round())
cm = cm / cm.astype(np.float).sum(axis=1)
print(cm)

accuracy of model_1  0.7974464038485742 

Confusion matrix for Model 1:
[[0.15173337 0.23848428]
 [0.07475147 0.97898414]] 

accuracy of model_2  0.7977705861201007 

Confusion matrix for Model 2:
[[0.43128655 0.22430305]
 [0.1462624  0.94231348]]


In [0]:
# Save the model so we can deploy it
model_1.save_model('good-model.bst')

model_2.save_model('bad-model.bst')

In [0]:
GCP_PROJECT = ''
MODEL_BUCKET = ''
MODEL_NAME = 'good-model.bst' # You'll create this model below
BAD_MODEL_NAME = 'bad-model.bst'
VERSION_NAME = 'v1'

In [0]:
# Copy your model file to Cloud Storage
!gsutil cp ./good-model.bst $MODEL_BUCKET

!gsutil cp ./bad-model.bst $MODEL_BUCKET

In [0]:
# Configure gcloud to use your project
!gcloud config set project $GCP_PROJECT

In [0]:
# Create a model
!gcloud ai-platform models create $MODEL_NAME

In [0]:
# Create a version, this will take ~2 minutes to deploy
!gcloud ai-platform versions create $VERSION_NAME \
--model=$MODEL_NAME \
--framework='XGBOOST' \
--runtime-version=1.14 \
--origin=$MODEL_BUCKET \
--staging-bucket=$MODEL_BUCKET \
--python-version=3.5 \
--project=$GCP_PROJECT

In [0]:
# Create a version, this will take ~2 minutes to deploy
!gcloud ai-platform versions create $VERSION_NAME \
--model=$BAD_MODEL_NAME \
--framework='XGBOOST' \
--runtime-version=1.14 \
--origin=$MODEL_BUCKET \
--staging-bucket=$MODEL_BUCKET \
--python-version=3.5 \
--project=$GCP_PROJECT

# Using the What-if Tool to interpret your model
Once your model has deployed, you're ready to connect it to the What-if Tool using the WitWidget.

In [0]:
# Format a subset of the test data to send to the What-if Tool for visualization
# Append ground truth label value to training data

# This is the number of examples you want to display in the What-if Tool
num_wit_examples = 500
test_examples = np.hstack((x_test[:num_wit_examples].values,y_test[:num_wit_examples].reshape(-1,1)))

In [0]:
# Create a What-if Tool visualization, it may take a minute to load
# See the cell below this for exploration ideas

# This prediction adjustment function is needed as this xgboost model's
# prediction returns just a score for the positive class of the binary
# classification, whereas the What-If Tool expects a list of scores for each
# class (in this case, both the negative class and the positive class).
def adjust_prediction(pred):
  return [1 - pred, pred]

config_builder = (WitConfigBuilder(test_examples.tolist(), data.columns.tolist() + ['mortgage_status'])
  .set_ai_platform_model(GCP_PROJECT, MODEL_NAME, VERSION_NAME, adjust_prediction=adjust_prediction)
  .set_target_feature('mortgage_status')
  .set_label_vocab(['denied', 'approved']))
WitWidget(config_builder, height=800)

In [0]:
# !pip install witwidget --quiet
#!pip3 install witwidget

#@title Show model results in WIT
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder
num_datapoints = 1000  #@param {type: "number"}

# Column indices to strip out from data from WIT before passing it to the model.
columns_not_for_model_input = [
    test_data_with_labels.columns.get_loc('loan_granted'),
]

# Return model predictions.
def custom_predict(examples_to_infer):
  # Delete columns not used by model
  model_inputs = np.delete(
      np.array(examples_to_infer), columns_not_for_model_input, axis=1).tolist()
  # Get the class predictions from the model.
  preds = model.predict(model_inputs)
  preds = [[1 - pred[0], pred[0]] for pred in preds]
  return preds
  
def bad_custom_predict(examples_to_infer):
  # Delete columns not used by model
  model_inputs = np.delete(
      np.array(examples_to_infer), columns_not_for_model_input, axis=1).tolist()

  # Get the class predictions from the model.
  preds = bad_model.predict(model_inputs)
  preds = [[1 - pred[0], pred[0]] for pred in preds]

  return preds

examples_for_wit = test_data_with_labels.values.tolist()
column_names = test_data_with_labels.columns.tolist()

config_builder = WitConfigBuilder(
    examples_for_wit[:num_datapoints],
    feature_names=column_names).set_custom_predict_fn(
  bad_custom_predict).set_target_feature('loan_granted').set_label_vocab(
      ['denied', 'accepted']).set_compare_custom_predict_fn(custom_predict).set_model_name('limited').set_compare_model_name('complete')

ww = WitWidget(config_builder, height=800)

Things to notice:
- In the datapoint visualization, the top arc of points (not on the diagonal) are the females in the test data, where the limited dataset model under-scores female applicants compared to the complete dataset model. You can see this clearly by binning or coloring the visualization by sex.
- In the performance & fairness tab, the complete model has much higher accuracy and f1 score.
- If you slice by sex, the complete model has equal performance across sexes, whereas the limited model is much, much worse on females.
  - If you use the fairness buttons to see the thresholds for the sexes for demographic parity between male and female, you see that the thresholds have to be wildly different for the limited model.