### Install and Import package

In [24]:
!pip install veritastool





In [25]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from veritastool.util.utility import test_function_cs
from veritastool.model import ModelContainer
from veritastool.fairness import CreditScoring
from typing import *

### Create the FairnessMetricsEvaluator class

In [26]:
class FairnessMetricsEvaluator:
    """Wire pre-computed model scores into veritastool's credit-scoring checks.

    On construction the instance thresholds the test-set probabilities into
    hard labels, bundles data and model into a ``ModelContainer`` and builds a
    ``CreditScoring`` object from that container.  The two public methods
    simply delegate to that object.
    """

    # Keys that both ``train_data`` and ``test_data`` must provide.
    _REQUIRED_KEYS = ('raw_X', 'y', 'pred_proba')

    def __init__(self,
                 train_data: Dict[str, Union[pd.DataFrame, pd.Series, np.ndarray]],
                 test_data: Dict[str, Union[pd.DataFrame, pd.Series, np.ndarray]],
                 best_threshold: float,
                 cat_var_list: List[str],
                 prot_cat_var_value: Dict[str, List[str]],
                 fairthreshold: float,
                 perf_metric_name: str,
                 model_object):
        """
        train_data['raw_X'] : All the feature values in the training set.
        train_data['y'] : Ground truth for the training data.
        train_data['pred_proba'] : Predicted probabilities for the training
        data as returned by the classifier (stored on the instance only).
        test_data['raw_X'] : All the feature values in the testing set.
        test_data['y'] : Ground truth for the testing data.
        test_data['pred_proba'] : Predicted probabilities for the testing
        data as returned by the classifier.
        best_threshold : The threshold used for predicting labels. A test
        sample is labelled 1 when its probability is strictly greater than
        this value, otherwise 0.
        cat_var_list : Names of the categorical variables to evaluate;
        forwarded to ``ModelContainer`` as ``p_var``.
        prot_cat_var_value : Python dict where the key is a string and the
        value is a list of strings. The key is the name of the categorical
        variable and the value is the name of the protected group. Forwarded
        to ``ModelContainer`` as ``p_grp``.
        fairthreshold : The threshold for deciding whether a feature is fair
        or not.
        perf_metric_name : The metric for choosing the best performance. Can
        be selected from ['selection_rate', 'accuracy', 'balanced_acc',
        'recall', 'precision', 'f1_score', 'tnr', 'fnr', 'npv', 'roc_auc',
        'log_loss'].
        model_object : An sklearn base estimator object.

        Raises KeyError if ``train_data`` or ``test_data`` lacks one of the
        keys 'raw_X', 'y' or 'pred_proba'.
        """
        # Fail early, before any veritastool work starts, and say which
        # argument and which keys are the problem.
        for arg_name, bundle in (('train_data', train_data),
                                 ('test_data', test_data)):
            missing = [key for key in self._REQUIRED_KEYS if key not in bundle]
            if missing:
                raise KeyError(
                    f"{arg_name} is missing required key(s): {missing}")

        self.train_raw_X = train_data['raw_X']
        self.train_y = train_data['y']
        self.train_proba = train_data['pred_proba']
        self.test_raw_X = test_data['raw_X']
        self.test_y = test_data['y']
        self.test_proba = test_data['pred_proba']
        self.best_threshold = best_threshold
        self.fairthreshold = fairthreshold
        self.perf_metric_name = perf_metric_name
        self.model_object = model_object

        # NOTE(review): test_function_cs appears to be veritastool's own
        # credit-scoring self-test rather than an initializer -- confirm
        # whether it needs to run on every instantiation.
        test_function_cs()

        # Hard labels for the test set: strictly above the threshold -> 1.
        self.test_preds = np.where(self.test_proba > self.best_threshold, 1, 0)

        # Protected variables and their protected groups.
        self.p_var = cat_var_list
        self.p_grp = prot_cat_var_value

        # Bundle data, predictions and model for the fairness assessment.
        self.container = ModelContainer(y_true=self.test_y,
                                        p_grp=self.p_grp,
                                        p_var=self.p_var,
                                        y_pred=self.test_preds,
                                        y_train=self.train_y,
                                        y_prob=self.test_proba,
                                        x_train=self.train_raw_X,
                                        x_test=self.test_raw_X,
                                        model_object=self.model_object,
                                        model_type='credit')

        self.cre_sco_obj = CreditScoring(model_params=[self.container],
                                         fair_threshold=self.fairthreshold,
                                         perf_metric_name=self.perf_metric_name)

    def cat_ft_evaluation(self, is_visual: bool = False):
        """
        Run the fairness evaluation on the wrapped CreditScoring object.

        is_visual: A boolean value that controls the visualization mode; it is
        passed through as ``visualize``. Defaults to False.

        Returns whatever ``CreditScoring.evaluate`` returns.
        """
        return self.cre_sco_obj.evaluate(visualize=is_visual)

    def trade_off_evaluation(self):
        """
        Run the trade-off analysis on the wrapped CreditScoring object.

        Returns whatever ``CreditScoring.tradeoff`` returns.
        """
        return self.cre_sco_obj.tradeoff()

### Function Testing 

In [27]:
# Feature/label tables for each split.
test_set = pd.read_csv('../../../data/stage_2/loanstats_2019Q1_test.csv', index_col=False)
train_set = pd.read_csv('../../../data/stage_2/loanstats_2019Q1_train.csv', index_col=False)

# Pre-computed model scores, one file per split.
test_prob = pd.read_csv('../../../data/stage_2/loanstats_test_proba.csv', index_col=False)
train_prob = pd.read_csv('../../../data/stage_2/loanstats_train_proba.csv', index_col=False)

# Attach the score column to each split (rows are matched on the index).
train_set = train_set.assign(probability=train_prob['probability'])
test_set = test_set.assign(probability=test_prob['probability'])

In [28]:
# Turn the score into a hard label on both splits: 1 at or above 0.5, else 0.
# The two masks are written out separately so rows with a missing score keep
# a missing label.
for split_frame in (train_set, test_set):
    at_or_above = split_frame['probability'] >= 0.5
    below = split_frame['probability'] < 0.5
    split_frame.loc[at_or_above, 'pred_label'] = 1
    split_frame.loc[below, 'pred_label'] = 0

In [29]:
# Columns used as model features, in the order expected downstream.
all_feature_list = [
    'loan_amnt', 'int_rate', 'installment',
    'home_ownership', 'annual_inc', 'verification_status',
    'dti', 'delinq_2yrs', 'inq_last_6mths',
    'open_acc', 'pub_rec', 'revol_bal',
    'total_acc', 'initial_list_status', 'out_prncp',
    'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'last_pymnt_amnt', 'collections_12_mths_ex_med', 'application_type',
    'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
    'open_act_il', 'open_il_12m', 'open_il_24m',
    'mths_since_rcnt_il', 'total_bal_il', 'il_util',
    'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
    'all_util', 'total_rev_hi_lim', 'inq_fi',
    'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
    'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
    'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
    'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd',
    'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
    'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
    'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq',
    'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
    'total_il_high_credit_limit', 'hardship_flag', 'debt_settlement_flag',
]

# Feature matrices for both splits, restricted to the columns above.
X_train, X_test = (
    split_frame[all_feature_list] for split_frame in (train_set, test_set)
)

In [30]:
# Package each split as the dict expected by FairnessMetricsEvaluator:
# feature matrix, ground-truth labels and model scores.
train_data, test_data = (
    {'raw_X': features,
     'y': split_frame['loan_status'],
     'pred_proba': split_frame['probability']}
    for features, split_frame in ((X_train, train_set), (X_test, test_set))
)

In [31]:
# Protected group for each categorical variable under review. The key order
# also fixes the order of the variable list passed to the evaluator.
protected_groups = {
    'home_ownership': ['RENT'],
    'verification_status': ['Not Verified'],
    'initial_list_status': ['w'],
    "application_type": ['Individual'],
}

# Build the evaluator on the prepared splits.
test_class = FairnessMetricsEvaluator(
    train_data=train_data,
    test_data=test_data,
    best_threshold=0.5,
    cat_var_list=list(protected_groups),
    prot_cat_var_value=protected_groups,
    fairthreshold=0.4,
    perf_metric_name="f1_score",
    model_object=LogisticRegression(),
)

Evaluate performance:   0%|          


Evaluation of credit scoring performed normally


In [32]:
# Fairness evaluation with visualization switched off.
test_class.cat_ft_evaluation(is_visual=False)

Evaluate performance:   0%|          


Class Distribution
	pos_label                                                           0.523%
	neg_label                                                          99.477%


Performance Metrics
	Selection Rate                                             0.004 +/- 0.001
	Accuracy                                                   0.996 +/- 0.001
	Balanced Accuracy                                          0.723 +/- 0.057
	Recall                                                     0.447 +/- 0.115
	Precision                                                  0.630 +/- 0.120
[1m	F1 Score                                                   0.523 +/- 0.103[0m
	True Negative Rate                                         0.999 +/- 0.001
	False Negative Rate                                        0.553 +/- 0.115
	Negative Predictive Value                                  0.997 +/- 0.001
	ROC AUC Score                                              0.823 +/- 0.070
	Log-loss                             

- The primary fairness metric will be suggested by this tool
- The conclusion about whether the feature is fair or not is based on the suggested primary fairness metric.
- The statistic for each fairness metric is displayed on the right, together with its error range.
- For equal opportunity, a value larger than the threshold means that the TPR differs substantially between the protected group and the unprotected group. For disparate impact, the closer the value is to 1, the better.

In [33]:
# Same fairness evaluation, this time with visualization switched on.
test_class.cat_ft_evaluation(is_visual=True)

VBox(children=(VBox(children=(VBox(children=(HBox(children=(HTML(value='<div style="color:black; text-align:le…

*The threshold and the values of ratio-based metrics are shifted down by 1.


In [34]:
# Run the performance-versus-fairness trade-off analysis on the evaluator.
test_class.trade_off_evaluation()

Tradeoff:   0%|          


-------------------------- 1. home_ownership  --------------------------
Performance versus Fairness Trade-Off
	 Single Threshold
		Privileged/Unprivileged Threshold                 0.500
		Best f1_score*                                    0.499
	 Separated Thresholds
		Privileged Threshold                              0.615
		Unprivileged Threshold                            0.658
		Best f1_score*                                    0.539
	 Separated Thresholds under Neutral Fairness (0.001)
		Privileged Threshold                              0.643
		Unprivileged Threshold                            0.500
		Best f1_score*                                    0.526
		*estimated by approximation, subject to the resolution of mesh grid

------------------------ 2. verification_status ------------------------
Performance versus Fairness Trade-Off
	 Single Threshold
		Privileged/Unprivileged Threshold                 0.500
		Best f1_score*                                    0.501
	 Separated

- Theoretically, the single threshold should be the same among all the groups.
- The separated thresholds are displayed for the privileged group and the unprivileged group, respectively.