In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from fairaudit import Auditor
from fairaudit.groups import get_intersections
from fairaudit.metrics import Metric

# COMPAS data

In this notebook, we will apply the certification and flagging tools to the COMPAS dataset.

In [11]:
# Load the COMPAS two-year recidivism scores and keep only the columns used below
df_deciles = pd.read_csv('../data/compas-scores-two-years.csv')
keep_columns = ["age", "c_charge_degree", "race", "age_cat", "score_text", "sex", "priors_count", 
                "days_b_screening_arrest", "decile_score", "is_recid", "two_year_recid", 
                "c_jail_in", "c_jail_out"]

df_deciles = df_deciles[keep_columns]

# filter out rows that we do not have labels/scores for
row_filter = (df_deciles["days_b_screening_arrest"] <= 30) & (df_deciles["days_b_screening_arrest"] >= -30)
row_filter &= (df_deciles["is_recid"] != -1) & (df_deciles["c_charge_degree"] != "O") 
# NOTE(review): with pandas' default NA parsing the literal "NA" is read as NaN, and
# NaN != "NA" evaluates to True, so this line may not drop anything -- confirm whether
# `.notna()` was intended. Behavior is left unchanged here.
row_filter &= df_deciles["score_text"] != "NA"
df_deciles = df_deciles[row_filter]

group_feats = ['race', 'age_cat', 'sex'] # groups we will consider throughout

# Keep the frame as the cell's final expression: Jupyter only renders the last
# expression of a cell, so a bare `df_deciles` in the middle displays nothing.
df_deciles

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,c_jail_in,c_jail_out
0,69,F,Other,Greater than 45,Low,Male,0,-1.0,1,0,0,2013-08-13 06:03:42,2013-08-14 05:41:20
1,34,F,African-American,25 - 45,Low,Male,0,-1.0,3,1,1,2013-01-26 03:45:27,2013-02-05 05:36:53
2,24,F,African-American,Less than 25,Low,Male,4,-1.0,4,1,1,2013-04-13 04:58:34,2013-04-14 07:02:04
5,44,M,Other,25 - 45,Low,Male,0,0.0,1,0,0,2013-11-30 04:50:18,2013-12-01 12:28:56
6,41,F,Caucasian,25 - 45,Medium,Male,14,-1.0,6,1,1,2014-02-18 05:08:24,2014-02-24 12:18:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,23,F,African-American,Less than 25,Medium,Male,0,-1.0,7,0,0,2013-11-22 05:18:27,2013-11-24 02:59:20
7210,23,F,African-American,Less than 25,Low,Male,0,-1.0,3,0,0,2014-01-31 07:13:54,2014-02-02 04:03:52
7211,57,F,Other,Greater than 45,Low,Male,0,-1.0,1,0,0,2014-01-13 05:48:01,2014-01-14 07:49:46
7212,33,M,African-American,25 - 45,Low,Female,3,-1.0,2,0,0,2014-03-08 08:06:02,2014-03-09 12:18:04


In [85]:
# Decile-score cutoffs for turning COMPAS scores into a binary "high risk" prediction
angwin_threshold = 5
northpointe_threshold = 8

# Integer-encode race: one code per defendant, stored as a single feature column
unique_races, race_inds = np.unique(df_deciles['race'].to_numpy(), return_inverse=True)
race_features = race_inds.reshape(-1, 1)

# Keep only defendants whose decile score meets the Angwin cutoff
high_risk_filter = df_deciles['decile_score'] >= angwin_threshold
df_ppv = df_deciles.loc[high_risk_filter]

# Auditor inputs, all restricted to the high-risk rows:
#   x -- race code, y -- two-year recidivism label, z -- white indicators
x = race_features[high_risk_filter]
y = df_ppv['two_year_recid'].to_numpy()
races_ppv = get_intersections(x)  # group indicators, one column per group
z = races_ppv[:, 2]

# PPV metric: a row counts as a hit when y == 1; the comparison threshold is the
# mean of y over the rows where z is set (i.e. the Caucasian PPV)
metric = Metric(
    name="PPV_Caucasian",
    evaluation_function=lambda z, y: np.isclose(y, 1),
    threshold_function=lambda z, y: np.mean(y[z == True]),
)

## Certifying positive predictive value

In this section of the notebook, we verify the Northpointe Inc. analysis of COMPAS, which claims that the positive predictive value (PPV) of COMPAS is comparable between African-American and Caucasian defendants.

### Groups - African-Americans only
In the first cell, we lower bound the discrepancy in COMPAS PPV for African-American defendants relative to Caucasian defendants.

$$\epsilon = \mathbb{P} \left(Y = 1 \mid f(X) = 1, X_{\text{race}} = \text{African-American} \right) - \mathbb{P} \left(Y = 1 \mid f(X) = 1, X_{\text{race}} = \text{Caucasian} \right)$$

Running the cell outputs a 95% lower confidence bound on $\epsilon$. Note that this result is only for one group.

In [86]:
# Restrict the audit to indicator columns 0 and 2
# (the African-American and Caucasian subgroups)
races_ac = races_ppv[:, [0, 2]]

auditor = Auditor(x, y, z, metric)
auditor.calibrate_groups(
    alpha=0.05,
    type='upper',
    groups=races_ac,
    epsilon=None,  # no epsilon: calibrate for bounds rather than Boolean certificates
    bootstrap_params=dict(seed=0, B=2000),
)

# Column 0 of races_ac is the African-American group; display its lower bound
bound, value, threshold = auditor.query_group(0)
bound

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1822.69it/s]


[0.01874145481608773]

### Groups - all races

We now consider the same certification problem over all races. In particular, we issue simultaneously valid 95% confidence intervals for

$$\epsilon(G) = \mathbb{P} \left(Y = 1 \mid f(X) = 1, X_{\text{race}} = G \right) - \mathbb{P} \left(Y = 1 \mid f(X) = 1, X_{\text{race}} = \text{Caucasian} \right).$$

To tighten the bound for smaller racial subgroups, we studentize the process by $(\mathbb{P}_n(G) \vee (25/n))^{3/2}$.

In [88]:
# Same certification as before, now over every race group simultaneously
prob_threshold = 25 / len(y)
boot_params = dict(
    seed=0,
    B=500,
    student='prob_bound',
    student_threshold=prob_threshold ** 1.5,
)

auditor.calibrate_groups(
    alpha=0.05,
    type='interval',
    groups=races_ppv,
    epsilon=None,
    bootstrap_params=boot_params,
)

# Print one interval per race group, clipped to [-1, 1]
for grp_idx in range(races_ppv.shape[1]):
    in_group = races_ppv[:, grp_idx]
    # every member of the group shares one race, so the first unique value names it
    race = df_ppv['race'][in_group].unique()[0]
    bound_list, val_list, threshold_list = auditor.query_group(grp_idx)
    clipped_bound = np.clip(bound_list[0], -1, 1)
    print(f"{race}: {clipped_bound}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1495.00it/s]

African-American: [0.013491  0.1011898]
Asian: [-1.  1.]
Caucasian: [-0.06681519  0.07535086]
Hispanic: [-0.18299051  0.13286682]
Native American: [-1.  1.]
Other: [-0.20551123  0.24277097]





### Groups - all intersections

Finally, we consider the PPV certification problem over all groups formed by the intersection of race, gender, and age category. In particular, we issue simultaneously valid 95% confidence intervals for

$$\epsilon(G) = \mathbb{P} \left(Y = 1 \mid f(X) = 1, X \in G \right) - \mathbb{P} \left(Y = 1 \mid f(X) = 1, X_{\text{race}} = \text{Caucasian} \right).$$

To tighten the bound for smaller subgroups, we again studentize the process by $(\mathbb{P}_n(G) \vee (25/n))^{3/2}$.

In [89]:
# Integer-encode race, age category and sex for the high-risk defendants so that
# any intersection of the three attributes can be audited
unique_races, race_inds = np.unique(df_ppv['race'].to_numpy(), return_inverse=True)
unique_ages, age_inds = np.unique(df_ppv['age_cat'].to_numpy(), return_inverse=True)
unique_sexes, sex_inds = np.unique(df_ppv['sex'].to_numpy(), return_inverse=True)

# One row per defendant, one column per attribute: (race, age_cat, sex)
x = np.column_stack((race_inds, age_inds, sex_inds))
groups_ppv = get_intersections(x)

prob_threshold = 25 / len(y)

In [92]:
auditor = Auditor(x, y, z, metric)

boot_params = dict(
    seed=0,
    B=500,
    student='prob_bound',
    student_threshold=prob_threshold ** 1.5,
)

auditor.calibrate_groups(
    alpha=0.05,
    type='interval',
    groups=groups_ppv,
    epsilon=None,
    bootstrap_params=boot_params,
)

# Report only the groups whose smallest bound entry is at least -0.015
for group_ind in range(groups_ppv.shape[1]):
    bound, metrics, thresholds = auditor.query_group(group_ind)
    if not (np.min(bound) >= -0.015):
        continue

    members = groups_ppv[:, group_ind]
    # a feature defines the group when all of the group's rows share one value for it
    feats_filter = df_ppv.loc[members, group_feats].nunique() == 1
    feats_filter = feats_filter.index[feats_filter]
    group_identity = df_ppv.loc[members, feats_filter].drop_duplicates()
    print(f"{group_identity.values[0]}: {bound[0]}\n{'-'*100}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 999.23it/s]


['African-American']: [0.010874097198480079, 0.10145035488519238]
----------------------------------------------------------------------------------------------------
['25 - 45']: [-0.00794314730153619, 0.08889829177864567]
----------------------------------------------------------------------------------------------------
['Male']: [0.019496021771496994, 0.10070995149476372]
----------------------------------------------------------------------------------------------------
['African-American' '25 - 45']: [-0.002414859526386433, 0.1149687029950463]
----------------------------------------------------------------------------------------------------
['African-American' 'Less than 25']: [-0.012219424196002915, 0.15161883387208408]
----------------------------------------------------------------------------------------------------
['African-American' 'Male']: [0.030111125557032978, 0.12828070252791318]
---------------------------------------------------------------------------------------

## Test fairness

Next, we consider certifying COMPAS under the "test fairness" criterion, i.e. that its calibration error is comparable across subgroups. This claim was first made heuristically by Chouldechova et al. (2016) in relation to the African-American and Caucasian subgroups. We simultaneously bound the calibration error over all subgroups and decile scores here.

$$\epsilon(G, s) = \mathbb{P}(Y = 1 \mid S = s, X \in G) - \mathbb{P}(Y = 1 \mid S = s)$$

In [93]:
# Build the auditor inputs from the full (unfiltered-by-score) cohort

unique_races, race_inds = np.unique(df_deciles['race'].to_numpy(), return_inverse=True)
unique_ages, age_inds = np.unique(df_deciles['age_cat'].to_numpy(), return_inverse=True)
unique_sexes, sex_inds = np.unique(df_deciles['sex'].to_numpy(), return_inverse=True)

# x: (race, age_cat, sex) codes; y: two-year recidivism label; z: COMPAS decile score
x = np.column_stack((race_inds, age_inds, sex_inds))
y = df_deciles['two_year_recid'].to_numpy()
z = df_deciles['decile_score'].to_numpy()

groups = get_intersections(x)

# Evaluation marks rows with y == 1; the threshold is the overall mean of that
# indicator. The distinct decile scores are supplied as the calibration bins.
metric = Metric(
    name="test_fairness",
    evaluation_function=lambda z, y: np.isclose(y, 1),
    threshold_function=lambda z, y: np.mean(np.isclose(y, 1)),
    metric_params={'calibration_bins': np.unique(z)},
)

prob_threshold = 25 / len(y)

In [94]:
auditor = Auditor(x, y, z, metric)

boot_params = dict(
    seed=0,
    B=500,
    student='prob_bound',
    student_threshold=prob_threshold ** 1.5,
)

# Split the overall 0.1 error budget evenly across the distinct decile scores
auditor.calibrate_groups(
    alpha=0.1 / len(np.unique(z)),
    type='interval',
    groups=groups,
    epsilon=None,
    bootstrap_params=boot_params,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 342.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 342.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 348.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 364.20it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████

Below, we output a 90% confidence interval that is simultaneously valid over each $\epsilon(G, s)$. To avoid outputting hundreds of confidence intervals, we only show the intervals for the $G$ over which we are able to certify that $\max_{s} |\epsilon(G, s)| \leq 0.15$.

In [95]:
scores = np.unique(z)

# Show per-score intervals only for groups whose largest absolute bound is below 0.15
for g_ind in range(groups.shape[1]):
    bound_list, metrics, thresholds = auditor.query_group(g_ind)
    if not (np.max(np.abs(bound_list)) < 0.15):
        continue

    members = groups[:, g_ind]
    # a feature defines the group when all of the group's rows share one value for it
    feats_filter = df_deciles.loc[members, group_feats].nunique() == 1
    feats_filter = feats_filter.index[feats_filter]
    group_identity = df_deciles.loc[members, feats_filter].drop_duplicates()

    for score, bound in zip(scores, bound_list):
        print(f"{group_identity.values[0]}-{score}: {bound}")
        print('-'*100)

['African-American']-1: [-0.05218049860518982, 0.08532643350355225]
----------------------------------------------------------------------------------------------------
['African-American']-2: [-0.10411281943253475, 0.06076656459252128]
----------------------------------------------------------------------------------------------------
['African-American']-3: [-0.054033563232977505, 0.13140556994879532]
----------------------------------------------------------------------------------------------------
['African-American']-4: [-0.05942971459961753, 0.11313221337323515]
----------------------------------------------------------------------------------------------------
['African-American']-5: [-0.08021130571737678, 0.0961454653208808]
----------------------------------------------------------------------------------------------------
['African-American']-6: [-0.09220443129749896, 0.09510176341144862]
---------------------------------------------------------------------------------------

### Boolean certification

Here we use the Boolean certification method to identify subgroups for which $\max_{s} |\epsilon(G, s)| \leq 0.15$. 

We remark that this approach fails to identify the African-American / 25-45 subgroup that the bound certification method would have found.

In [96]:
boot_params = dict(
    seed=0,
    B=500,
    student='prob_bool',
    student_threshold=prob_threshold ** 0.5,
)

# Passing epsilon switches the query results to Boolean certificates; the 0.1
# error budget is again divided across the distinct scores
auditor.calibrate_groups(
    alpha=0.1 / len(np.unique(z)),
    type='interval',
    groups=groups,
    epsilon=0.15,
    bootstrap_params=boot_params,
)

# Print the identity of every group whose certificates all hold
for g_ind in range(groups.shape[1]):
    members = groups[:, g_ind]
    certificates, metrics, thresholds = auditor.query_group(members)

    if not np.all(certificates):
        continue

    # a feature defines the group when all of the group's rows share one value for it
    feats_filter = df_deciles.loc[members, group_feats].nunique() == 1
    feats_filter = feats_filter.index[feats_filter]
    group_identity = df_deciles.loc[members, feats_filter].drop_duplicates()
    print(group_identity.values[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 341.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 326.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 334.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 357.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████

['African-American']
['25 - 45']
['Male']


## Flagging disparities

### Localizing false positive rate disparities

The original COMPAS investigation of Angwin et al. (2016) flagged African-American defendants for having a higher false positive rate when compared to Caucasian ones, i.e.

$$\mathbb{P}\left(S \geq 5 \mid Y = 0, X_{\text{race}} = \text{African-American} \right) > \mathbb{P}\left(S \geq 5 \mid Y = 0, X_{\text{race}} = \text{Caucasian} \right)$$

Here we ask if we can localize this false positive rate disparity to a more granular subset of defendants. Define

$$\epsilon(G) = \mathbb{P}\left(S \geq 5 \mid Y = 0, X \in G \right) - \mathbb{P}\left(S \geq 5 \mid Y = 0\right)$$

We flag $G$ if we can reject the null hypothesis that $\epsilon(G) \leq 0.05$. 

In [98]:
# False positive rates are computed among defendants who did not recidivate
false_filter = df_deciles['two_year_recid'] == 0
df_fpr = df_deciles.loc[false_filter]

group_feats = ['race', 'age_cat', 'sex']
unique_races, race_inds = np.unique(df_fpr['race'].to_numpy(), return_inverse=True)
unique_ages, age_inds = np.unique(df_fpr['age_cat'].to_numpy(), return_inverse=True)
unique_sexes, sex_inds = np.unique(df_fpr['sex'].to_numpy(), return_inverse=True)

# x: (race, age_cat, sex) codes; y: recidivism label; z: high-risk flag (decile >= 5)
x = np.column_stack((race_inds, age_inds, sex_inds))
y = df_fpr['two_year_recid'].to_numpy()
z = (df_fpr['decile_score'] >= 5).to_numpy()

# Drop indicator columns that contain no members
groups_fpr = get_intersections(x)
nonempty_cols = np.any(groups_fpr, axis=0)
groups_fpr = groups_fpr[:, nonempty_cols]

# Evaluation marks rows flagged high-risk; the threshold is the overall flag rate
metric = Metric(
    name="fpr",
    evaluation_function=lambda z, y: np.isclose(z, 1),
    threshold_function=lambda z, y: np.mean(np.isclose(z, 1)),
)

In [106]:
auditor = Auditor(x, y, z, metric)

# Bootstrap configuration forwarded to flag_groups
boot_params = {'seed': 0, "student": "mad", "student_threshold": 1e-8, 
               "prob_threshold": 25 / len(x)}
flags, metric_values = auditor.flag_groups(
    groups_fpr, 
    type="lower", 
    alpha=0.1, 
    epsilon=0.05, 
    bootstrap_params=boot_params
)

# Report flagged groups only. The flag is checked first so that the pandas work
# needed to name a group is skipped for groups that are never printed.
for g_ind in range(groups_fpr.shape[1]):
    if not flags[g_ind]:
        continue

    members = groups_fpr[:, g_ind]
    # a feature defines the group when all of the group's rows share one value for it
    feats_filter = df_fpr[group_feats][members].nunique() == 1
    feats_filter = feats_filter.index[feats_filter]
    group_identity = df_fpr[feats_filter][members].drop_duplicates()
    print(f"{group_identity.values[0]}: {metric_values[0, g_ind]}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 935.78it/s]


['African-American']: 0.12067585280959826
['Less than 25']: 0.23186406580088142
['African-American' '25 - 45']: 0.11251147396863566
['African-American' 'Less than 25']: 0.29163370530579724
['Caucasian' 'Less than 25']: 0.18044015008001815
['African-American' 'Male']: 0.13393791828072618
['Less than 25' 'Female']: 0.3051372199191898
['Less than 25' 'Male']: 0.20638499175519695
['African-American' '25 - 45' 'Male']: 0.1351471900089206
['African-American' 'Less than 25' 'Female']: 0.3269237122939175
['African-American' 'Less than 25' 'Male']: 0.27957256367694605
['Caucasian' 'Less than 25' 'Female']: 0.39729408266428784
['Hispanic' 'Less than 25' 'Male']: 0.24077234353385307


### Localizing PPV disparities

The rebuttal from Northpointe claimed that COMPAS was fair if measured by PPV. Here, we see if it is possible to discover any positive predictive value disparities. Define

$$\epsilon(G) = \mathbb{P}\left(Y = 1 \mid S \geq 5, X \in G \right) - \mathbb{P}\left(Y = 1 \mid S \geq 5\right)$$

We then flag $G$ if we can reject the null hypothesis that $\epsilon(G) \geq -0.05$. 

In [122]:
# filter by binary prediction
angwin_threshold = 5
northpointe_threshold = 8
high_risk_filter = df_deciles['decile_score'] >= angwin_threshold
df_ppv = df_deciles[high_risk_filter] # filter to just high-risk offenders

# inputs to auditor are constructed below
unique_races, race_inds = np.unique(df_ppv['race'].to_numpy(), return_inverse=True)
unique_ages, age_inds = np.unique(df_ppv['age_cat'].to_numpy(), return_inverse=True)
unique_sexes, sex_inds = np.unique(df_ppv['sex'].to_numpy(), return_inverse=True)

x = np.concatenate(
    (race_inds.reshape(-1,1), age_inds.reshape(-1,1), sex_inds.reshape(-1,1)), 
    axis=1
)
y = df_ppv['two_year_recid'].to_numpy()
# white indicators, derived from this cell's own df_ppv instead of `races_ppv`
# left over from the certification section, so the cell does not rely on stale
# kernel state. The metric below never reads z; it is only passed to Auditor.
z = (df_ppv['race'] == 'Caucasian').to_numpy()
groups_ppv = get_intersections(x)

# PPV relative to the average: rows with y == 1 are hits, and the threshold is
# the overall mean of that indicator among high-risk defendants
metric = Metric(
    name="PPV_Average", 
    evaluation_function=lambda z, y : np.isclose(y, 1), 
    threshold_function=lambda z, y: np.mean(np.isclose(y, 1))
)

In [125]:
auditor = Auditor(x, y, z, metric)

boot_params = dict(
    seed=0,
    student="mad",
    student_threshold=1e-8,
    prob_threshold=25 / len(x),
)
flags, metric_values = auditor.flag_groups(
    groups_ppv,
    type="upper",
    alpha=0.1,
    epsilon=-0.05,
    bootstrap_params=boot_params,
)

# Name and print each flagged group along with its metric value
for group_ind in range(groups_ppv.shape[1]):
    if not flags[group_ind]:
        continue

    members = groups_ppv[:, group_ind]
    # a feature defines the group when all of the group's rows share one value for it
    feats_filter = df_ppv.loc[members, group_feats].nunique() == 1
    feats_filter = feats_filter.index[feats_filter]
    group_identity = df_ppv.loc[members, feats_filter].drop_duplicates()
    print(f"{group_identity.values[0]}: {metric_values[0, group_ind]}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 954.15it/s]

['Female']: -0.11314602176748438
['Less than 25' 'Female']: -0.18683897200147062
['Caucasian' 'Less than 25' 'Female']: -0.2439878321758597



