In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,StratifiedKFold
from scipy import optimize
#pip install DCCP
import cvxpy as cp
# pip install DCCP
import dccp

In [2]:
row_data = pd.read_csv("../data/compas-scores-two-years.csv")

Here are all features of the dataset.

In [3]:
print(list(row_data.columns))

['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date', 'v_type_of_assessment', 'v_decile_score', 'v_score_text', 'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1', 'start', 'end', 'event', 'two_year_recid']


And we decided to use these features, and they are

**sex**: 0 is female, and 1 is male.

**age_cat**: Age Category, 0 is less than 25, 1 is between 25 and 45, 2 is greater than 45

**race**: Sensitive attribute. 0 is African-American, and 1 is Caucasian.

**priors_count**: A continuous variable containing the number of prior crimes committed.  

**c_charge_degree**: 0 is F, and 1 is M.

**two_year_recid**: The response variable. Whether or not the defendant recidivated within two years.


In [4]:
selected_features = ["sex","age_cat","race","priors_count","c_charge_degree","two_year_recid"]

In [5]:
clean_data = row_data[selected_features]
clean_data = clean_data[clean_data['race'].isin(["African-American","Caucasian"])]

clean_data.replace('Male',1,inplace = True)
clean_data.replace('Female',0,inplace = True)

clean_data.replace('African-American',0,inplace = True)
clean_data.replace('Caucasian',1,inplace = True)

clean_data.replace('M',1,inplace = True)
clean_data.replace('F',0,inplace = True)

clean_data.replace('Less than 25',0,inplace = True)
clean_data.replace('25 - 45',1,inplace = True)
clean_data.replace('Greater than 45',2,inplace = True)

We split the training set and testing set 6:1.

In [6]:
x = clean_data[["sex","age_cat","race","priors_count","c_charge_degree"]]
y = clean_data["two_year_recid"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=1/7.0,random_state=1)

This is the most important function for implementing algorithm A4.

In [57]:
def DM(x,y,method=0):
    
    # Method 0 is Unconstrained (original) classifier
    # Method 1 is Disparate Mistreatment method (DM)
    # Method 2 is Disparate Mistreatment method with sensitive features (DM_sen)
    
    # This function will return "theta" a list of parameters of logistic regression
    
    idx_0 = x["race"] == 0   
    idx_1 = x["race"] == 1
        
    intercept = np.ones((x.shape[0],1))
    x = np.concatenate((x,intercept),axis=1)
    n = x.shape[0]
    
    theta = cp.Variable(x.shape[1])    
    loss = -cp.sum(cp.multiply(y,x @ theta) - cp.logistic(x @ theta))
    
    if method == 0:
        
        ## Unconstrained (original) classifier (logistic regression)
        
        prob = cp.Problem(cp.Minimize(loss),[])
        result = prob.solve(method = 'dccp')
    
    else:
        x_0 = x[idx_0]
        y_0 = y[idx_0]
        n_0 = x_0.shape[0]

        x_1 = x[idx_1]
        y_1 = y[idx_1]
        n_1 = x_1.shape[0]

        if method == 1:
            
            ## It is Disparate Mistreatment method (DM), sensitive attribute information 
            ## is not used while making decisions, i.e., by keeping user feature vectors (x) 
            ## and the sensitive features (z) disjoint.
            
            x = np.delete(x,2,axis=1) 
            x_0 = np.delete(x_0,2,axis=1) 
            x_1 = np.delete(x_1,2,axis=1) 
            theta = cp.Variable(x.shape[1]) 
            loss = cp.sum(cp.logistic(x @ theta)- cp.multiply(y,x @ theta))        
        
        ## For method = 2, it is Disparate Mistreatment method with sensitive features (DM_sen).
        ## The sensitive attribute information is used for decision making, resulting in 
        ## disparate treatment.

        distance_0 = x_0 @ theta              
        distance_1 = x_1 @ theta        

        g_0 = cp.minimum(0, cp.multiply(y_0, distance_0)) 
        g_1 = cp.minimum(0, cp.multiply(y_1, distance_1)) 

        c = 0
        constraints = []
        ## For here, we need to put two DCCP expressions on both sides of the inequality sign
        constraints.append( n_0/n * cp.sum(g_1) >= -c + n_1/n * cp.sum(g_0) )
        constraints.append( n_0/n * cp.sum(g_1) <=  c + n_1/n * cp.sum(g_0) )

        prob = cp.Problem(cp.Minimize(loss), constraints)
        result = prob.solve(method='dccp')
    
    theta = np.array(theta.value)
    
    return theta          

In [74]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
def predict(x,theta,disjoint = 0):
    
    ## We can use disjoint = 0 to predict method 0 and method 2,
    ## and use disjoint = 1 to predict method 1.
    
    x0 = np.ones((x.shape[0],1))
    x = np.concatenate((x,x0),axis=1)
    if disjoint != 0:
        x = np.delete(x,2,axis=1) 
    z = np.dot(x,theta)
    h = sigmoid(z)
    pred = (h>0.5).astype(float)
    return pred

## 1. Unconstrained Classifier

In [81]:
thetas = DM(x_train,y_train,method=0)

In [82]:
(predict(x_test,thetas)==y_test).mean()
print("The accuracy of unconstrained logistic regression is %0.2f%%" % (acc*100))

0.6655290102389079

The accuracy of unconstrained logistic regression is 66.55%

In [83]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas)))
cm_a  ## confusion matrix for African-American

array([[173,  94],
       [ 95, 184]], dtype=int64)

In [84]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas)))
cm_c  ## confusion matrix for Caucasian

array([[177,  23],
       [ 82,  51]], dtype=int64)

In [104]:
fnr_a = cm_a[0][1]/(cm_a[0][1]+cm_a[0][0]) 
fnr_c = cm_c[0][1]/(cm_c[0][1]+cm_c[0][0])
fpr_a = cm_a[1][0]/(cm_a[1][1]+cm_a[1][0]) 
fpr_c = cm_c[1][0]/(cm_c[1][1]+cm_c[1][0])
D_fnr = fnr_a - fnr_c
D_fpr = fpr_a - fpr_c
print("For African-American, the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_a*100,fnr_a*100) ) 
print("For Caucasian       , the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_c*100,fnr_c*100) ) 
print("The diffrent negative positives rate(D_FNR) is %0.2f%%, and the diffrent positive positives rate(D_FPR)is %0.2f%%" % (D_fpr*100,D_fnr*100) ) 

For African-American, the false positive rate of is 34.05%, and false negative rate is 35.21%
For Caucasian       , the false positive rate of is 61.65%, and false negative rate is 11.50%
The diffrent negative positives rate(D_FNR) is -27.60%, and the diffrent positive positives rate(D_FPR)is 23.71%


The differce is so big, and we try to reduce it without significantly reducing the accuracy.

## 2. Disparate Mistreatment method (DM)

We will use this to avoid disparate treatment and disparate mistreatment simultaneously. Disparate mistreatment is avoided by using fairness constraints. Disparate treatment is avoided by ensuring that sensitive attribute information is not used while making decisions, i.e., by keeping user feature vectors (x) and the sensitive features (z) disjoint

In [105]:
thetas = DM(x_train,y_train,method=1)

In [108]:
acc = (predict(x_test,thetas,1)==y_test).mean()
print("The accuracy of DM method is %0.2f%%" % (acc*100))

The accuracy of DM method is 57.57%


In [110]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas,1)))
cm_a  ## confusion matrix for African-American

array([[ 61, 206],
       [ 22, 257]], dtype=int64)

In [111]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas,1)))
cm_c  ## confusion matrix for Caucasian

array([[ 68, 132],
       [ 13, 120]], dtype=int64)

In [112]:
fnr_a = cm_a[0][1]/(cm_a[0][1]+cm_a[0][0]) 
fnr_c = cm_c[0][1]/(cm_c[0][1]+cm_c[0][0])
fpr_a = cm_a[1][0]/(cm_a[1][1]+cm_a[1][0]) 
fpr_c = cm_c[1][0]/(cm_c[1][1]+cm_c[1][0])
D_fnr = fnr_a - fnr_c
D_fpr = fpr_a - fpr_c
print("For African-American, the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_a*100,fnr_a*100) ) 
print("For Caucasian       , the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_c*100,fnr_c*100) ) 
print("The diffrent negative positives rate(D_FNR) is %0.2f%%, and the diffrent positive positives rate(D_FPR)is %0.2f%%" % (D_fpr*100,D_fnr*100) ) 

For African-American, the false positive rate of is 7.89%, and false negative rate is 77.15%
For Caucasian       , the false positive rate of is 9.77%, and false negative rate is 66.00%
The diffrent negative positives rate(D_FNR) is -1.89%, and the diffrent positive positives rate(D_FPR)is 11.15%


The accuracy decreased by 9%, but we succeeded in reducing the diffrence of FNR and diffrence of FPR a lot. It seems to be an acceptable method 

## 3. Disparate Mistreatment method with sensitive attribute (DM_sen)

We will avoid disparate mistreatment only. The user feature vectors (x) and the sensitive features (z) are not disjoint, that is, z is used as a learnable feature. Therefore, the sensitive attribute information is used for decision making, resulting in disparate
treatment.

In [114]:
thetas = DM(x_train,y_train,method=2)

In [116]:
acc = (predict(x_test,thetas)==y_test).mean()
print("The accuracy of DM method is %0.2f%%" % (acc*100))

The accuracy of DM method is 65.42%


In [117]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas)))
cm_a  ## confusion matrix for African-American

array([[191,  76],
       [118, 161]], dtype=int64)

In [118]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas)))
cm_c  ## confusion matrix for Caucasian

ValueError: shapes (333,5) and (6,) not aligned: 5 (dim 1) != 6 (dim 0)

In [63]:
idx_a = x_test["race"]==0
x_test_a = x_test[idx_a]
y_test_a = y_test[idx_a]

idx_c = x_test["race"]==1
x_test_c = x_test[idx_c]
y_test_c = y_test[idx_c]

In [77]:
qq = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,a,1)))
qq

array([[ 61, 206],
       [ 22, 257]], dtype=int64)

In [78]:
ww = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,a,1)))
ww

array([[ 68, 132],
       [ 13, 120]], dtype=int64)

In [79]:
ww[0][1]/(ww[0][1]+ww[0][0]) - qq[0][1]/(qq[0][1]+qq[0][0])

-0.11153558052434454

In [80]:
ww[1][0]/(ww[1][1]+ww[1][0]) - qq[1][0]/(qq[1][1]+qq[1][0])

0.01889131430727356