In [21]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,StratifiedKFold
from scipy import optimize
from sklearn.preprocessing import StandardScaler
#pip install DCCP
import cvxpy as cp
# pip install DCCP
import dccp
# ! pip install torch
import torch

In [22]:
import warnings
warnings.filterwarnings('ignore')

# Loading data

In [23]:
data = pd.read_csv("../data/compas-scores-two-years.csv")

# Data processing

## Chose useful rows and columns

In [24]:
data = data.loc[data['race'].isin(['African-American','Caucasian'])]
data=data[[ 'sex','age', 'race', 'priors_count',
             'c_charge_degree', 'c_charge_desc',
             'start', 'end', 'event', 'two_year_recid']]
data = data.dropna()
data.shape

(6129, 10)

In [25]:
data.head()

Unnamed: 0,sex,age,race,priors_count,c_charge_degree,c_charge_desc,start,end,event,two_year_recid
1,Male,34,African-American,0,F,Felony Battery w/Prior Convict,9,159,1,1
2,Male,24,African-American,4,F,Possession of Cocaine,0,63,0,1
3,Male,23,African-American,1,F,Possession of Cannabis,0,1174,0,0
6,Male,41,Caucasian,14,F,Possession Burglary Tools,5,40,1,1
8,Female,39,Caucasian,0,M,Battery,2,747,0,0


## Label

In [26]:
label = data['two_year_recid']

In [27]:
sf = (data[['race']]=='Caucasian').astype(int)
sf.index = range(sf.shape[0])

## Normalize numerical columns and encode categorical columns

In [28]:
num = data._get_numeric_data()
num = num.drop(labels='two_year_recid',axis=1)
ss = StandardScaler()
num_ss = ss.fit_transform(num)
num = pd.DataFrame(num_ss,columns=num.columns)

In [29]:
num_cols = data._get_numeric_data().columns
cat = data.drop(columns = num_cols, axis = 1)
cat = cat.drop(labels='race',axis=1)
cat = pd.get_dummies(cat)
cat.index = range(cat.shape[0])

In [30]:
clean_data = pd.concat([sf,cat,num], axis=1)

In [31]:
# row_data = pd.read_csv("../data/compas-scores-two-years.csv")

Here are all features of the dataset.

In [32]:
# print(list(row_data.columns))

And we decided to use these features, and they are

**sex**: 0 is female, and 1 is male.

**age_cat**: Age Category, 0 is less than 25, 1 is between 25 and 45, 2 is greater than 45

**race**: Sensitive attribute. 0 is African-American, and 1 is Caucasian.

**priors_count**: A continuous variable containing the number of prior crimes committed.  

**c_charge_degree**: 0 is F, and 1 is M.

**two_year_recid**: The response variable. Whether or not the defendant recidivated within two years.


In [33]:
# selected_features = ["sex","age_cat","race","priors_count","c_charge_degree","two_year_recid"]

In [34]:
# clean_data = row_data[selected_features]
# clean_data = clean_data[clean_data['race'].isin(["African-American","Caucasian"])]

# clean_data.replace('Male',1,inplace = True)
# clean_data.replace('Female',0,inplace = True)

# clean_data.replace('African-American',0,inplace = True)
# clean_data.replace('Caucasian',1,inplace = True)

# clean_data.replace('M',1,inplace = True)
# clean_data.replace('F',0,inplace = True)

# clean_data.replace('Less than 25',0,inplace = True)
# clean_data.replace('25 - 45',1,inplace = True)
# clean_data.replace('Greater than 45',2,inplace = True)

We split the training set and testing set 6:1.

In [35]:
# x = clean_data[["sex","age_cat","race","priors_count","c_charge_degree"]]
# y = clean_data["two_year_recid"]

# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=1/7.0,random_state=1)

In [36]:
clean_data = pd.concat([clean_data, label], axis=1)
clean_data = clean_data.dropna()

In [37]:
y = clean_data["two_year_recid"]
x = clean_data.drop("two_year_recid", 1)

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=1/7.0,random_state=1)

This is the most important function for implementing algorithm A4.

In [38]:
def DM(x,y,method=0):
    
    # Method 0 is Unconstrained (original) classifier
    # Method 1 is Disparate Mistreatment method (DM)
    # Method 2 is Disparate Mistreatment method with sensitive features (DM_sen)
    
    # This function will return "theta" a list of parameters of logistic regression
    
    idx_0 = x["race"] == 0   
    idx_1 = x["race"] == 1
        
    intercept = np.ones((x.shape[0],1))
    x = np.concatenate((x,intercept),axis=1)
    n = x.shape[0]
    
    theta = cp.Variable(x.shape[1])    
    loss = -cp.sum(cp.multiply(y,x @ theta) - cp.logistic(x @ theta))
    
    if method == 0:
        
        ## Unconstrained (original) classifier (logistic regression)
        
        prob = cp.Problem(cp.Minimize(loss),[])
        result = prob.solve(method = 'dccp')
    
    else:
        x_0 = x[idx_0]
        y_0 = y[idx_0]
        n_0 = x_0.shape[0]

        x_1 = x[idx_1]
        y_1 = y[idx_1]
        n_1 = x_1.shape[0]

        if method == 1:
            
            ## It is Disparate Mistreatment method (DM), sensitive attribute information 
            ## is not used while making decisions, i.e., by keeping user feature vectors (x) 
            ## and the sensitive features (z) disjoint.
            
            x = np.delete(x,2,axis=1) 
            x_0 = np.delete(x_0,2,axis=1) 
            x_1 = np.delete(x_1,2,axis=1) 
            theta = cp.Variable(x.shape[1]) 
            loss = cp.sum(cp.logistic(x @ theta)- cp.multiply(y,x @ theta))        
        
        ## For method = 2, it is Disparate Mistreatment method with sensitive features (DM_sen).
        ## The sensitive attribute information is used for decision making, resulting in 
        ## disparate treatment.

        distance_0 = x_0 @ theta              
        distance_1 = x_1 @ theta        

        g_0 = cp.minimum(0, cp.multiply(y_0, distance_0)) 
        g_1 = cp.minimum(0, cp.multiply(y_1, distance_1)) 

        c = 0
        constraints = []
        ## For here, we need to put two DCCP expressions on both sides of the inequality sign
        constraints.append( n_0/n * cp.sum(g_1) >= -c + n_1/n * cp.sum(g_0) )
        constraints.append( n_0/n * cp.sum(g_1) <=  c + n_1/n * cp.sum(g_0) )

        prob = cp.Problem(cp.Minimize(loss), constraints)
        result = prob.solve(method='dccp')
    
    theta = np.array(theta.value)
    
    return theta          

In [39]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
def predict(x,theta,disjoint = 0):
    
    ## We can use disjoint = 0 to predict method 0 and method 2,
    ## and use disjoint = 1 to predict method 1.
    
    x0 = np.ones((x.shape[0],1))
    x = np.concatenate((x,x0),axis=1)
    if disjoint != 0:
        x = np.delete(x,2,axis=1) 
    z = np.dot(x,theta)
    h = sigmoid(z)
    pred = (h>0.5).astype(float)
    return pred

In [40]:
idx_a = x_test["race"] == 0   
idx_c = x_test["race"] == 1
x_test_a = x_test[idx_a]
y_test_a = y_test[idx_a]
x_test_c = x_test[idx_c]
y_test_c = y_test[idx_c]

## 1. Unconstrained Classifier

In [41]:
thetas = DM(x_train,y_train,method=0)

In [42]:
acc = (predict(x_test,thetas)==y_test).mean()
print("The accuracy of unconstrained logistic regression is %0.2f%%" % (acc*100))

The accuracy of unconstrained logistic regression is 52.82%


The accuracy of unconstrained logistic regression is 66.55%

In [43]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas)))
cm_a  ## confusion matrix for African-American

array([[174,  74],
       [141,  59]], dtype=int64)

In [44]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas)))
cm_c  ## confusion matrix for Caucasian

array([[129,  32],
       [104,  31]], dtype=int64)

In [45]:
fnr_a = cm_a[0][1]/(cm_a[0][1]+cm_a[0][0]) 
fnr_c = cm_c[0][1]/(cm_c[0][1]+cm_c[0][0])
fpr_a = cm_a[1][0]/(cm_a[1][1]+cm_a[1][0]) 
fpr_c = cm_c[1][0]/(cm_c[1][1]+cm_c[1][0])
D_fnr = fnr_a - fnr_c
D_fpr = fpr_a - fpr_c
print("For African-American, the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_a*100,fnr_a*100) ) 
print("For Caucasian       , the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_c*100,fnr_c*100) ) 
print("The diffrent negative positives rate(D_FNR) is %0.2f%%, and the diffrent positive positives rate(D_FPR)is %0.2f%%" % (D_fpr*100,D_fnr*100) ) 

For African-American, the false positive rate of is 70.50%, and false negative rate is 29.84%
For Caucasian       , the false positive rate of is 77.04%, and false negative rate is 19.88%
The diffrent negative positives rate(D_FNR) is -6.54%, and the diffrent positive positives rate(D_FPR)is 9.96%


The differce is so big, and we try to reduce it without significantly reducing the accuracy.

## 2. Disparate Mistreatment method (DM)

We will use this to avoid disparate treatment and disparate mistreatment simultaneously. Disparate mistreatment is avoided by using fairness constraints. Disparate treatment is avoided by ensuring that sensitive attribute information is not used while making decisions, i.e., by keeping user feature vectors (x) and the sensitive features (z) disjoint

In [47]:
thetas = DM(x_train,y_train,method=1)

In [None]:
acc = (predict(x_test,thetas,1)==y_test).mean()
print("The accuracy of DM method is %0.2f%%" % (acc*100))

In [None]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas,1)))
cm_a  ## confusion matrix for African-American

In [None]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas,1)))
cm_c  ## confusion matrix for Caucasian

In [None]:
fnr_a = cm_a[0][1]/(cm_a[0][1]+cm_a[0][0]) 
fnr_c = cm_c[0][1]/(cm_c[0][1]+cm_c[0][0])
fpr_a = cm_a[1][0]/(cm_a[1][1]+cm_a[1][0]) 
fpr_c = cm_c[1][0]/(cm_c[1][1]+cm_c[1][0])
D_fnr = fnr_a - fnr_c
D_fpr = fpr_a - fpr_c
print("For African-American, the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_a*100,fnr_a*100) ) 
print("For Caucasian       , the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_c*100,fnr_c*100) ) 
print("The diffrent negative positives rate(D_FNR) is %0.2f%%, and the diffrent positive positives rate(D_FPR)is %0.2f%%" % (D_fpr*100,D_fnr*100) ) 

The accuracy decreased by 9%, but we succeeded in reducing the diffrence of FNR and diffrence of FPR a lot. It seems to be an acceptable method 

## 3. Disparate Mistreatment method with sensitive attribute (DM_sen)

We will avoid disparate mistreatment only. The user feature vectors (x) and the sensitive features (z) are not disjoint, that is, z is used as a learnable feature. Therefore, the sensitive attribute information is used for decision making, resulting in disparate
treatment.

In [None]:
thetas = DM(x_train,y_train,method=2)

In [None]:
acc = (predict(x_test,thetas)==y_test).mean()
print("The accuracy of DM method is %0.2f%%" % (acc*100))

In [None]:
cm_a = sklearn.metrics.confusion_matrix(y_test_a,(predict(x_test_a,thetas)))
cm_a  ## confusion matrix for African-American

In [None]:
cm_c = sklearn.metrics.confusion_matrix(y_test_c,(predict(x_test_c,thetas)))
cm_c  ## confusion matrix for Caucasian

In [None]:
fnr_a = cm_a[0][1]/(cm_a[0][1]+cm_a[0][0]) 
fnr_c = cm_c[0][1]/(cm_c[0][1]+cm_c[0][0])
fpr_a = cm_a[1][0]/(cm_a[1][1]+cm_a[1][0]) 
fpr_c = cm_c[1][0]/(cm_c[1][1]+cm_c[1][0])
D_fnr = fnr_a - fnr_c
D_fpr = fpr_a - fpr_c
print("For African-American, the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_a*100,fnr_a*100) ) 
print("For Caucasian       , the false positive rate of is %0.2f%%, and false negative rate is %0.2f%%" % (fpr_c*100,fnr_c*100) ) 
print("The diffrent negative positives rate(D_FNR) is %0.2f%%, and the diffrent positive positives rate(D_FPR)is %0.2f%%" % (D_fpr*100,D_fnr*100) ) 

For DM_sen, the accuracy only decreased by 1%, and the diffrence of FNR and diffrence of FPR reduce a lot. It is a very great method to avoid disparate mistreatment.