In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder,  StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [2]:
compas = pd.read_csv('compas-scores.csv')
compas = compas.drop_duplicates(subset = compas.columns.values.tolist()[1:], keep='first')
compas = compas.fillna("0")
compas = compas[compas['is_recid'] != -1]
columns_drop = ['id', 'name', 'first', 'last', 'compas_screening_date', 'dob', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'is_violent_recid', 'num_vr_cases', 'vr_case_number', 'vr_offense_date', 'vr_charge_degree' , 'vr_charge_desc', 'screening_date', 'v_screening_date','num_r_cases','decile_score.1','c_charge_desc']
compas = compas.drop(columns=columns_drop, errors = 'ignore')
[train, test] = train_test_split(compas,test_size=0.3)
X_train = train.drop('is_recid',axis=1) 
y_train =train['is_recid']
X_test = test.drop('is_recid',axis=1)
y_test = test['is_recid'] 

In [3]:
ordinal_cat_cols = [ 'age_cat', 'c_charge_degree','decile_score','v_decile_score','v_score_text','score_text'] 
oe = OrdinalEncoder() 
ohe = OneHotEncoder(handle_unknown='ignore')
onehot_cat_cols = ['sex','race','type_of_assessment','v_type_of_assessment']
numerical_cols =['age', 'juv_fel_count','juv_misd_count','juv_other_count','priors_count'] 
ie = SimpleImputer(missing_values=np.nan,strategy='mean')
scaler=StandardScaler()
preprocessor= ColumnTransformer( 
    transformers = [
        ('cat',ohe,onehot_cat_cols),
        ('ord',oe,ordinal_cat_cols),
        ('num', ie, numerical_cols),
        ])

In [6]:
rf_pipeline = Pipeline(steps =[
    ('preprocessor',preprocessor),
    ('rf', RandomForestRegressor(n_estimators =100,random_state=42))
    ]) 
rf_pipeline.fit(X_train,y_train.ravel())

y_pred_rf = np.round(rf_pipeline.predict(X_test))
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)
print(classification_report(y_test,y_pred_rf))

lr_pipeline = Pipeline(steps = [
    ('preprocessor',preprocessor),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
    ])

lr_pipeline.fit(X_train,y_train.ravel())
y_pred_lr = lr_pipeline.predict(X_test)
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75      2213
           1       0.49      0.43      0.46      1099

    accuracy                           0.66      3312
   macro avg       0.61      0.60      0.61      3312
weighted avg       0.65      0.66      0.66      3312

              precision    recall  f1-score   support

           0       0.72      0.91      0.81      2213
           1       0.62      0.30      0.40      1099

    accuracy                           0.71      3312
   macro avg       0.67      0.60      0.60      3312
weighted avg       0.69      0.71      0.67      3312



In [8]:
#calculaate the confusion matrix for the predictions as before
conf_mat = confusion_matrix(y_test, y_pred_rf)

In [9]:
#get the indices
privileged_indices = X_test['race'] == 'Caucasian'
unprivileged_indices = X_test['race'] != 'Caucasian'

In [11]:
# calculate FPR and FNR for privileged group (White)
tp_privileged = np.sum((y_test == 1) & (y_pred_rf == 1) & (X_test['race'] == 'Caucasian'))
fp_privileged = np.sum((y_test == 0) & (y_pred_rf == 1) & (X_test['race'] == 'Caucasian'))
tn_privileged = np.sum((y_test == 0) & (y_pred_rf == 0) & (X_test['race'] == 'Caucasian'))
fn_privileged = np.sum((y_test == 1) & (y_pred_rf == 0) & (X_test['race'] == 'Caucasian'))

fpr_privileged = fp_privileged / (fp_privileged + tn_privileged)
fnr_privileged = fn_privileged / (fn_privileged + tp_privileged)

# calculate FPR and FNR for unprivileged group (Black)
tp_unprivileged = np.sum((y_test == 1) & (y_pred_rf == 1) & (X_test['race'] == 'African-American'))
fp_unprivileged = np.sum((y_test == 0) & (y_pred_rf == 1) & (X_test['race'] == 'African-American'))
tn_unprivileged = np.sum((y_test == 0) & (y_pred_rf == 0) & (X_test['race'] == 'African-American'))
fn_unprivileged = np.sum((y_test == 1) & (y_pred_rf == 0) & (X_test['race'] == 'African-American'))

fpr_unprivileged = fp_unprivileged / (fp_unprivileged + tn_unprivileged)
fnr_unprivileged = fn_unprivileged / (fn_unprivileged + tp_unprivileged)

# print FPR and FNR for privileged and unprivileged groups
print("FPR for privileged group (White):", fpr_privileged)
print("FNR for privileged group (White):", fnr_privileged)
print("FPR for unprivileged group (Black):", fpr_unprivileged)
print("FNR for unprivileged group (Black):", fnr_unprivileged)

FPR for privileged group (White): 0.1768219832735962
FNR for privileged group (White): 0.67601246105919
FPR for unprivileged group (Black): 0.30713547052740436
FNR for unprivileged group (Black): 0.500768049155146


In [12]:
#calculate the equalized odds
equalized_odds_privileged = abs(fpr_privileged - fpr_unprivileged)
equalized_odds_unprivileged = abs(fnr_privileged - fnr_unprivileged)

print("Equalized odds for privileged group (White):", equalized_odds_privileged)
print("Equalized odds for unprivileged group (Black):", equalized_odds_unprivileged)

Equalized odds for privileged group (White): 0.13031348725380817
Equalized odds for unprivileged group (Black): 0.17524441190404405
