In [345]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder,  StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [346]:
compas = pd.read_csv('compas-scores.csv')
compas = compas.drop_duplicates(subset = compas.columns.values.tolist()[1:], keep='first')
compas = compas.fillna("0")
compas = compas[compas['is_recid'] != -1]
columns_drop = ['id', 'name', 'first', 'last', 'compas_screening_date', 'dob', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'is_violent_recid', 'num_vr_cases', 'vr_case_number', 'vr_offense_date', 'vr_charge_degree' , 'vr_charge_desc', 'screening_date', 'v_screening_date','num_r_cases','decile_score.1','c_charge_desc']
compas = compas.drop(columns=columns_drop, errors = 'ignore')
[train, test] = train_test_split(compas,test_size=0.3)
X_train = train.drop('is_recid',axis=1) 
y_train =train['is_recid']
X_test = test.drop('is_recid',axis=1)
y_test = test['is_recid'] 

In [347]:
ordinal_cat_cols = [ 'age_cat', 'c_charge_degree','decile_score','v_decile_score','v_score_text','score_text'] 
oe = OrdinalEncoder() 
ohe = OneHotEncoder(handle_unknown='ignore')
onehot_cat_cols = ['sex','race','type_of_assessment','v_type_of_assessment']
numerical_cols =['age', 'juv_fel_count','juv_misd_count','juv_other_count','priors_count'] 
ie = SimpleImputer(missing_values=np.nan,strategy='mean')
scaler=StandardScaler()
preprocessor= ColumnTransformer( 
    transformers = [
        ('cat',ohe,onehot_cat_cols),
        ('ord',oe,ordinal_cat_cols),
        ('num', ie, numerical_cols),
        ])

In [348]:
rf_pipeline = Pipeline(steps =[
    ('preprocessor',preprocessor),
    ('rf', RandomForestRegressor(n_estimators =100,random_state=42))
    ]) 
rf_pipeline.fit(X_train,y_train.ravel())
y_pred_rf = np.round(rf_pipeline.predict(X_test))
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)
print(classification_report(y_test,y_pred_rf))

lr_pipeline = Pipeline(steps = [
    ('preprocessor',preprocessor),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
    ])

lr_pipeline.fit(X_train,y_train.ravel())
y_pred_lr = lr_pipeline.predict(X_test)
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76      2227
           1       0.48      0.40      0.44      1085

    accuracy                           0.66      3312
   macro avg       0.61      0.60      0.60      3312
weighted avg       0.65      0.66      0.65      3312

              precision    recall  f1-score   support

           0       0.72      0.91      0.80      2227
           1       0.60      0.28      0.38      1085

    accuracy                           0.70      3312
   macro avg       0.66      0.59      0.59      3312
weighted avg       0.68      0.70      0.67      3312



In [349]:
preprocessor= ColumnTransformer( 
    transformers = [
        ('cat',ohe,onehot_cat_cols),
        ('ord',oe,ordinal_cat_cols),
        ('num',ie,numerical_cols),
        ])
rf_pipeline = Pipeline(steps =[
    ('preprocessor',preprocessor),
    ('rf', RandomForestRegressor(n_estimators =100,random_state=42))
    ]) 
rf_pipeline.fit(X_train,y_train.ravel())

y_pred_rf = np.round(rf_pipeline.predict(X_test))
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76      2227
           1       0.48      0.40      0.44      1085

    accuracy                           0.66      3312
   macro avg       0.61      0.60      0.60      3312
weighted avg       0.65      0.66      0.65      3312



In [350]:
lr_pipeline = Pipeline(steps = [
    ('preprocessor',preprocessor),
    ('lr', LogisticRegression(penalty='l2',max_iter=1000, random_state=42))
    ])
lr_pipeline.fit(X_train,y_train.ravel())
y_pred_lr = lr_pipeline.predict(X_test)
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.72      0.91      0.80      2227
           1       0.60      0.28      0.38      1085

    accuracy                           0.70      3312
   macro avg       0.66      0.59      0.59      3312
weighted avg       0.68      0.70      0.67      3312



In [351]:
y_pred_rf = np.round(rf_pipeline.predict(X_test))
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)


group_col = 'race'
privileged_groups = [{'race': 1}] # assuming 1 means Caucasian
unprivileged_groups = [{'race': 0}] # assuming 0 means African American


sp_diff = (conf_mat_rf[1,1]/np.sum(conf_mat_rf[1,:])) - (conf_mat_rf[0,1]/np.sum(conf_mat_rf[0,:]))
print(f"Statistical parity difference: {sp_diff:.3f}")

tpr_priv = conf_mat_rf[1,1]/np.sum(conf_mat_rf[1,:])
tpr_unpriv = conf_mat_rf[0,1]/np.sum(conf_mat_rf[0,:])
eopp_diff = tpr_priv - tpr_unpriv
print(f"Equal opportunity difference: {eopp_diff:.3f}")



Statistical parity difference: 0.193
Equal opportunity difference: 0.193
Predictive rate parity difference: -0.031


In [353]:
feature_importances = rf_pipeline.named_steps['rf'].feature_importances_
onehot_names = rf_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(onehot_cat_cols)
ord_names = rf_pipeline.named_steps['preprocessor'].named_transformers_['ord'].get_feature_names_out(ordinal_cat_cols)
feature_names = np.append(onehot_names,ord_names)
c_charge_degree_idx = np.where(feature_names == 'c_charge_degree')[0][0]
c_charge_degree_importance = feature_importances[c_charge_degree_idx]
print(f"Influence'c_charge_degree': {c_charge_degree_importance:.3f}")

Influence'c_charge_degree': 0.057


In [355]:

coef_lr = lr_pipeline.named_steps['lr'].coef_[0]
onehot_names = rf_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(onehot_cat_cols)
ord_names = rf_pipeline.named_steps['preprocessor'].named_transformers_['ord'].get_feature_names_out(ordinal_cat_cols)
num_names = rf_pipeline.named_steps['preprocessor'].named_transformers_['num'].get_feature_names_out(numerical_cols)
feature_names = np.append(onehot_names,ord_names)
feature_names = np.append(feature_names,num_names)
c_charge_degree_idx = np.where(feature_names == 'c_charge_degree')[0][0]
#c_charge_degree_idx = X_train.columns.get_loc('c_charge_degree')
c_charge_degree_coef = coef_lr[c_charge_degree_idx]

print(f"Coeff 'c_charge_degree' in Logistic Regression: {c_charge_degree_coef:.3f}")

Coeff 'c_charge_degree' in Logistic Regression: 0.075
