In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder,  StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [2]:
compas = pd.read_csv('compas-scores.csv')
compas = compas.drop_duplicates(subset = compas.columns.values.tolist()[1:], keep='first')
compas = compas.fillna("0")
compas = compas[compas['is_recid'] != -1]
columns_drop = ['id', 'name', 'first', 'last', 'compas_screening_date', 'dob', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'is_violent_recid', 'num_vr_cases', 'vr_case_number', 'vr_offense_date', 'vr_charge_degree' , 'vr_charge_desc', 'screening_date', 'v_screening_date','num_r_cases','decile_score.1','c_charge_desc']
compas = compas.drop(columns=columns_drop, errors = 'ignore')
[train, test] = train_test_split(compas,test_size=0.3)
X_train = train.drop('is_recid',axis=1) 
y_train =train['is_recid']
X_test = test.drop('is_recid',axis=1)
y_test = test['is_recid'] 

In [3]:
ordinal_cat_cols = [ 'age_cat', 'c_charge_degree','decile_score','v_decile_score','v_score_text','score_text'] 
oe = OrdinalEncoder() 
ohe = OneHotEncoder(handle_unknown='ignore')
onehot_cat_cols = ['sex','race','type_of_assessment','v_type_of_assessment']
numerical_cols =['age', 'juv_fel_count','juv_misd_count','juv_other_count','priors_count'] 
ie = SimpleImputer(missing_values=np.nan,strategy='mean')
scaler=StandardScaler()
preprocessor= ColumnTransformer( 
    transformers = [
        ('cat',ohe,onehot_cat_cols),
        ('ord',oe,ordinal_cat_cols),
        ('num', ie, numerical_cols),
        ])

In [5]:
rf_pipeline = Pipeline(steps =[
    ('preprocessor',preprocessor),
    ('rf', RandomForestRegressor(n_estimators =100,random_state=42))
    ]) 
rf_pipeline.fit(X_train,y_train.ravel())

y_pred_rf = np.round(rf_pipeline.predict(X_test))
conf_mat_rf = confusion_matrix(y_test, y_pred_rf)
print(classification_report(y_test,y_pred_rf))

lr_pipeline = Pipeline(steps = [
    ('preprocessor',preprocessor),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
    ])

lr_pipeline.fit(X_train,y_train.ravel())
y_pred_lr = lr_pipeline.predict(X_test)
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76      2221
           1       0.49      0.41      0.44      1091

    accuracy                           0.66      3312
   macro avg       0.61      0.60      0.60      3312
weighted avg       0.65      0.66      0.66      3312

              precision    recall  f1-score   support

           0       0.72      0.91      0.80      2221
           1       0.60      0.28      0.38      1091

    accuracy                           0.70      3312
   macro avg       0.66      0.59      0.59      3312
weighted avg       0.68      0.70      0.66      3312



In [6]:
# Voorspellingen maken voor RandomForestRegressor
y_pred_rf = np.round(rf_pipeline.predict(X_test))
compas_pred_rf = pd.DataFrame({'Race': compas.loc[X_test.index, 'race'],
                           'Score Text': compas.loc[X_test.index, 'score_text'],
                           'Actual': y_test,
                           'Predicted': y_pred_rf})

# Voorspellingen maken voor LogisticRegression
y_pred_lr = lr_pipeline.predict(X_test)
compas_pred_lr = pd.DataFrame({'Race': compas.loc[X_test.index, 'race'],
                           'Score Text': compas.loc[X_test.index, 'score_text'],
                           'Actual': y_test,
                           'Predicted': y_pred_lr})

# Sorteer de DataFrames op ras en scorecategorie
compas_pred_rf_sorted = compas_pred_rf.sort_values(['Race', 'Score Text'])
compas_pred_lr_sorted = compas_pred_lr.sort_values(['Race', 'Score Text'])


In [7]:
unique_races = compas['race'].unique()

for race in unique_races:
    # Voor RandomForestRegressor
    compas_race_rf = compas_pred_rf_sorted[compas_pred_rf_sorted['Race'] == race]
    conf_mat_rf_race = confusion_matrix(compas_race_rf['Actual'], compas_race_rf['Predicted'])
    report_rf_race = classification_report(compas_race_rf['Actual'], compas_race_rf['Predicted'])
    
    print(f"Race: {race} - RandomForestRegressor")
    print("Confusion Matrix:")
    print(conf_mat_rf_race)
    print("Classification Report:")
    print(report_rf_race)
    
    # Voor LogisticRegression
    compas_race_lr = compas_pred_lr_sorted[compas_pred_lr_sorted['Race'] == race]
    conf_mat_lr_race = confusion_matrix(compas_race_lr['Actual'], compas_race_lr['Predicted'])
    report_lr_race = classification_report(compas_race_lr['Actual'], compas_race_lr['Predicted'])
    
    print(f"Race: {race} - LogisticRegression")
    print("Confusion Matrix:")
    print(conf_mat_lr_race)
    print("Classification Report:")
    print(report_lr_race)
    print("\n")

Race: Other - RandomForestRegressor
Confusion Matrix:
[[113  17]
 [ 31  12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       130
           1       0.41      0.28      0.33        43

    accuracy                           0.72       173
   macro avg       0.60      0.57      0.58       173
weighted avg       0.69      0.72      0.70       173

Race: Other - LogisticRegression
Confusion Matrix:
[[129   1]
 [ 40   3]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86       130
           1       0.75      0.07      0.13        43

    accuracy                           0.76       173
   macro avg       0.76      0.53      0.50       173
weighted avg       0.76      0.76      0.68       173



Race: African-American - RandomForestRegressor
Confusion Matrix:
[[755 289]
 [329 316]]
Classification Report:
              precision    recall  f

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
group_counts = compas['race'].value_counts()
positive_outcomes = compas[compas['is_recid'] == 1]['race'].value_counts()
negative_outcomes = compas[compas['is_recid'] == 0]['race'].value_counts()


In [9]:
group_proportions = positive_outcomes / group_counts


In [10]:
overall_positive_proportion = compas[compas['is_recid'] == 1]['race'].count() / compas['race'].count()


In [11]:
disparate_impact = group_proportions / overall_positive_proportion


In [12]:
print(disparate_impact)

African-American    1.178567
Asian               0.618662
Caucasian           0.850215
Hispanic            0.771056
Native American     1.076410
Other               0.739207
Name: race, dtype: float64


In [13]:
group_counts = compas['c_charge_degree'].value_counts()
positive_outcomes = compas[compas['is_recid'] == 1]['c_charge_degree'].value_counts()
negative_outcomes = compas[compas['is_recid'] == 0]['c_charge_degree'].value_counts()

In [14]:
group_proportions = positive_outcomes / group_counts


In [15]:
overall_positive_proportion = compas[compas['is_recid'] == 1]['c_charge_degree'].count() / compas['c_charge_degree'].count()

In [16]:
disparate_impact = group_proportions / overall_positive_proportion


In [18]:
print(disparate_impact)

F    1.065464
M    0.864202
O    2.070018
Name: c_charge_degree, dtype: float64
