# Implementation

## Logistic Regression with 7 Features

In [2]:
# loading data and general information
import pandas as pd
import numpy as np

df_compas = pd.read_csv('compas-scores-two-years.csv')

df_compas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

In [3]:
# inspection of the first 6 rows of the dataset
df_compas.head(6)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
5,7,marsha miles,marsha,miles,2013-11-30,Male,1971-08-22,44,25 - 45,Other,...,1,Low,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0


In [4]:
# available labels for each row
print(df_compas.columns)

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


In [5]:
# removing unused columns
df_LR7 = df_compas.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# transforming string values into numerical
df_LR7['sex'] = df_LR7['sex'].astype('category')
df_LR7['sex'] = df_LR7['sex'].cat.codes

df_LR7['c_charge_degree'] = df_LR7['c_charge_degree'].astype('category')
df_LR7['c_charge_degree'] = df_LR7['c_charge_degree'].cat.codes

df_LR7['c_charge_desc'] = df_LR7['c_charge_desc'].astype('category')
df_LR7['c_charge_desc'] = df_LR7['c_charge_desc'].cat.codes

In [6]:
# checking for null values in the data => it has no null values
df_LR7.isnull().sum()

sex                0
age                0
juv_fel_count      0
juv_misd_count     0
priors_count       0
c_charge_degree    0
c_charge_desc      0
two_year_recid     0
dtype: int64

In [7]:
# splitting the dataset in features and the variable to predict

X_overall_LR7 = df_LR7.drop(columns='two_year_recid')
y_overall_LR7 = df_LR7['two_year_recid']

#### Overall

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=1000)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=28)

# accuracy
kfscore_accuracy_overall_LR7 = cross_val_score(lr, X_overall_LR7, y_overall_LR7, cv=kf, scoring='accuracy')
accuracy_overall_LR7 = round(np.average(kfscore_accuracy_overall_LR7) * 100, 2)
print('Accuracy overall:', accuracy_overall_LR7)

# standard deviation
accuracy_overall_LR7_std = round(np.std(kfscore_accuracy_overall_LR7) * 100, 2)
table_overall_LR7 = str(accuracy_overall_LR7) + ' [+/-' + str(accuracy_overall_LR7_std) + ']'

# F1-score
kfscore_f1_overall_LR7 = cross_val_score(lr, X_overall_LR7, y_overall_LR7, cv=kf, scoring='f1')
F1_mean_LR7 = round(np.mean(kfscore_f1_overall_LR7) * 100, 2)
print('Mean of the F1-score:', F1_mean_LR7)

# standard deviaton of the F1-score
F1_std_LR7 = round(np.std(kfscore_f1_overall_LR7) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_LR7)
table_F1_LR7 = str(F1_mean_LR7) + ' [+/-' + str(F1_std_LR7) + ']'


Accuracy overall: 67.58
Mean of the F1-score: 59.94
Standard deviation of the F1-score: 1.85


#### Black people

In [9]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# dataset for Black people
df_black_LR7 = df_compas.loc[df_compas['race'] == "African-American"]
df_black_LR7 = df_black_LR7.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# encoding
df_black_LR7['sex'] = df_black_LR7['sex'].astype('category')
df_black_LR7['sex'] = df_black_LR7['sex'].cat.codes
df_black_LR7['c_charge_degree'] = df_black_LR7['c_charge_degree'].astype('category')
df_black_LR7['c_charge_degree'] = df_black_LR7['c_charge_degree'].cat.codes
df_black_LR7['c_charge_desc'] = df_black_LR7['c_charge_desc'].astype('category')
df_black_LR7['c_charge_desc'] = df_black_LR7['c_charge_desc'].cat.codes

# splitting the data
X_black_LR7 = df_black_LR7.drop(columns='two_year_recid')
y_black_LR7 = df_black_LR7['two_year_recid']

# accuracy
kfscore_accuracy_black_LR7 = cross_val_score(lr, X_black_LR7, y_black_LR7, cv=kf, scoring='accuracy')
accuracy_black_LR7 = round(np.average(kfscore_accuracy_black_LR7) * 100, 2)
print('Accuracy for Black people:', accuracy_black_LR7)

# standard deviation
accuracy_black_LR7_std = round(np.std(kfscore_accuracy_black_LR7) * 100, 2)
table_black_LR7 = str(accuracy_black_LR7) + ' [+/-' + str(accuracy_black_LR7_std) + ']'

# false positive and false negative
y_pred_black_LR7 = cross_val_predict(lr, X_black_LR7, y_black_LR7, cv=kf)
conf_mat_black_LR7 = confusion_matrix(y_black_LR7, y_pred_black_LR7, normalize='true')
FP_black_LR7 = round(conf_mat_black_LR7[0,1] * 100, 2)
FN_black_LR7 = round(conf_mat_black_LR7[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_LR7)
print('False Negative for Black people:', FN_black_LR7)

Accuracy for Black people: 66.88
False Positive for Black people: 37.77
False Negative for Black people: 28.72


#### White people

In [10]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# dataset for White people 
df_white_LR7 = df_compas.loc[df_compas['race'] == 'Caucasian']
df_white_LR7 = df_white_LR7.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# encoding
df_white_LR7['sex'] = df_white_LR7['sex'].astype('category')
df_white_LR7['sex'] = df_white_LR7['sex'].cat.codes
df_white_LR7['c_charge_degree'] = df_white_LR7['c_charge_degree'].astype('category')
df_white_LR7['c_charge_degree'] = df_white_LR7['c_charge_degree'].cat.codes
df_white_LR7['c_charge_desc'] = df_white_LR7['c_charge_desc'].astype('category')
df_white_LR7['c_charge_desc'] = df_white_LR7['c_charge_desc'].cat.codes

# splitting the data
X_white_LR7 = df_white_LR7.drop(columns='two_year_recid')
y_white_LR7 = df_white_LR7['two_year_recid']

# accuracy
kfscore_accuracy_white_LR7 = cross_val_score(lr, X_white_LR7, y_white_LR7, cv=kf, scoring='accuracy')
accuracy_white_LR7 = round(np.average(kfscore_accuracy_white_LR7) * 100, 2)
print('Accuracy for White people:', accuracy_white_LR7)
accuracy_white_LR7_std = round(np.std(kfscore_accuracy_overall_LR7) * 100, 2)
table_white_LR7 = str(accuracy_overall_LR7) + ' [+/-' + str(accuracy_overall_LR7_std) + ']'

# false positive and false negative
y_pred_white_LR7 = cross_val_predict(lr, X_white_LR7, y_white_LR7, cv=kf)
conf_mat_white_LR7 = confusion_matrix(y_white_LR7, y_pred_white_LR7, normalize='true')
FP_white_LR7 = round(conf_mat_white_LR7[0,1] * 100, 2)
FN_white_LR7 = round(conf_mat_white_LR7[1,0] * 100, 2)
print('False Positive for White people:', FP_white_LR7)
print('False Negative for White people:', FN_white_LR7)

Accuracy for White people: 67.4
False Positive for White people: 11.29
False Negative for White people: 65.42


#### Percentages within the data

In [11]:
# Black people
percentage_black = round(len(df_black_LR7) / len(df_compas) * 100, 2)
print("Percentage of Black people: " + str(percentage_black) + "%")

# White people
percentage_white = round(len(df_white_LR7) / len(df_compas) * 100, 2)
print("Percentage of White people: " + str(percentage_white) + "%")

# female
df_female = df_compas.loc[df_compas['sex'] == "Female"]
percentage_female = round(len(df_female) / len(df_compas) * 100, 2)
print("Percentage of female people: " + str(percentage_female) + "%")

# male
df_male = df_compas.loc[df_compas['sex'] == "Male"]
percentage_male = round(len(df_male) / len(df_compas) * 100, 2)
print("Percentage of male people: " + str(percentage_male) + "%")


Percentage of Black people: 51.23%
Percentage of White people: 34.02%
Percentage of female people: 19.34%
Percentage of male people: 80.66%


## Logistic Regression with 2 Features

#### Overall

In [12]:
# removing unused columns to only two features
df_LR2 = df_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

# add the total number of previous convictions as a feature 
df_LR2['total_number'] = df_LR2['juv_fel_count'] + df_LR2['juv_misd_count'] + df_LR2['priors_count']
df_LR2 = df_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

# splitting the data
X_overall_LR2 = df_LR2.drop(columns='two_year_recid')
y_overall_LR2 = df_LR2['two_year_recid']

# accuracy
kfscore_accuracy_overall_LR2 = cross_val_score(lr, X_overall_LR2, y_overall_LR2, cv=kf, scoring='accuracy')
accuracy_overall_LR2 = round(np.average(kfscore_accuracy_overall_LR2) * 100, 2)
print('Accuracy overall:', accuracy_overall_LR2)

# standard deviation
accuracy_overall_LR2_std = round(np.std(kfscore_accuracy_overall_LR2) * 100, 2)
table_overall_LR2 = str(accuracy_overall_LR2) + ' [+/-' + str(accuracy_overall_LR2_std) + ']'

# F1-score
kfscore_f1_overall_LR2 = cross_val_score(lr, X_overall_LR2, y_overall_LR2, cv=kf, scoring='f1')
F1_mean_LR2 = round(np.mean(kfscore_f1_overall_LR2) * 100, 2)
print('Mean of the F1-score:', F1_mean_LR2)

# standard deviation of the F1-score
F1_std_LR2 = round(np.std(kfscore_f1_overall_LR2) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_LR2)
table_F1_LR2 = str(F1_mean_LR2) + ' [+/-' + str(F1_std_LR2) + ']'

Accuracy overall: 67.51
Mean of the F1-score: 58.38
Standard deviation of the F1-score: 3.06


#### African-American people

In [13]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# dataset for Black people
df_black_LR2 = df_black_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

# add the total number of previous convictions as a feature 
df_black_LR2['total_number'] = df_black_LR2['juv_fel_count'] + df_black_LR2['juv_misd_count'] + df_black_LR2['priors_count']
df_black_LR2 = df_black_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

# splitting the data
X_black_LR2 = df_black_LR2.drop(columns='two_year_recid')
y_black_LR2 = df_black_LR2['two_year_recid']

# accuracy
kfscore_accuracy_black_LR2 = cross_val_score(lr, X_black_LR2, y_black_LR2, cv=kf, scoring='accuracy')
accuracy_black_LR2 = round(np.average(kfscore_accuracy_black_LR2) * 100, 2)
print('Accuracy for Black people:', accuracy_black_LR2)

# standard deviation
accuracy_black_LR2_std = round(np.std(kfscore_accuracy_black_LR2) * 100, 2)
table_black_LR2 = str(accuracy_black_LR2) + ' [+/-' + str(accuracy_black_LR2_std) + ']'

# false positive and false negative
y_pred_black_LR2 = cross_val_predict(lr, X_black_LR2, y_black_LR2, cv=10)
conf_mat_black_LR2 = confusion_matrix(y_black_LR2, y_pred_black_LR2, normalize='true')
FP_black_LR2 = round(conf_mat_black_LR2[0,1] * 100, 2)
FN_black_LR2 = round(conf_mat_black_LR2[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_LR2)
print('False Negative for Black people:', FN_black_LR2)

Accuracy for Black people: 67.56
False Positive for Black people: 35.71
False Negative for Black people: 28.93


#### Caucasian people

In [14]:
# dataset for White people
df_white_LR2 = df_white_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

# add the total number of previous convictions as a feature 
df_white_LR2['total_number'] = df_white_LR2['juv_fel_count'] + df_white_LR2['juv_misd_count'] + df_white_LR2['priors_count']
df_white_LR2 = df_white_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

# splitting the data
X_white_LR2 = df_white_LR2.drop(columns='two_year_recid')
y_white_LR2 = df_white_LR2['two_year_recid']

# accuracy
kfscore_accuracy_white_LR2 = cross_val_score(lr, X_white_LR2, y_white_LR2, cv=kf, scoring='accuracy')
accuracy_white_LR2 = round(np.average(kfscore_accuracy_white_LR2) * 100, 2)
print('Accuracy for White people:', accuracy_white_LR2)

#standard deviation
accuracy_white_LR2_std = round(np.std(kfscore_accuracy_overall_LR2) * 100, 2)
table_white_LR2 = str(accuracy_white_LR2) + ' [+/-' + str(accuracy_white_LR2_std) + ']'

# false positive and false negative
y_pred_white_LR2 = cross_val_predict(lr, X_white_LR2, y_white_LR2, cv=10)
conf_mat_white_LR2 = confusion_matrix(y_white_LR2, y_pred_white_LR2, normalize='true')
FP_white_LR2 = round(conf_mat_white_LR2[0,1] * 100, 2)
FN_white_LR2 = round(conf_mat_white_LR2[1,0] * 100, 2)
print('False Positive for White people:', FP_white_LR2)
print('False Negative for White people:', FN_white_LR2)

Accuracy for White people: 67.97
False Positive for White people: 10.48
False Negative for White people: 65.32


## Nonlinear Support Vector Machine with radial basis kernel and 7 Features

In [15]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf')

# accuracy
kfscore_accuracy_overall_SVM = cross_val_score(svm, X_overall_LR7, y_overall_LR7, cv=kf, scoring='accuracy')
accuracy_overall_SVM = round(np.average(kfscore_accuracy_overall_SVM) * 100, 2)
print('Accuracy overall:', accuracy_overall_SVM)

# standard deviation
accuracy_overall_SVM_std = round(np.std(kfscore_accuracy_overall_SVM) * 100, 2)
table_overall_SVM = str(accuracy_overall_SVM) + ' [+/-' + str(accuracy_overall_SVM_std) + ']'

# F1-score
kfscore_f1_overall_SVM = cross_val_score(svm, X_overall_LR7, y_overall_LR7, cv=kf, scoring='f1')
F1_mean_SVM = round(np.mean(kfscore_f1_overall_SVM) * 100, 2)
print('Mean of the F1-score:', F1_mean_SVM)

# standard deviaton of the F1-score
F1_std_SVM = round(np.std(kfscore_f1_overall_SVM * 100), 2)
print('Standard deviation of the F1-score:', F1_std_SVM)
table_F1_SVM = str(F1_mean_SVM) + ' [+/-' + str(F1_std_SVM) + ']'

# accuracy for Black dataset
kfscore_accuracy_black_SVM = cross_val_score(svm, X_black_LR7, y_black_LR7, cv=kf, scoring='accuracy')
accuracy_black_SVM = round(np.average(kfscore_accuracy_black_SVM) * 100, 2)
print('Accuracy for Black people:', accuracy_black_SVM)

# standard deviation for Black dataset
accuracy_black_SVM_std = round(np.std(kfscore_accuracy_black_SVM) * 100, 2)
table_black_SVM = str(accuracy_black_SVM) + ' [+/-' + str(accuracy_black_SVM_std) + ']'

# accuracy for White dataset
kfscore_accuracy_white_SVM = cross_val_score(svm, X_white_LR7, y_white_LR7, cv=kf, scoring='accuracy')
accuracy_white_SVM = round(np.average(kfscore_accuracy_white_SVM) * 100, 2)
print('Accuracy for White people:', accuracy_white_SVM)

# standard deviation for White dataset
accuracy_white_SVM_std = round(np.std(kfscore_accuracy_white_SVM) * 100, 2)
table_white_SVM = str(accuracy_white_SVM) + ' [+/-' + str(accuracy_white_SVM_std) + ']'

# false positive and false negative for Black dataset
y_pred_black_SVM = cross_val_predict(svm, X_black_LR7, y_black_LR7, cv=kf)
conf_mat_black_SVM = confusion_matrix(y_black_LR7, y_pred_black_LR7, normalize='true')
FP_black_SVM = round(conf_mat_black_SVM[0,1] * 100, 2)
FN_black_SVM = round(conf_mat_black_SVM[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_SVM)
print('False Negative for Black people:', FN_black_SVM)

# false positive and false negative for White dataset
y_pred_white_SVM = cross_val_predict(svm, X_white_LR7, y_white_LR7, cv=kf)
conf_mat_white_SVM = confusion_matrix(y_white_LR7, y_pred_white_LR7, normalize='true')
FP_white_SVM = round(conf_mat_white_SVM[0,1] * 100, 2)
FN_white_SVM = round(conf_mat_white_SVM[1,0] * 100, 2)
print('False Positive for White people:', FP_white_SVM)
print('False Negative for White people:', FN_white_SVM)

Accuracy overall: 62.84
Mean of the F1-score: 50.11
Standard deviation of the F1-score: 1.98
Accuracy for Black people: 61.2
Accuracy for White people: 61.74
False Positive for Black people: 37.77
False Negative for Black people: 28.72
False Positive for White people: 11.29
False Negative for White people: 65.42


# Comparison

## Logistic Regression with 8 features (including race)

#### Overall

In [16]:
# removing unused columns
df_LR8 = df_compas.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# encoding
df_LR8['sex'] = df_LR8['sex'].astype('category')
df_LR8['sex'] = df_LR8['sex'].cat.codes
df_LR8['c_charge_degree'] = df_LR8['c_charge_degree'].astype('category')
df_LR8['c_charge_degree'] = df_LR8['c_charge_degree'].cat.codes
df_LR8['c_charge_desc'] = df_LR8['c_charge_desc'].astype('category')
df_LR8['c_charge_desc'] = df_LR8['c_charge_desc'].cat.codes
df_LR8['race'] = df_LR8['race'].astype('category')
df_LR8['race'] = df_LR8['race'].cat.codes

# splitting the data
X_overall_LR8 = df_LR8.drop(columns='two_year_recid')
y_overall_LR8 = df_LR8['two_year_recid']

# accuracy
kfscore_accuracy_overall_LR8 = cross_val_score(lr, X_overall_LR8, y_overall_LR8, cv=kf, scoring='accuracy')
accuracy_overall_LR8 = round(np.average(kfscore_accuracy_overall_LR8) * 100, 2)
print('Accuracy overall:', accuracy_overall_LR8)

# standard deviation
accuracy_overall_LR8_std = round(np.std(kfscore_accuracy_overall_LR8) * 100, 2)
table_overall_LR8 = str(accuracy_overall_LR8) + ' [+/-' + str(accuracy_overall_LR8_std) + ']'

# F1-score
kfscore_f1_overall_LR8 = cross_val_score(lr, X_overall_LR8, y_overall_LR8, cv=kf, scoring='f1')
F1_mean_LR8 = round(np.mean(kfscore_f1_overall_LR8) * 100, 2)
print('Mean of the F1-score:', F1_mean_LR8)

# standard deviation of the F1-score
F1_std_LR8 = round(np.std(kfscore_f1_overall_LR8) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_LR8)
table_F1_LR8 = str(F1_mean_LR8) + ' [+/-' + str(F1_std_LR8) + ']'


Accuracy overall: 67.74
Mean of the F1-score: 60.42
Standard deviation of the F1-score: 2.07


#### African-American people

In [17]:
# dataset for Black people
df_black_LR8 = df_compas.loc[df_compas['race'] == "African-American"]
df_black_LR8 = df_black_LR8.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# encoding
df_black_LR8['sex'] = df_black_LR8['sex'].astype('category')
df_black_LR8['sex'] = df_black_LR8['sex'].cat.codes
df_black_LR8['c_charge_degree'] = df_black_LR8['c_charge_degree'].astype('category')
df_black_LR8['c_charge_degree'] = df_black_LR8['c_charge_degree'].cat.codes
df_black_LR8['c_charge_desc'] = df_black_LR8['c_charge_desc'].astype('category')
df_black_LR8['c_charge_desc'] = df_black_LR8['c_charge_desc'].cat.codes
df_black_LR8['race'] = df_black_LR8['race'].astype('category')
df_black_LR8['race'] = df_black_LR8['race'].cat.codes

# splitting the data
X_black_LR8 = df_black_LR8.drop(columns='two_year_recid')
y_black_LR8 = df_black_LR8['two_year_recid']

# accuracy
kfscore_accuracy_black_LR8 = cross_val_score(lr, X_black_LR8, y_black_LR8, cv=kf, scoring='accuracy')
accuracy_black_LR8 = round(np.average(kfscore_accuracy_black_LR8) * 100, 2)
print('Accuracy for Black people:', accuracy_black_LR8)

# standard deviation
accuracy_black_LR8_std = round(np.std(kfscore_accuracy_black_LR8) * 100, 2)
table_black_LR8 = str(accuracy_black_LR8) + ' [+/-' + str(accuracy_black_LR8_std) + ']'

# false positive and false negative
y_pred_black_LR8 = cross_val_predict(lr, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_LR8 = confusion_matrix(y_black_LR8, y_pred_black_LR8, normalize='true')
FP_black_LR8 = round(conf_mat_black_LR8[0,1] * 100, 2)
FN_black_LR8 = round(conf_mat_black_LR8[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_LR8)
print('False Negative for Black people:', FN_black_LR8)

Accuracy for Black people: 66.91
False Positive for Black people: 37.77
False Negative for Black people: 28.67


#### Caucasian people

In [18]:
# dataset for White people
df_white_LR8 = df_compas.loc[df_compas['race'] == 'Caucasian']
df_white_LR8 = df_white_LR8.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# encoding
df_white_LR8['sex'] = df_white_LR8['sex'].astype('category')
df_white_LR8['sex'] = df_white_LR8['sex'].cat.codes
df_white_LR8['c_charge_degree'] = df_white_LR8['c_charge_degree'].astype('category')
df_white_LR8['c_charge_degree'] = df_white_LR8['c_charge_degree'].cat.codes
df_white_LR8['c_charge_desc'] = df_white_LR8['c_charge_desc'].astype('category')
df_white_LR8['c_charge_desc'] = df_white_LR8['c_charge_desc'].cat.codes
df_white_LR8['race'] = df_white_LR8['race'].astype('category')
df_white_LR8['race'] = df_white_LR8['race'].cat.codes

# splitting the data
X_white_LR8 = df_white_LR8.drop(columns='two_year_recid')
y_white_LR8 = df_white_LR8['two_year_recid']

# accuracy
kfscore_accuracy_white_LR8 = cross_val_score(lr, X_white_LR8, y_white_LR8, cv=kf, scoring='accuracy')
accuracy_white_LR8 = round(np.average(kfscore_accuracy_white_LR8) * 100, 2)
print('Accuracy for White people:', accuracy_white_LR8)

# standard deviation
accuracy_white_LR8_std = round(np.std(kfscore_accuracy_overall_LR8) * 100, 2)
table_white_LR8 = str(accuracy_overall_LR8) + ' [+/-' + str(accuracy_overall_LR8_std) + ']'

# false positive and false negative
y_pred_white_LR8 = cross_val_predict(lr, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_LR8 = confusion_matrix(y_white_LR8, y_pred_white_LR8, normalize='true')
FP_white_LR8 = round(conf_mat_white_LR8[0,1] * 100, 2)
FN_white_LR8 = round(conf_mat_white_LR8[1,0] * 100, 2)
print('False Positive for White people:', FP_white_LR8)
print('False Negative for White people:', FN_white_LR8)

Accuracy for White people: 67.4
False Positive for White people: 11.29
False Negative for White people: 65.42


# Further Observation

## Logistic Regression with all features

#### Overall

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# splitting the data
X_overall_LR = df_compas.drop(columns='two_year_recid')
y_overall_LR = df_compas['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_overall_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_overall_LR.select_dtypes(include=['object']).columns

# pipeline to encode the data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scaler', MinMaxScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='-1')),
    ('enocder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# preprocessor to combine the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
).set_output(transform='pandas')

# add the preprocessor to the model
lr_process = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lr)
])

In [19]:
# accuracy
kfscore_accuracy_overall_LR = cross_val_score(lr_process, X_overall_LR, y_overall_LR, cv=kf, scoring='accuracy')
accuracy_overall_LR = round(np.average(kfscore_accuracy_overall_LR) * 100, 2)
print('Accuracy overall:', accuracy_overall_LR)

# standard deviation
accuracy_overall_LR_std = round(np.std(kfscore_accuracy_overall_LR) * 100, 2)
table_overall_LR = str(accuracy_overall_LR) + ' [+/-' + str(accuracy_overall_LR_std) + ']'

# F1-score
kfscore_f1_overall_LR = cross_val_score(lr_process, X_overall_LR, y_overall_LR, cv=kf, scoring='f1')
F1_mean_LR = round(np.mean(kfscore_f1_overall_LR) * 100, 2)
print('Mean of the F1-score:', F1_mean_LR)

# standard deviation of the F1-score
F1_std_LR = round(np.std(kfscore_f1_overall_LR) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_LR)
table_F1_LR = str(F1_mean_LR) + ' [+/-' + str(F1_std_LR) + ']'

Accuracy overall: 97.6
Mean of the F1-score: 97.41
Standard deviation of the F1-score: 0.68


#### African-American people

In [20]:
# dataset for Black people
df_black_LR = df_compas.loc[df_compas['race'] == "African-American"]

# splitting the data
X_black_LR = df_black_LR.drop(columns='two_year_recid')
y_black_LR = df_black_LR['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_black_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_black_LR.select_dtypes(include=['object']).columns

# accuracy
kfscore_accuracy_black_LR = cross_val_score(lr_process, X_black_LR, y_black_LR, cv=kf, scoring='accuracy')
accuracy_black_LR = round(np.average(kfscore_accuracy_black_LR) * 100, 2)
print('Accuracy for Black people:', accuracy_black_LR)

# standard deviation
accuracy_black_LR_std = round(np.std(kfscore_accuracy_black_LR) * 100, 2)
table_black_LR = str(accuracy_black_LR) + ' [+/-' + str(accuracy_black_LR_std) + ']'

# false positive and false negative
y_pred_black_LR = cross_val_predict(lr_process, X_black_LR, y_black_LR, cv=kf)
conf_mat_black_LR = confusion_matrix(y_black_LR, y_pred_black_LR, normalize='true')
FP_black_LR = round(conf_mat_black_LR[0,1] * 100, 2)
FN_black_LR = round(conf_mat_black_LR[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_LR)
print('False Negative for Black people:',  FN_black_LR)

Accuracy for Black people: 96.78
False Positive for Black people: 6.35
False Negative for Black people: 0.26


#### Caucasian people

In [21]:
# dataset for White people
df_white_LR = df_compas.loc[df_compas['race'] == 'Caucasian']

# splitting the data
X_white_LR = df_white_LR.drop(columns='two_year_recid')
y_white_LR = df_white_LR['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_white_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_white_LR.select_dtypes(include=['object']).columns

# accuracy
kfscore_accuracy_white_LR = cross_val_score(lr_process, X_white_LR, y_white_LR, cv=kf, scoring='accuracy')
accuracy_white_LR = round(np.average(kfscore_accuracy_white_LR) * 100, 2)
print('Accuracy for White people:', accuracy_white_LR)

# standard deviation
accuracy_white_LR_std = round(np.std(kfscore_accuracy_overall_LR) * 100, 2)
table_white_LR = str(accuracy_overall_LR) + ' [+/-' + str(accuracy_overall_LR_std) + ']'

# false positive and false negative
y_pred_white_LR = cross_val_predict(lr_process, X_white_LR, y_white_LR, cv=kf)
conf_mat_white_LR = confusion_matrix(y_white_LR, y_pred_white_LR, normalize='true')
FP_white_LR = round(conf_mat_white_LR[0,1] * 100, 2)
FN_white_LR = round(conf_mat_white_LR[1,0] * 100, 2)
print('False Positive for White people:', FP_white_LR)
print('False Negative for White people:', FN_white_LR)

Accuracy for White people: 97.68
False Positive for White people: 3.83
False Negative for White people: 0.0


####

# Experiments

## Mitigation

#### Vanilla XG Boost


In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# the chosen parameters originate from the paper about Tree Boosting Methods

# parameters 
params_vanilla = {
    'max_depth': 6,
    'learning_rate': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
}

# model
xgb_vanilla = XGBClassifier(**params_vanilla)

# F1-score
xgb_cv = cross_val_score(xgb_vanilla, X_overall_LR8, y_overall_LR8, cv=kf, scoring='f1')
F1_mean_vanilla = round(np.mean(xgb_cv) * 100, 2)
print('Mean of the F1-score:', F1_mean_vanilla)

# standard deviation of the F1-score
F1_std_vanilla = round(np.std(xgb_cv) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_vanilla)

# false positive and false negative for Black dataset
y_pred_black_vanilla = cross_val_predict(xgb_vanilla, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_vanilla = confusion_matrix(y_black_LR8, y_pred_black_vanilla, normalize='true')
FP_black_vanilla = round(conf_mat_black_vanilla[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_vanilla)
FN_black_vanilla = round(conf_mat_black_vanilla[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_vanilla)

# false positive and false negative for White dataset
y_pred_white_vanilla = cross_val_predict(xgb_vanilla, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_vanilla = confusion_matrix(y_white_LR8, y_pred_white_vanilla, normalize='true')
FP_white_vanilla = round(conf_mat_white_vanilla[0,1] * 100, 2)
print('False Positive for White people:', FP_white_vanilla)
FN_white_vanilla = round(conf_mat_white_vanilla[1,0] * 100, 2)
print('False Negative for White people:', FN_white_vanilla)


Mean of the F1-score: 60.05
Standard deviation of the F1-score: 1.84
False Positive for Black people: 34.15
False Negative for Black people: 34.67
False Positive for White people: 24.73
False Negative for White people: 54.24


#### XG Boost with RandomizedSearch


In [20]:
# beginning parameters for tuning are from the paper about Tree Boosting Methods
# param = {
#     'max_depth': [3, 6, 12, 20],
#     'learning_rate': [0.02, 0.1, 0.2],
#     'subsample': [0.4, 0.8, 1],
#     'colsample_bytree': [0.4, 0.6, 1],
#     'n_estimators': [100, 1000, 5000],
# }

# parameters
param_rs = {
    'max_depth': [3],
    'learning_rate': [0.1],
    'subsample': [1],
    'colsample_bytree': [0.6],
    'n_estimators': [100],
}

# model
xgb_rs = XGBClassifier()

# randomized search and tuning
random_search = RandomizedSearchCV(xgb_rs, param_distributions=param_rs, scoring='f1', cv=kf, n_iter=1)
random_search.fit(X_overall_LR8, y_overall_LR8)

# F1-score
F1_mean_rs = round(np.mean(random_search.score(X_overall_LR8, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_rs)

# standard deviation of the F1-score
F1_std_rs = random_search.cv_results_['std_test_score']
F1_std_rs_rounded = round(F1_std_rs[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_rs_rounded)

# false positive and false negative for Black dataset
y_pred_black_rs = cross_val_predict(random_search, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_rs = confusion_matrix(y_black_LR8, y_pred_black_rs, normalize='true')
FP_black_rs = round(conf_mat_black_rs[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_rs)
FN_black_rs = round(conf_mat_black_rs[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_rs)

# false positive and false negative for White dataset
y_pred_white_rs = cross_val_predict(random_search, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_rs = confusion_matrix(y_white_LR8, y_pred_white_rs, normalize='true')
FP_white_rs = round(conf_mat_white_rs[0,1] * 100, 2)
print('False Positive for White people:', FP_white_rs)
FN_white_rs = round(conf_mat_white_rs[1,0] * 100, 2)
print('False Negative for White people:', FN_white_rs)

Mean of the F1-score: 63.91
Standard deviation of the F1-score: 2.66
False Positive for Black people: 33.15
False Negative for Black people: 31.51
False Positive for White people: 15.52
False Negative for White people: 57.56


#### XG Boost with GridSearch

In [21]:
# beginning parameters for tuning are from the paper about Tree Boosting Methods
# param = {
#     'max_depth': [3, 6, 12, 20],
#     'learning_rate': [0.02, 0.1, 0.2],
#     'subsample': [0.4, 0.8, 1],
#     'colsample_bytree': [0.4, 0.6, 1],
#     'n_estimators': [100, 1000, 5000],
# }

# parameters
param_gs = {
    'max_depth': [3],
    'learning_rate': [0.1],
    'subsample': [0.4],
    'colsample_bytree': [0.6],
    'n_estimators': [100],
}

# model
xgb_gs = XGBClassifier()

# grid search and tuning
grid_search = GridSearchCV(xgb_gs, param_grid=param_gs, scoring='f1', cv=kf, n_jobs=-1)
grid_search.fit(X_overall_LR8, y_overall_LR8)

# F1-score
F1_mean_gs = round(np.mean(grid_search.score(X_overall_LR8, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_gs)

# standard deviation of the F1-score
F1_std_gs = grid_search.cv_results_['std_test_score']
F1_std_gs_rounded = round(F1_std_gs[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_gs_rounded)

# false positive and false negative for Black dataset
y_pred_black_gs = cross_val_predict(grid_search, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_gs = confusion_matrix(y_black_LR8, y_pred_black_gs, normalize='true')
FP_black_gs = round(conf_mat_black_gs[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_rs)
FN_black_gs = round(conf_mat_black_gs[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_rs)

# false positive and false negative for White dataset
y_pred_white_gs = cross_val_predict(grid_search, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_gs = confusion_matrix(y_white_LR8, y_pred_white_gs, normalize='true')
FP_white_gs = round(conf_mat_white_gs[0,1] * 100, 2)
print('False Positive for White people:', FP_white_rs)
FN_white_gs = round(conf_mat_white_gs[1,0] * 100, 2)
print('False Negative for White people:', FN_white_rs)



Mean of the F1-score: 64.64
Standard deviation of the F1-score: 2.58
False Positive for Black people: 33.15
False Negative for Black people: 31.51
False Positive for White people: 15.52
False Negative for White people: 57.56


#### Grid Search with XG Boost and Fairlearn

In [22]:
from fairlearn.preprocessing import CorrelationRemover

# transforming the data with the correlation remover
cr = CorrelationRemover(sensitive_feature_ids=['race'])
X_overall_fairlearn = cr.fit_transform(X_overall_LR8)

# beginning parameters for tuning are from the paper about Tree Boosting Methods
# param = {
#     'max_depth': [3, 6, 12, 20],
#     'learning_rate': [0.02, 0.1, 0.2],
#     'subsample': [0.4, 0.8, 1],
#     'colsample_bytree': [0.4, 0.6, 1],
#     'n_estimators': [100, 1000, 5000],
# }

# parameters
param_gs_cr = {
    'max_depth': [3],
    'learning_rate': [0.2],
    'subsample': [0.4],
    'colsample_bytree': [0.4],
    'n_estimators': [100],
}

# model
xgb_gs_cr = XGBClassifier()

# grid search and tuning
grid_search_cr = GridSearchCV(xgb_gs_cr, param_grid=param_gs_cr, scoring='f1', cv=kf, n_jobs=-1)
grid_search_cr.fit(X_overall_fairlearn, y_overall_LR8)

# Accuracy
grid_search_cr_acc = GridSearchCV(xgb_gs_cr, param_grid=param_gs_cr, scoring='accuracy', cv=kf, n_jobs=-1)
grid_search_cr_acc.fit(X_overall_fairlearn, y_overall_LR8)
acc_mean_gs_cr = round(np.mean(grid_search_cr_acc.score(X_overall_fairlearn, y_overall_LR8)) * 100, 2)
print('Mean of the Accuracy:', acc_mean_gs_cr)

# F1-score
F1_mean_gs_cr = round(np.mean(grid_search_cr.score(X_overall_fairlearn, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_gs_cr)

# standard deviation of the F1-score
F1_std_gs_cr = grid_search_cr.cv_results_['std_test_score']
F1_std_gs_cr_rounded = round(F1_std_gs_cr[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_gs_cr_rounded)

# false positive and false negative for Black dataset 
X_black_fairlearn = cr.fit_transform(X_black_LR8)
y_pred_black_gs_cr = cross_val_predict(grid_search_cr, X_black_fairlearn, y_black_LR8, cv=kf)
conf_mat_black_gs_cr = confusion_matrix(y_black_LR8, y_pred_black_gs_cr, normalize='true')
FP_black_gs_cr = round(conf_mat_black_gs_cr[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_gs_cr)
FN_black_gs_cr = round(conf_mat_black_gs_cr[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_gs_cr)

# false positive and false negative for White dataset
X_white_fairlearn = cr.fit_transform(X_white_LR8)
y_pred_white_gs_cr = cross_val_predict(grid_search_cr, X_white_fairlearn, y_white_LR8, cv=kf)
conf_mat_white_gs_cr = confusion_matrix(y_white_LR8, y_pred_white_gs_cr, normalize='true')
FP_white_gs_cr = round(conf_mat_white_gs_cr[0,1] * 100, 2)
print('False Positive for White people:', FP_white_gs_cr)
FN_white_gs_cr = round(conf_mat_white_gs_cr[1,0] * 100, 2)
print('False Negative for White people:', FN_white_gs_cr)


Mean of the Accuracy: 69.98
Mean of the F1-score: 64.95
Standard deviation of the F1-score: 2.77
False Positive for Black people: 34.09
False Negative for Black people: 30.56
False Positive for White people: 17.88
False Negative for White people: 56.73


#### XGBoost with all Features

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer

# XGBoost without any parameters
xgb = XGBClassifier()

# splitting the data
X_overall_LR = df_compas.drop(columns='two_year_recid')
y_overall_LR = df_compas['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_overall_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_overall_LR.select_dtypes(include=['object']).columns

# pipeline to encode the data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scaler', MinMaxScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='-1')),
    ('enocder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# One Hot Encoder uses special character like [, ] or < but they need to be removed 
def clean_feature_names(X):
    X.columns = X.columns.str.replace(r'[\[\]<]', '', regex=True)
    return X

# use it as an additional transformer in the final pipeline
cleaner_transformer = FunctionTransformer(clean_feature_names)

# preprocessor to combine the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
).set_output(transform='pandas')

# add the preprocessor to the model
xgb_process = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('cleaner', cleaner_transformer),
    ('model', xgb)
])

In [25]:
# accuracy
kfscore_accuracy_overall_XGB = cross_val_score(xgb_process, X_overall_LR, y_overall_LR, cv=kf, scoring='accuracy')
accuracy_overall_XGB = round(np.average(kfscore_accuracy_overall_XGB) * 100, 2)
print('Accuracy overall:', accuracy_overall_XGB)

# standard deviation
accuracy_overall_XGB_std = round(np.std(kfscore_accuracy_overall_XGB) * 100, 2)
table_overall_XGB = str(accuracy_overall_XGB) + ' [+/-' + str(accuracy_overall_XGB_std) + ']'

# F1-score
kfscore_f1_overall_XGB = cross_val_score(xgb_process, X_overall_LR, y_overall_LR, cv=kf, scoring='f1')
F1_mean_XGB = round(np.mean(kfscore_f1_overall_XGB) * 100, 2)
print('Mean of the F1-score:', F1_mean_XGB)

# standard deviation of the F1-score
F1_std_XGB = round(np.std(kfscore_f1_overall_XGB) * 100, 2)
print('Standard deviation of the F1-score:', F1_std_XGB)  
table_F1_XGB = str(F1_mean_XGB) + ' [+/-' + str(F1_std_XGB) + ']'

Accuracy overall: 98.96
Mean of the F1-score: 98.85
Standard deviation of the F1-score: 0.43


In [26]:
print('Standard deviation for overall accuracy:', accuracy_overall_XGB_std)

Standard deviation for overall accuracy: 0.39


#### African-American people

In [27]:
# dataset for Black people
df_black_LR = df_compas.loc[df_compas['race'] == "African-American"]

# splitting the data
X_black_LR = df_black_LR.drop(columns='two_year_recid')
y_black_LR = df_black_LR['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_black_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_black_LR.select_dtypes(include=['object']).columns

# accuracy
kfscore_accuracy_black_XGB = cross_val_score(xgb_process, X_black_LR, y_black_LR, cv=kf, scoring='accuracy')
accuracy_black_XGB = round(np.average(kfscore_accuracy_black_XGB) * 100, 2)
print('Accuracy for Black people:', accuracy_black_XGB)

# standard deviation
accuracy_black_XGB_std = round(np.std(kfscore_accuracy_black_XGB) * 100, 2)
table_black_XGB = str(accuracy_black_XGB) + ' [+/-' + str(accuracy_black_XGB_std) + ']'

# false positive and false negative
y_pred_black_XGB = cross_val_predict(xgb_process, X_black_LR, y_black_LR, cv=kf)
conf_mat_black_XGB = confusion_matrix(y_black_LR, y_pred_black_XGB, normalize='true')
FP_black_XGB = round(conf_mat_black_XGB[0,1] * 100, 2)
FN_black_XGB = round(conf_mat_black_XGB[1,0] * 100, 2)
print('False Positive for Black people:', FP_black_XGB)
print('False Negative for Black people:',  FN_black_XGB)

Accuracy for Black people: 98.4
False Positive for Black people: 2.4
False Negative for Black people: 0.84


In [28]:
print('Standard deviation for Black accuracy:', accuracy_black_XGB_std)

Standard deviation for Black accuracy: 0.47


#### Caucasian people

In [29]:
# dataset for White people
df_white_XGB = df_compas.loc[df_compas['race'] == 'Caucasian']

# splitting the data
X_white_XGB = df_white_XGB.drop(columns='two_year_recid')
y_white_XGB = df_white_XGB['two_year_recid']

# sort data into numerical and categorical columns
numerical_cols = X_white_XGB.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_white_XGB.select_dtypes(include=['object']).columns

# accuracy
kfscore_accuracy_white_XGB = cross_val_score(xgb_process, X_white_XGB, y_white_XGB, cv=kf, scoring='accuracy')
accuracy_white_XGB = round(np.average(kfscore_accuracy_white_XGB) * 100, 2)
print('Accuracy for White people:', accuracy_white_XGB)

# standard deviation
accuracy_white_XGB_std = round(np.std(kfscore_accuracy_overall_XGB) * 100, 2)
table_white_XGB = str(accuracy_overall_XGB) + ' [+/-' + str(accuracy_overall_XGB_std) + ']'

# false positive and false negative
y_pred_white_XGB = cross_val_predict(xgb_process, X_white_XGB, y_white_XGB, cv=kf)
conf_mat_white_XGB = confusion_matrix(y_white_XGB, y_pred_white_XGB, normalize='true')
FP_white_XGB = round(conf_mat_white_XGB[0,1] * 100, 2)
FN_white_XGB = round(conf_mat_white_XGB[1,0] * 100, 2)
print('False Positive for White people:', FP_white_XGB)
print('False Negative for White people:', FN_white_XGB)

Accuracy for White people: 99.35
False Positive for White people: 0.81
False Negative for White people: 0.41


In [30]:
print('Standard deviation for White accuracy:',accuracy_white_XGB_std) 

Standard deviation for White accuracy: 0.39


#### Logistic Regression with RandomizedSearch

In [31]:
# beginning parameters for tuning are from the articles in Level Up Coding, Toward Data Science and StackOverflow
# the default solver of the logistic regression is 'lbfgs'. In order to compare it to the previous results with LR this is not changed.
# param = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l2', None],
#     'solver': ['lbfgs'], 
#     'max_iter': [1000, 1500, 2000, 2500, 3000],
# }

# parameters
param_lr_rs = {
    'C': [100],
    'penalty': ['l2'],
    'solver': ['lbfgs'], # the default solver of the logistic regression is 'lbfgs'. In order to compare it to the previous results with LR this is not changed.
    'max_iter': [1000],
}

# model
lr = LogisticRegression()

# random search and tuning
random_search_lr = RandomizedSearchCV(lr, param_distributions=param_lr_rs, scoring='f1', cv=kf, n_iter=1)
random_search_lr.fit(X_overall_LR8, y_overall_LR8)

# F1-score
F1_mean_lr_rs = round(np.mean(random_search_lr.score(X_overall_LR8, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_lr_rs)

# standard deviation of the F1-score
F1_std_lr_rs = random_search_lr.cv_results_['std_test_score']
F1_std_lr_rs_rounded = round(F1_std_lr_rs[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_lr_rs_rounded)

# false positive and false negative for Black dataset
y_pred_black_lr_rs = cross_val_predict(random_search_lr, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_lr_rs = confusion_matrix(y_black_LR8, y_pred_black_lr_rs, normalize='true')
FP_black_lr_rs = round(conf_mat_black_lr_rs[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_lr_rs)
FN_black_lr_rs = round(conf_mat_black_lr_rs[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_lr_rs)

# false positive and false negative for White dataset
y_pred_white_lr_rs = cross_val_predict(random_search_lr, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_lr_rs = confusion_matrix(y_white_LR8, y_pred_white_lr_rs, normalize='true')
FP_white_lr_rs = round(conf_mat_white_lr_rs[0,1] * 100, 2)
print('False Positive for White people:', FP_white_lr_rs)
FN_white_lr_rs = round(conf_mat_white_lr_rs[1,0] * 100, 2)
print('False Negative for White people:', FN_white_lr_rs)

Mean of the F1-score: 60.33
Standard deviation of the F1-score: 2.08
False Positive for Black people: 37.72
False Negative for Black people: 28.67
False Positive for White people: 11.29
False Negative for White people: 65.42


#### Logistic Regression with GridSearch

In [32]:
# beginning parameters for tuning are from the articles in Level Up Coding, Toward Data Science and StackOverflow
# the default solver of the logistic regression is 'lbfgs'. In order to compare it to the previous results with LR this is not changed.
# param = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l2', None],
#     'solver': ['lbfgs'], 
#     'max_iter': [1000, 1500, 2000, 2500, 3000],
# }

# parameters
param_lr_gs = {
    'C': [1000],
    'penalty': ['l2'],
    'solver': ['lbfgs'], # the default solver of the logistic regression is 'lbfgs'. In order to compare it to the previous results with LR this is not changed.
    'max_iter': [1000],
}

# model
lr = LogisticRegression()

# grid search and tuning
grid_search_lr = GridSearchCV(lr, param_grid=param_lr_gs, scoring='f1', cv=kf, n_jobs=-1)
grid_search_lr.fit(X_overall_LR8, y_overall_LR8)

# F1-score
F1_mean_lr_gs = round(np.mean(grid_search_lr.score(X_overall_LR8, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_lr_gs)

# standard deviation of the F1-score
F1_std_lr_gs = grid_search_lr.cv_results_['std_test_score']
F1_std_lr_gs_rounded = round(F1_std_lr_gs[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_lr_gs_rounded)

# false positive and false negative for Black dataset
y_pred_black_lr_gs = cross_val_predict(grid_search_lr, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_lr_gs = confusion_matrix(y_black_LR8, y_pred_black_lr_gs, normalize='true')
FP_black_lr_gs = round(conf_mat_black_lr_gs[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_lr_gs)
FN_black_lr_gs = round(conf_mat_black_lr_gs[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_lr_gs)

# false positive and false negative for White dataset
y_pred_white_lr_gs = cross_val_predict(grid_search_lr, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_lr_gs = confusion_matrix(y_white_LR8, y_pred_white_lr_gs, normalize='true')
FP_white_lr_gs = round(conf_mat_white_lr_gs[0,1] * 100, 2)
print('False Positive for White people:', FP_white_lr_gs)
FN_white_lr_gs = round(conf_mat_white_lr_gs[1,0] * 100, 2)
print('False Negative for White people:', FN_white_lr_gs)

Mean of the F1-score: 60.33
Standard deviation of the F1-score: 2.07
False Positive for Black people: 37.77
False Negative for Black people: 28.67
False Positive for White people: 11.29
False Negative for White people: 65.42


#### Grid Search with Logistic Regression and Fairlearn

In [33]:
from fairlearn.preprocessing import CorrelationRemover

# transforming the data with the correlation remover
cr = CorrelationRemover(sensitive_feature_ids=['race'])
X_overall_fairlearn = cr.fit_transform(X_overall_LR8)

# beginning parameters for tuning are from the articles in Level Up Coding, Toward Data Science and StackOverflow
# the default solver of the logistic regression is 'lbfgs'. In order to compare it to the previous results with LR this is not changed.
# param = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l2', None],
#     'solver': ['lbfgs'], 
#     'max_iter': [1000, 1500, 2000, 2500, 3000],
# }

# parameters
param_lr_gs_cr = {
    'C': [0.1],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [1000],
}

# model
lr_gs_cr = LogisticRegression(**param_lr_gs_cr)

# grid search and tuning
grid_search_lr_cr = GridSearchCV(lr_gs_cr, param_grid=param_lr_gs_cr, scoring='f1', cv=kf, n_jobs=-1)
grid_search_lr_cr.fit(X_overall_fairlearn, y_overall_LR8)

# Accuracy
grid_search_lr_cr_acc = GridSearchCV(lr_gs_cr, param_grid=param_lr_gs_cr, scoring='accuracy', cv=kf, n_jobs=-1)
grid_search_lr_cr_acc.fit(X_overall_fairlearn, y_overall_LR8)
acc_mean_lr_gs_cr = round(np.mean(grid_search_lr_cr_acc.score(X_overall_fairlearn, y_overall_LR8)) * 100, 2)
print('Mean of the Accuracy:', acc_mean_lr_gs_cr)

# F1-score
F1_mean_lr_gs_cr = round(np.mean(grid_search_lr_cr.score(X_overall_fairlearn, y_overall_LR8)) * 100, 2)
print('Mean of the F1-score:', F1_mean_lr_gs_cr)

# standard deviation of the F1-score
F1_std_lr_gs_cr = grid_search_lr_cr.cv_results_['std_test_score']
F1_std_lr_gs_cr_rounded = round(F1_std_lr_gs_cr[0] * 100, 2)
print('Standard deviation of the F1-score:', F1_std_lr_gs_cr_rounded)

# false positive and false negative for Black dataset
X_black_lr_fairlearn = cr.fit_transform(X_black_LR8)
y_pred_black_lr_gs_cr = cross_val_predict(grid_search_lr_cr, X_black_lr_fairlearn, y_black_LR8, cv=kf)
conf_mat_black_lr_gs_cr = confusion_matrix(y_black_LR8, y_pred_black_lr_gs_cr, normalize='true')
FP_black_lr_gs_cr = round(conf_mat_black_lr_gs_cr[0,1] * 100, 2)
print('False Positive for Black people:', FP_black_lr_gs_cr)
FN_black_lr_gs_cr = round(conf_mat_black_lr_gs_cr[1,0] * 100, 2)
print('False Negative for Black people:', FN_black_lr_gs_cr)

# false positive and false negative for White dataset
X_white_lr_fairlearn = cr.fit_transform(X_white_LR8)
y_pred_white_lr_gs_cr = cross_val_predict(grid_search_lr_cr, X_white_lr_fairlearn, y_white_LR8, cv=kf)
conf_mat_white_lr_gs_cr = confusion_matrix(y_white_LR8, y_pred_white_lr_gs_cr, normalize='true')
FP_white_lr_gs_cr = round(conf_mat_white_lr_gs_cr[0,1] * 100, 2)
print('False Positive for White people:', FP_white_lr_gs_cr)
FN_white_lr_gs_cr = round(conf_mat_white_lr_gs_cr[1,0] * 100, 2)
print('False Negative for White people:', FN_white_lr_gs_cr)


Mean of the Accuracy: 65.89
Mean of the F1-score: 57.21
Standard deviation of the F1-score: 2.84
False Positive for Black people: 37.66
False Negative for Black people: 28.88
False Positive for White people: 11.42
False Negative for White people: 65.53


# Statistical Comparison

#### Mann-Whitney U test

In [36]:
from scipy.stats import mannwhitneyu

# create arrays
f1_scores_xgb = []
f1_scores_lr = []

# for each fold append the F1-score
for i in range(kf.get_n_splits()):
    f1_scores_xgb.append(grid_search_cr.cv_results_[f'split{i}_test_score'])
    f1_scores_lr.append(grid_search_lr_cr.cv_results_[f'split{i}_test_score'])

# flattening the list so it is only one array
f1_scores_xgb = [item for sublist in f1_scores_xgb for item in sublist]
f1_scores_lr = [item for sublist in f1_scores_lr for item in sublist]

# mann-whitney u test
statistics_mwu, pvalue_mwu = mannwhitneyu(f1_scores_xgb, f1_scores_lr, alternative='two-sided')

# print results
print("U statistic: ", statistics_mwu)
print("P-value: ", pvalue_mwu)

# interpretation
alpha = 0.05
if pvalue_mwu < alpha:
    print("Reject the null hypothesis: Different distribution.")
else:
    print("Fail to reject the null hypothesis: Same distribution.")

U statistic:  86.0
P-value:  0.007262595901896159
Reject the null hypothesis: Different distribution.


#### ttest_ind_from_stats

In [37]:
from scipy.stats import ttest_ind_from_stats

# ttest_ind_from_stats test
statistics_tt, pvalue_tt = ttest_ind_from_stats(F1_mean_gs_cr, F1_std_gs_cr, 10, F1_mean_lr_gs_cr, F1_std_lr_gs_cr, 10, equal_var=True, alternative='two-sided')

# print results
print("T-statistic: ", statistics_tt)
print("P-value: ", pvalue_tt)

# interpretation
alpha = 0.05
if pvalue_tt < alpha:
    print("Reject the null hypothesis: average values of two samples are not identical.")
else:
    print("Fail to reject the null hypothesis: average values of two samples are identical.")

T-statistic:  [616.74690187]
P-value:  [2.20610556e-40]
Reject the null hypothesis: average values of two samples are not identical.


Sources:

Chen, T., & Guestrin, C. (2016). XGBoost: A Scalable Tree Boosting System. In Proceedings of
the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (pp. 785â€“794). Association for Computing Machinery. https://doi.org/10.1145/2939672.2939785

Dressel, A. & Farid, H. (2018). The accuracy, fairness, and limits of predicting recidivism. Science Advances, 4 (1), eaao5580. https://doi.org/10.1126/sciadv.aao5580

Geeks for Geeks. (2022). How to make a table in Python?. https://www.geeksforgeeks.org/how-to-make-a-table-in-python/

Group, M. (21.05.2023). A Comprehensive Analysis of Hyperparameter Optimization in Logistic Regression Models. Level Up Coding. https://levelup.gitconnected.com/a-comprehensive-analysis-of-hyperparameter-optimization-in-logistic-regression-models-521564c1bfc0\

Harris, C. R., Millman, K. J., van der Walt, S. J., Gommers, R., Virtanen, P., Cournapeau, D., 
Wieser, E., Taylor, J., Berg, S., Smith, N. J., Kern, R., Picus, M., Hoyer, S., van Kerkwijk, M. H., Brett, M., Haldane, A., FernÃ¡ndez del RÃ­o, J., Wiebe, M., Peterson, P., â€¦ Oliphant, T. E. (2020). Array programming with NumPy. Nature, 585, 357-362. https://doi.org/10.1038/s41586-020-2649-2

Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., Blondel, M.,
Prettenhofer, P., Weiss, R., Dubourg, V., Vanderplas, J., Passos, A., Cournapeau, D., Brucher, M., Perrot, M., & Duchesnay, E. (2011). Scikit-learn: Machine Learning in Python. Journal of Machine Learning Research, 12, 2825-2830. https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html

Qiao, F. (08.01.2019). Logistic Regression Model Tuning with scikit-learn â€” Part 1: Comparison of metrics along the model tuning process. Towards Data Science.
https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5

RegenerativeToday. (18.05.2022). Step by Step Tutorial on Logistic Regression in Python | sklearn |Jupyter Notebook [Video]. Youtube. https://www.youtube.com/watch?v=bSXIbCZNBw0

Ryan Nolan Data. (28.08.2023). A Comprehensive Guide to Cross-Validation with Scikit-Learn and Python [Video]. Youtube. https://www.youtube.com/watch?v=glLNo1ZnmPA&list=PLcQVY5V2UY4LNmObS0gqNVyNdVfXnHwu8&index=14

Scikit-Learn. (n.d.). User Guide. https://scikit-learn.org/stable/user_guide.html

Scikit-Learn. (n.d.). API Reference. https://scikit-learn.org/stable/api/index.html

SciPy. (n.d.). User Guide. https://docs.scipy.org/doc/scipy/tutorial/index.html

SciPy. (n.d.). API Reference. https://docs.scipy.org/doc/scipy/reference/index.html

Stackoverflow. (2014). Fine-tuning parameters in Logistic Regression. https://stackoverflow.com/questions/21816346/fine-tuning-parameters-in-logistic-regression

Stackoverflow. (2014). How does the list comprehension to flatten a python list work? [duplicate]. https://stackoverflow.com/questions/25674169/how-does-the-list-comprehension-to-flatten-a-python-list-work

Stackoverflow. (2016). Python's tabulate number of decimal. https://stackoverflow.com/questions/37079957/pythons-tabulate-number-of-decimal

Stackoverflow. (2018). How can I standardize only numeric variables in an sklearn pipeline?. https://stackoverflow.com/questions/48673402/how-can-i-standardize-only-numeric-variables-in-an-sklearn-pipeline

Stackoverflow.(2018). getting the confusion matrix for each cross validation fold. https://stackoverflow.com/questions/49587820/getting-the-confusion-matrix-for-each-cross-validation-fold

The pandas development team. (2024). pandas-dev/pandas: Pandas (v2.2.2). Zenodo.
https://doi.org/10.5281/zenodo.10957263

Velarde, G., Weichert, M., Deshmunkh, A., Deshmane, S., Sudhir, A., Sharma, K. & Joshi, V. (2024). Tree boosting methods for balanced and imbalanced classification and their robustness over time in risk assessment. Intelligent Systems with Applications. 22, 200354. https://doi.org/10.1016/j.iswa.2024.200354

Virtanen, P., Gommers, R., Oliphant, T. E., Haberland, M., Reddy, T., Cournapeau, D., Burovski,
E., Peterson, P., Weckesser, W., Bright, J., van der Walt, S. J., Brett, M., Wilson, J., Millman, K. J., Mayorov, N., Nelson, A. R. J., Jones, E., Kern, R., Larson, E., Carey, CJ, â€¦ SciPy 1.0 Contributors (2020). SciPy 1.0: Fundamental Algorithms for Scientific Computing in Python. Nature Methods, 17(3), 261-272. https://doi.org/10.1038/s41592-019-0686-2

Weerts, H., DudÃ­k, M., Edgar, R., Jalali, A., Lutz, R. & Madaio, M. (2023). Fairlearn: Assessing and Improving Fairness of AI Systems. Journal of Machine Learning Research, 24 (257), 1-8. http://jmlr.org/papers/v24/23-0389.html

Weerts, H. (19.06.2024). An Introduction to Responsible Machine Learning. GitHub. https://hildeweerts.github.io/responsiblemachinelearning/index.html