# Recreation

## Logistic Regression with 7 Features

In [41]:
# loading data and observations

import pandas as pd
import numpy as np

df_compas = pd.read_csv('compas-scores-two-years.csv')

df_compas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

In [42]:
df_compas.head(6)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
5,7,marsha miles,marsha,miles,2013-11-30,Male,1971-08-22,44,25 - 45,Other,...,1,Low,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0


In [43]:
print(df_compas.columns)

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


In [44]:
# removing unused columns

df_LR7 = df_compas.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# transforming string values into numerical

df_LR7['sex'] = df_LR7['sex'].astype('category')
df_LR7['sex'] = df_LR7['sex'].cat.codes

df_LR7['c_charge_degree'] = df_LR7['c_charge_degree'].astype('category')
df_LR7['c_charge_degree'] = df_LR7['c_charge_degree'].cat.codes

df_LR7['c_charge_desc'] = df_LR7['c_charge_desc'].astype('category')
df_LR7['c_charge_desc'] = df_LR7['c_charge_desc'].cat.codes

In [45]:
# null values

df_LR7.isnull().sum()

sex                0
age                0
juv_fel_count      0
juv_misd_count     0
priors_count       0
c_charge_degree    0
c_charge_desc      0
two_year_recid     0
dtype: int64

In [46]:
# splitting of the dataset in features and the variable to predict

X_overall_LR7 = df_LR7.drop(columns='two_year_recid')
y_overall_LR7 = df_LR7['two_year_recid']

#### Overall

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=1000)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=28)

# accuracy
kfscore_accuracy_overall_LR7 = cross_val_score(lr, X_overall_LR7, y_overall_LR7, cv=kf, scoring='accuracy')
accuracy_overall_LR7 = round(np.average(kfscore_accuracy_overall_LR7) * 100, 2)
print(accuracy_overall_LR7)

accuracy_overall_LR7_std = round(np.std(kfscore_accuracy_overall_LR7) * 100, 2)

table_overall_LR7 = str(accuracy_overall_LR7) + ' [+/-' + str(accuracy_overall_LR7_std) + ']'

# F1-score
kfscore_f1_overall_LR7 = cross_val_score(lr, X_overall_LR7, y_overall_LR7, cv=kf, scoring='f1')
F1_mean_LR7 = round(np.mean(kfscore_f1_overall_LR7), 2)
print(F1_mean_LR7)

F1_std_LR7 = round(np.std(kfscore_f1_overall_LR7), 2)
print(F1_std_LR7)

table_F1_LR7 = str(F1_mean_LR7) + ' [+/-' + str(F1_std_LR7) + ']'


67.58
0.6
0.02


#### African-American people

In [48]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

df_black_LR7 = df_compas.loc[df_compas['race'] == "African-American"]
df_black_LR7 = df_black_LR7.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

df_black_LR7['sex'] = df_black_LR7['sex'].astype('category')
df_black_LR7['sex'] = df_black_LR7['sex'].cat.codes
df_black_LR7['c_charge_degree'] = df_black_LR7['c_charge_degree'].astype('category')
df_black_LR7['c_charge_degree'] = df_black_LR7['c_charge_degree'].cat.codes
df_black_LR7['c_charge_desc'] = df_black_LR7['c_charge_desc'].astype('category')
df_black_LR7['c_charge_desc'] = df_black_LR7['c_charge_desc'].cat.codes

X_black_LR7 = df_black_LR7.drop(columns='two_year_recid')
y_black_LR7 = df_black_LR7['two_year_recid']

kfscore_accuracy_black_LR7 = cross_val_score(lr, X_black_LR7, y_black_LR7, cv=kf, scoring='accuracy')

accuracy_black_LR7 = round(np.average(kfscore_accuracy_black_LR7) * 100, 2)
print(accuracy_black_LR7) 

accuracy_black_LR7_std = round(np.std(kfscore_accuracy_black_LR7) * 100, 2)

table_black_LR7 = str(accuracy_black_LR7) + ' [+/-' + str(accuracy_black_LR7_std) + ']'

y_pred_black_LR7 = cross_val_predict(lr, X_black_LR7, y_black_LR7, cv=kf)
conf_mat_black_LR7 = confusion_matrix(y_black_LR7, y_pred_black_LR7, normalize='true')

FP_black_LR7 = round(conf_mat_black_LR7[0,1] * 100, 2)
FN_black_LR7 = round(conf_mat_black_LR7[1,0] * 100, 2)

print(FP_black_LR7)
print(FN_black_LR7)

66.88
37.77
28.72


#### Caucasian people

In [49]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

df_white_LR7 = df_compas.loc[df_compas['race'] == 'Caucasian']
df_white_LR7 = df_white_LR7.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'race', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

df_white_LR7['sex'] = df_white_LR7['sex'].astype('category')
df_white_LR7['sex'] = df_white_LR7['sex'].cat.codes
df_white_LR7['c_charge_degree'] = df_white_LR7['c_charge_degree'].astype('category')
df_white_LR7['c_charge_degree'] = df_white_LR7['c_charge_degree'].cat.codes
df_white_LR7['c_charge_desc'] = df_white_LR7['c_charge_desc'].astype('category')
df_white_LR7['c_charge_desc'] = df_white_LR7['c_charge_desc'].cat.codes

X_white_LR7 = df_white_LR7.drop(columns='two_year_recid')
y_white_LR7 = df_white_LR7['two_year_recid']

kfscore_accuracy_white_LR7 = cross_val_score(lr, X_white_LR7, y_white_LR7, cv=kf, scoring='accuracy')

accuracy_white_LR7 = round(np.average(kfscore_accuracy_white_LR7) * 100, 2)
print(accuracy_white_LR7)

accuracy_white_LR7_std = round(np.std(kfscore_accuracy_overall_LR7) * 100, 2)

table_white_LR7 = str(accuracy_overall_LR7) + ' [+/-' + str(accuracy_overall_LR7_std) + ']'

y_pred_white_LR7 = cross_val_predict(lr, X_white_LR7, y_white_LR7, cv=kf)
conf_mat_white_LR7 = confusion_matrix(y_white_LR7, y_pred_white_LR7, normalize='true')

FP_white_LR7 = round(conf_mat_white_LR7[0,1] * 100, 2)
FN_white_LR7 = round(conf_mat_white_LR7[1,0] * 100, 2)

print(FP_white_LR7)
print(FN_white_LR7)

67.4
11.29
65.42


## Logistic Regression with 2 Features

In [50]:
# removing unused columns

df_LR2 = df_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

# remove all features except age and the total number of previous convictions

df_LR2['total_number'] = df_LR2['juv_fel_count'] + df_LR2['juv_misd_count'] + df_LR2['priors_count']
df_LR2 = df_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

X_overall_LR2 = df_LR2.drop(columns='two_year_recid')
y_overall_LR2 = df_LR2['two_year_recid']

#### Overall

In [51]:
#accuracy
kfscore_accuracy_overall_LR2 = cross_val_score(lr, X_overall_LR2, y_overall_LR2, cv=kf, scoring='accuracy')
accuracy_overall_LR2 = round(np.average(kfscore_accuracy_overall_LR2) * 100, 2)
print(accuracy_overall_LR2)

accuracy_overall_LR2_std = round(np.std(kfscore_accuracy_overall_LR2) * 100, 2)

table_overall_LR2 = str(accuracy_overall_LR2) + ' [+/-' + str(accuracy_overall_LR2_std) + ']'

# F1-score
kfscore_f1_overall_LR2 = cross_val_score(lr, X_overall_LR2, y_overall_LR2, cv=kf, scoring='f1')
F1_mean_LR2 = round(np.mean(kfscore_f1_overall_LR2), 2)
print(F1_mean_LR2)

F1_std_LR2 = round(np.std(kfscore_f1_overall_LR2), 2)
table_F1_LR2 = str(F1_mean_LR2) + ' [+/-' + str(F1_std_LR2) + ']'

67.51
0.58


#### African-American people

In [52]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

df_black_LR2 = df_black_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

df_black_LR2['total_number'] = df_black_LR2['juv_fel_count'] + df_black_LR2['juv_misd_count'] + df_black_LR2['priors_count']
df_black_LR2 = df_black_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

X_black_LR2 = df_black_LR2.drop(columns='two_year_recid')
y_black_LR2 = df_black_LR2['two_year_recid']

kfscore_accuracy_black_LR2 = cross_val_score(lr, X_black_LR2, y_black_LR2, cv=kf, scoring='accuracy')
accuracy_black_LR2 = round(np.average(kfscore_accuracy_black_LR2) * 100, 2)
print(accuracy_black_LR2)

accuracy_black_LR2_std = round(np.std(kfscore_accuracy_black_LR2) * 100, 2)

table_black_LR2 = str(accuracy_black_LR2) + ' [+/-' + str(accuracy_black_LR2_std) + ']'

y_pred_black_LR2 = cross_val_predict(lr, X_black_LR2, y_black_LR2, cv=10)
conf_mat_black_LR2 = confusion_matrix(y_black_LR2, y_pred_black_LR2, normalize='true')

FP_black_LR2 = round(conf_mat_black_LR2[0,1] * 100, 2)
FN_black_LR2 = round(conf_mat_black_LR2[1,0] * 100, 2)

print(FP_black_LR2)
print(FN_black_LR2)

67.56
35.71
28.93


#### Caucasian people

In [53]:
df_white_LR2 = df_white_LR7.drop(columns=['sex', 'c_charge_degree', 'c_charge_desc'])

df_white_LR2['total_number'] = df_white_LR2['juv_fel_count'] + df_white_LR2['juv_misd_count'] + df_white_LR2['priors_count']
df_white_LR2 = df_white_LR2.drop(columns=['juv_fel_count', 'juv_misd_count', 'priors_count'])

X_white_LR2 = df_white_LR2.drop(columns='two_year_recid')
y_white_LR2 = df_white_LR2['two_year_recid']

kfscore_accuracy_white_LR2 = cross_val_score(lr, X_white_LR2, y_white_LR2, cv=kf, scoring='accuracy')
accuracy_white_LR2 = round(np.average(kfscore_accuracy_white_LR2) * 100, 2)
print(accuracy_white_LR2)

accuracy_white_LR2_std = round(np.std(kfscore_accuracy_overall_LR2) * 100, 2)

table_white_LR2 = str(accuracy_white_LR2) + ' [+/-' + str(accuracy_white_LR2_std) + ']'

y_pred_white_LR2 = cross_val_predict(lr, X_white_LR2, y_white_LR2, cv=10)
conf_mat_white_LR2 = confusion_matrix(y_white_LR2, y_pred_white_LR2, normalize='true')

FP_white_LR2 = round(conf_mat_white_LR2[0,1] * 100, 2)
FN_white_LR2 = round(conf_mat_white_LR2[1,0] * 100, 2)

print(FP_white_LR2)
print(FN_white_LR2)

67.97
10.48
65.32


## Nonlinear Support Vector Machine with radial basis kernel and 7 Features

In [54]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf')

# accuracy
kfscore_accuracy_overall_SVM = cross_val_score(svm, X_overall_LR7, y_overall_LR7, cv=kf, scoring='accuracy')
accuracy_overall_SVM = round(np.average(kfscore_accuracy_overall_SVM) * 100, 2)
print(accuracy_overall_SVM)

#standard deviation
accuracy_overall_SVM_std = round(np.std(kfscore_accuracy_overall_SVM) * 100, 2)

table_overall_SVM = str(accuracy_overall_SVM) + ' [+/-' + str(accuracy_overall_SVM_std) + ']'

# F1-score
kfscore_f1_overall_SVM = cross_val_score(svm, X_overall_LR7, y_overall_LR7, cv=kf, scoring='f1')
F1_mean_SVM = round(np.mean(kfscore_f1_overall_SVM), 2)
print(F1_mean_SVM)

F1_std_SVM = round(np.std(kfscore_f1_overall_SVM), 2)
table_F1_SVM = str(F1_mean_SVM) + ' [+/-' + str(F1_std_SVM) + ']'

# accuracy black
kfscore_accuracy_black_SVM = cross_val_score(svm, X_black_LR7, y_black_LR7, cv=kf, scoring='accuracy')

accuracy_black_SVM = round(np.average(kfscore_accuracy_black_SVM) * 100, 2)
print(accuracy_black_SVM)

#standard deviation black
accuracy_black_SVM_std = round(np.std(kfscore_accuracy_black_SVM) * 100, 2)

table_black_SVM = str(accuracy_black_SVM) + ' [+/-' + str(accuracy_black_SVM_std) + ']'

# accuracy white
kfscore_accuracy_white_SVM = cross_val_score(svm, X_white_LR7, y_white_LR7, cv=kf, scoring='accuracy')

accuracy_white_SVM = round(np.average(kfscore_accuracy_white_SVM) * 100, 2)
print(accuracy_white_SVM)

#standard deviation white
accuracy_white_SVM_std = round(np.std(kfscore_accuracy_white_SVM) * 100, 2)

table_white_SVM = str(accuracy_white_SVM) + ' [+/-' + str(accuracy_white_SVM_std) + ']'

# false positive and negative black
y_pred_black_SVM = cross_val_predict(svm, X_black_LR7, y_black_LR7, cv=kf)
conf_mat_black_SVM = confusion_matrix(y_black_LR7, y_pred_black_LR7, normalize='true')

FP_black_SVM = round(conf_mat_black_SVM[0,1] * 100, 2)
FN_black_SVM = round(conf_mat_black_SVM[1,0] * 100, 2)

print(FP_black_SVM)
print(FN_black_SVM)

# false positive and negative white
y_pred_white_SVM = cross_val_predict(svm, X_white_LR7, y_white_LR7, cv=kf)
conf_mat_white_SVM = confusion_matrix(y_white_LR7, y_pred_white_LR7, normalize='true')

FP_white_SVM = round(conf_mat_white_SVM[0,1] * 100, 2)
FN_white_SVM = round(conf_mat_white_SVM[1,0] * 100, 2)

print(FP_white_SVM)
print(FN_white_SVM)

62.84
0.5
61.2
61.74
37.77
28.72
11.29
65.42


## Logistic Regression with 8 features (including race)

#### Overall

In [55]:
df_LR8 = df_compas.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

# transforming string values into numerical

df_LR8['sex'] = df_LR8['sex'].astype('category')
df_LR8['sex'] = df_LR8['sex'].cat.codes

df_LR8['c_charge_degree'] = df_LR8['c_charge_degree'].astype('category')
df_LR8['c_charge_degree'] = df_LR8['c_charge_degree'].cat.codes

df_LR8['c_charge_desc'] = df_LR8['c_charge_desc'].astype('category')
df_LR8['c_charge_desc'] = df_LR8['c_charge_desc'].cat.codes

df_LR8['race'] = df_LR8['race'].astype('category')
df_LR8['race'] = df_LR8['race'].cat.codes


# splitting of the dataset in features and the variable to predict

X_overall_LR8 = df_LR8.drop(columns='two_year_recid')
y_overall_LR8 = df_LR8['two_year_recid']

# accuracy
kfscore_accuracy_overall_LR8 = cross_val_score(lr, X_overall_LR8, y_overall_LR8, cv=kf, scoring='accuracy')
accuracy_overall_LR8 = round(np.average(kfscore_accuracy_overall_LR8) * 100, 2)
print(accuracy_overall_LR8)

accuracy_overall_LR8_std = round(np.std(kfscore_accuracy_overall_LR8) * 100, 2)

table_overall_LR8 = str(accuracy_overall_LR8) + ' [+/-' + str(accuracy_overall_LR8_std) + ']'

# F1-score
kfscore_f1_overall_LR8 = cross_val_score(lr, X_overall_LR8, y_overall_LR8, cv=kf, scoring='f1')
F1_mean_LR8 = round(np.mean(kfscore_f1_overall_LR8), 2)
print(F1_mean_LR8)

F1_std_LR8 = round(np.std(kfscore_f1_overall_LR8), 2)
print(F1_std_LR8)

table_F1_LR8 = str(F1_mean_LR8) + ' [+/-' + str(F1_std_LR8) + ']'


67.74
0.6
0.02


#### African-American people

In [56]:
df_black_LR8 = df_compas.loc[df_compas['race'] == "African-American"]
df_black_LR8 = df_black_LR8.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

df_black_LR8['sex'] = df_black_LR8['sex'].astype('category')
df_black_LR8['sex'] = df_black_LR8['sex'].cat.codes

df_black_LR8['c_charge_degree'] = df_black_LR8['c_charge_degree'].astype('category')
df_black_LR8['c_charge_degree'] = df_black_LR8['c_charge_degree'].cat.codes

df_black_LR8['c_charge_desc'] = df_black_LR8['c_charge_desc'].astype('category')
df_black_LR8['c_charge_desc'] = df_black_LR8['c_charge_desc'].cat.codes

df_black_LR8['race'] = df_black_LR8['race'].astype('category')
df_black_LR8['race'] = df_black_LR8['race'].cat.codes


X_black_LR8 = df_black_LR8.drop(columns='two_year_recid')
y_black_LR8 = df_black_LR8['two_year_recid']

kfscore_accuracy_black_LR8 = cross_val_score(lr, X_black_LR8, y_black_LR8, cv=kf, scoring='accuracy')

accuracy_black_LR8 = round(np.average(kfscore_accuracy_black_LR8) * 100, 2)
print(accuracy_black_LR8) 

accuracy_black_LR8_std = round(np.std(kfscore_accuracy_black_LR8) * 100, 2)

table_black_LR8 = str(accuracy_black_LR8) + ' [+/-' + str(accuracy_black_LR8_std) + ']'

y_pred_black_LR8 = cross_val_predict(lr, X_black_LR8, y_black_LR8, cv=kf)
conf_mat_black_LR8 = confusion_matrix(y_black_LR8, y_pred_black_LR8, normalize='true')

FP_black_LR8 = round(conf_mat_black_LR8[0,1] * 100, 2)
FN_black_LR8 = round(conf_mat_black_LR8[1,0] * 100, 2)

print(FP_black_LR8)
print(FN_black_LR8)

66.91
37.77
28.67


#### Caucasian people

In [57]:
df_white_LR8 = df_compas.loc[df_compas['race'] == 'Caucasian']
df_white_LR8 = df_white_LR8.drop(columns=['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
       'age_cat', 'decile_score', 'juv_other_count', 
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event'])

df_white_LR8['sex'] = df_white_LR8['sex'].astype('category')
df_white_LR8['sex'] = df_white_LR8['sex'].cat.codes

df_white_LR8['c_charge_degree'] = df_white_LR8['c_charge_degree'].astype('category')
df_white_LR8['c_charge_degree'] = df_white_LR8['c_charge_degree'].cat.codes

df_white_LR8['c_charge_desc'] = df_white_LR8['c_charge_desc'].astype('category')
df_white_LR8['c_charge_desc'] = df_white_LR8['c_charge_desc'].cat.codes

df_white_LR8['race'] = df_white_LR8['race'].astype('category')
df_white_LR8['race'] = df_white_LR8['race'].cat.codes

X_white_LR8 = df_white_LR8.drop(columns='two_year_recid')
y_white_LR8 = df_white_LR8['two_year_recid']

kfscore_accuracy_white_LR8 = cross_val_score(lr, X_white_LR8, y_white_LR8, cv=kf, scoring='accuracy')

accuracy_white_LR8 = round(np.average(kfscore_accuracy_white_LR8) * 100, 2)
print(accuracy_white_LR8)

accuracy_white_LR8_std = round(np.std(kfscore_accuracy_overall_LR8) * 100, 2)

table_white_LR8 = str(accuracy_overall_LR8) + ' [+/-' + str(accuracy_overall_LR8_std) + ']'

y_pred_white_LR8 = cross_val_predict(lr, X_white_LR8, y_white_LR8, cv=kf)
conf_mat_white_LR8 = confusion_matrix(y_white_LR8, y_pred_white_LR8, normalize='true')

FP_white_LR8 = round(conf_mat_white_LR8[0,1] * 100, 2)
FN_white_LR8 = round(conf_mat_white_LR8[1,0] * 100, 2)

print(FP_white_LR8)
print(FN_white_LR8)

67.4
11.29
65.42


## Logistic Regression with all features

#### Overall

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder


X_overall_LR = df_compas.drop(columns='two_year_recid')
y_overall_LR = df_compas['two_year_recid']

numerical_cols = X_overall_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_overall_LR.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='-1')),
    ('enocder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
).set_output(transform='pandas')

lr_process = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lr)
])


In [59]:
# accuracy
kfscore_accuracy_overall_LR = cross_val_score(lr_process, X_overall_LR, y_overall_LR, cv=kf, scoring='accuracy')
accuracy_overall_LR = round(np.average(kfscore_accuracy_overall_LR) * 100, 2)
print(accuracy_overall_LR)

accuracy_overall_LR_std = round(np.std(kfscore_accuracy_overall_LR) * 100, 2)

table_overall_LR = str(accuracy_overall_LR) + ' [+/-' + str(accuracy_overall_LR_std) + ']'

# F1-score
kfscore_f1_overall_LR = cross_val_score(lr_process, X_overall_LR, y_overall_LR, cv=kf, scoring='f1')
F1_mean_LR = round(np.mean(kfscore_f1_overall_LR), 2)
print(F1_mean_LR)

F1_std_LR = round(np.std(kfscore_f1_overall_LR), 2)
print(F1_std_LR)

table_F1_LR = str(F1_mean_LR) + ' [+/-' + str(F1_std_LR) + ']'

97.6
0.97
0.01


#### African-American people

In [60]:
df_black_LR = df_compas.loc[df_compas['race'] == "African-American"]

X_black_LR = df_black_LR.drop(columns='two_year_recid')
y_black_LR = df_black_LR['two_year_recid']

numerical_cols = X_black_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_black_LR.select_dtypes(include=['object']).columns

kfscore_accuracy_black_LR = cross_val_score(lr_process, X_black_LR, y_black_LR, cv=kf, scoring='accuracy')

accuracy_black_LR = round(np.average(kfscore_accuracy_black_LR) * 100, 2)
print(accuracy_black_LR) 

accuracy_black_LR_std = round(np.std(kfscore_accuracy_black_LR) * 100, 2)

table_black_LR = str(accuracy_black_LR) + ' [+/-' + str(accuracy_black_LR_std) + ']'

y_pred_black_LR = cross_val_predict(lr_process, X_black_LR, y_black_LR, cv=kf)
conf_mat_black_LR = confusion_matrix(y_black_LR, y_pred_black_LR, normalize='true')

FP_black_LR = round(conf_mat_black_LR[0,1] * 100, 2)
FN_black_LR = round(conf_mat_black_LR[1,0] * 100, 2)

print(FP_black_LR)
print(FN_black_LR)

96.78
6.35
0.26


#### Caucasian people

In [61]:
df_white_LR = df_compas.loc[df_compas['race'] == 'Caucasian']

X_white_LR = df_white_LR.drop(columns='two_year_recid')
y_white_LR = df_white_LR['two_year_recid']

numerical_cols = X_white_LR.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_white_LR.select_dtypes(include=['object']).columns

kfscore_accuracy_white_LR = cross_val_score(lr_process, X_white_LR, y_white_LR, cv=kf, scoring='accuracy')

accuracy_white_LR = round(np.average(kfscore_accuracy_white_LR) * 100, 2)
print(accuracy_white_LR)

accuracy_white_LR_std = round(np.std(kfscore_accuracy_overall_LR) * 100, 2)

table_white_LR = str(accuracy_overall_LR) + ' [+/-' + str(accuracy_overall_LR_std) + ']'

y_pred_white_LR = cross_val_predict(lr_process, X_white_LR, y_white_LR, cv=kf)
conf_mat_white_LR = confusion_matrix(y_white_LR, y_pred_white_LR, normalize='true')

FP_white_LR = round(conf_mat_white_LR[0,1] * 100, 2)
FN_white_LR = round(conf_mat_white_LR[1,0] * 100, 2)

print(FP_white_LR)
print(FN_white_LR)

97.68
3.83
0.0


#### 

## Percentages within the data

In [62]:
# black people
percentage_black = round(len(df_black_LR7) / len(df_compas) * 100, 2)
print("Percentage of black people: " + str(percentage_black) + "%")

# white people
percentage_white = round(len(df_white_LR7) / len(df_compas) * 100, 2)
print("Percentage of white people: " + str(percentage_white) + "%")

# female
df_female = df_compas.loc[df_compas['sex'] == "Female"]
percentage_female = round(len(df_female) / len(df_compas) * 100, 2)
print("Percentage of female people: " + str(percentage_female) + "%")

# male
df_male = df_compas.loc[df_compas['sex'] == "Male"]
percentage_male = round(len(df_male) / len(df_compas) * 100, 2)
print("Percentage of male people: " + str(percentage_male) + "%")


Percentage of black people: 51.23%
Percentage of white people: 34.02%
Percentage of female people: 19.34%
Percentage of male people: 80.66%


## Mann-Whitney U rank test with LR7 and LR2

In [63]:
from scipy.stats import mannwhitneyu

U,p = mannwhitneyu(kfscore_f1_overall_LR7, kfscore_f1_overall_LR2, use_continuity=True, alternative='two-sided', axis=0, method='auto', nan_policy='propagate', keepdims=False)

print('U = ' + str(U))
print('p = ' + str(p))
# Da der p-Wert nicht weniger als 0,05 ist, kann die Nullhypothese nicht abgelehnt werden. 
# Also gibt es keinen Beweis, dass die F1-Werte in den Gruppen unterschiedlich sind.

U = 63.0
p = 0.3447042220069576


## Recreation of Table 2 in Dressel and Farids (2018) paper

In [64]:
from tabulate import tabulate

header = ["Metric", "LR7", "LR2", "NL-SVM", "COMPAS", "LR8", "LR"]

data = [
    ["Accuracy (overall)", table_overall_LR7, table_overall_LR2, table_overall_SVM, "65.4", table_overall_LR8, table_overall_LR],
    ["Accuracy (black)", table_black_LR7, table_black_LR2, table_overall_SVM, "63.8", table_black_LR8, table_black_LR],
    ["Accuracy (white)", table_white_LR7, table_white_LR2, table_white_SVM, "67.0", table_white_LR8, table_white_LR],
    ["False Positive (black)", FP_black_LR7, FP_black_LR2, FP_black_SVM, "44.8", FP_black_LR8, FP_black_LR],
    ["False Positive (white)", FP_white_LR7, FP_white_LR2, FP_white_SVM, "23.5", FP_white_LR8, FP_white_LR],
    ["False Negative (black)", FN_black_LR7, FN_black_LR2, FN_black_SVM, "28.0", FN_black_LR8, FN_black_LR],
    ["False Negative (white)", FN_white_LR7, FN_white_LR2, FN_white_SVM, "47.7", FN_white_LR8, FN_white_LR],
    ["Mean (F1)", table_F1_LR7, table_F1_LR2, table_F1_SVM, "-", table_F1_LR8, table_F1_LR ],
]

print(tabulate(data, headers=header, tablefmt="fancy_grid", floatfmt=".2f"))

╒════════════════════════╤═════════════════╤═════════════════╤═════════════════╤══════════╤═════════════════╤═════════════════╕
│ Metric                 │ LR7             │ LR2             │ NL-SVM          │ COMPAS   │ LR8             │ LR              │
╞════════════════════════╪═════════════════╪═════════════════╪═════════════════╪══════════╪═════════════════╪═════════════════╡
│ Accuracy (overall)     │ 67.58 [+/-1.62] │ 67.51 [+/-2.14] │ 62.84 [+/-1.4]  │ 65.4     │ 67.74 [+/-1.88] │ 97.6 [+/-0.64]  │
├────────────────────────┼─────────────────┼─────────────────┼─────────────────┼──────────┼─────────────────┼─────────────────┤
│ Accuracy (black)       │ 66.88 [+/-1.81] │ 67.56 [+/-1.62] │ 62.84 [+/-1.4]  │ 63.8     │ 66.91 [+/-1.87] │ 96.78 [+/-0.61] │
├────────────────────────┼─────────────────┼─────────────────┼─────────────────┼──────────┼─────────────────┼─────────────────┤
│ Accuracy (white)       │ 67.58 [+/-1.62] │ 67.97 [+/-2.14] │ 61.74 [+/-0.95] │ 67.0     │ 67.74 [+/-1.

## Bias-Mitigation to improve Fairness

#### Vanilla XG Boost


In [65]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# the chosen parameters originate from the paper about Tree Boosting Methods

# LR7 as comparison
table_F1_LR7 = str(round(np.mean(kfscore_f1_overall_LR7), 4)) + ' [+/-' + str(F1_std_LR7) + ']'

# Vanilla F1-Score

params = {
    'max_depth': 6,
    'learning_rate': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
}

xgb = XGBClassifier(**params)

xgb_cv = cross_val_score(xgb, X_overall_LR7, y_overall_LR7, cv=kf, scoring='f1')
F1_mean_vanilla = round(np.mean(xgb_cv), 4)
print(F1_mean_vanilla)
F1_std_vanilla = round(np.std(xgb_cv), 4)
print(F1_std_vanilla)
table_F1_vanilla = str(F1_mean_vanilla) + ' [+/-' + str(F1_std_vanilla) + ']'

0.6064
0.0236


#### XG Boost with RandomizedSearch


In [66]:
# Random Search
param_rs = {
    'max_depth': [3],
    'learning_rate': [0.02],
    'subsample': [1],
    'colsample_bytree': [0.4],
    'n_estimators': [1000],
}

xgb_rs = XGBClassifier(**param_rs)

# Results RandomizedSearch: {'subsample': 1, 'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.02, 'colsample_bytree': 0.4}
# table_F1_tuned_RS = 0.6262595894157256


random_search = RandomizedSearchCV(xgb_rs, param_distributions=param_rs, scoring='f1', cv=kf)
random_search.fit(X_overall_LR7, y_overall_LR7)

F1_mean_rs = round(np.mean(random_search.best_score_), 4)
print(F1_mean_rs)
F1_std_rs = round(np.std(random_search.best_score_), 4)
print(F1_std_rs)
table_F1_rs = str(F1_mean_rs) + ' [+/-' + str(F1_std_rs) + ']'



0.6263
0.0


#### XG Boost with GridSearch

In [67]:
param_gs = {
     'max_depth': [3],
    'learning_rate': [0.1],
    'subsample': [0.4],
    'colsample_bytree': [1],
    'n_estimators': [100],
}

xgb_gs = XGBClassifier(**param_gs)

# Results GridSearch: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.4}
# table_F1_tuned_RS = 0.6321110864467466

grid_search = GridSearchCV(xgb_gs, param_grid=param_gs, scoring='f1', cv=kf, n_jobs=-1)
grid_search.fit(X_overall_LR7, y_overall_LR7)

F1_mean_gs = round(np.mean(grid_search.best_score_), 4)
print(F1_mean_gs)
F1_std_gs = round(np.std(grid_search.best_score_), 4)
print(F1_std_gs)
table_F1_gs = str(F1_mean_gs) + ' [+/-' + str(F1_std_gs) + ']'

0.6321
0.0


#### Fairlearn

In [68]:
from fairlearn.preprocessing import CorrelationRemover

param_fairlearn_gs = {
    'max_depth': [3],
    'learning_rate': [0.02],
    'subsample': [0.8],
    'colsample_bytree': [0.4],
    'n_estimators': [1000],
}

xgb_gs = XGBClassifier(**param_fairlearn_gs)

cr = CorrelationRemover(sensitive_feature_ids=['race'])
cr.fit(X_overall_LR8)
X_overall_fairlearn = cr.transform(X_overall_LR8)

fairlearn_gs = GridSearchCV(xgb_gs, param_grid=param_fairlearn_gs, scoring='f1', cv=kf, n_jobs=-1)
fairlearn_gs.fit(X_overall_fairlearn, y_overall_LR7)
# {'colsample_bytree': 0.4, 'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.8}
# 0.6240178246633066

F1_mean_fairlearn_gs = round(np.mean(fairlearn_gs.best_score_), 4)
print(F1_mean_fairlearn_gs)
F1_std_fairlearn_gs = round(np.std(fairlearn_gs.best_score_), 4)
print(F1_std_fairlearn_gs)
table_F1_fairlearn_gs = str(F1_mean_fairlearn_gs) + ' [+/-' + str(F1_std_fairlearn_gs) + ']'

0.624
0.0


In [69]:
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.metrics import f1_score

param_gs_threshold_opt = {
    'max_depth': [3],
    'learning_rate': [0.02],
    'subsample': [1],
    'colsample_bytree': [0.6],
    'n_estimators': [5000],
}

xgb_gs = XGBClassifier(**param_gs_threshold_opt)

grid_search = GridSearchCV(xgb_gs, param_grid=param_gs_threshold_opt, scoring='f1', cv=kf, n_jobs=-1)
grid_search.fit(X_overall_LR8, y_overall_LR7)

best_model = grid_search.best_estimator_

# {'colsample_bytree': 0.6, 'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 1}
# 0.6522

threshold_optimizer = ThresholdOptimizer(
    estimator=best_model,
    constraints="equalized_odds",
    predict_method="predict_proba",
    objective='accuracy_score',
)

threshold_optimizer.fit(X_overall_LR8, y_overall_LR7, sensitive_features=X_overall_LR8['race'])

y_pred = threshold_optimizer.predict(X_overall_LR8, sensitive_features=X_overall_LR8['race'], random_state=18)

table_F1_fairlearn_threshold_opt = f1_score(y_overall_LR7, y_pred)
table_F1_fairlearn_threshold_opt = round(table_F1_fairlearn_threshold_opt, 4)
print(table_F1_fairlearn_threshold_opt)

0.6565


  positive_probs[sensitive_feature_vector == a] = interpolated_predictions[


#### AI Fairness 360

In [71]:
from aif360.algorithms.preprocessing import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions import get_distortion_compas
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from sklearn.pipeline import Pipeline

data = X_overall_LR8.copy()
data['label'] = y_overall_LR7
data['race'] = X_overall_LR8['race']

# {0: 'African-American', 1: 'Asian', 2: 'Caucasian', 3: 'Hispanic', 4: 'Native American', 5: 'Other'}
privileged_groups = [{'race': 2}]
unprivileged_groups =  [{'race': 0}, {'race': 1}, {'race': 3}, {'race': 4}, {'race': 5}]

dataset = BinaryLabelDataset(df=data, label_names=['label'], protected_attribute_names=['race'])

optim_options = {
    "distortion_fun": get_distortion_compas,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0]

}
OP = OptimPreproc(OptTools, optim_options)

dataset_transf = OP.fit_transform(dataset)

X_overall_LR8_transf = dataset_transf.features
y_overall_LR7_transf = dataset_transf.labels.ravel()

param_gs_threshold_opt = {
    'max_depth': [3],
    'learning_rate': [0.02],
    'subsample': [1],
    'colsample_bytree': [0.6],
    'n_estimators': [5000],
}

xgb_gs = XGBClassifier(**param_gs_threshold_opt)

grid_search = GridSearchCV(xgb_gs, param_grid=param_gs_threshold_opt, scoring='f1', cv=kf, n_jobs=-1)
grid_search.fit(X_overall_LR8_transf, y_overall_LR7_transf)

best_model = grid_search.best_estimator_

# {'colsample_bytree': 0.6, 'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 1}
# 0.6522

threshold_optimizer = ThresholdOptimizer(
    estimator=best_model,
    constraints="equalized_odds",
    predict_method="predict_proba",
    objective='accuracy_score',
)

threshold_optimizer.fit(X_overall_LR8_transf, y_overall_LR7_transf, sensitive_features=X_overall_LR8['race'])

y_pred = threshold_optimizer.predict(X_overall_LR8_transf, sensitive_features=X_overall_LR8['race'], random_state=18)

table_F1_fairlearn_threshold_opt = f1_score(y_overall_LR7_transf, y_pred)
table_F1_fairlearn_threshold_opt = round(table_F1_fairlearn_threshold_opt, 4)
print(table_F1_fairlearn_threshold_opt)

MemoryError: Unable to allocate 884. MiB for an array with shape (926983200,) and data type int8

In [50]:
from tabulate import tabulate

header = ["LR7", "Vanilla",  "RandomizedSearch", "GridSearch", "Fairlearn", "AI Fairness 360"]

data = [
    [table_F1_LR7, table_F1_vanilla,  table_F1_rs, table_F1_gs, table_F1_fairlearn_threshold_opt],
]

print(tabulate(data, headers=header, tablefmt="fancy_grid", floatfmt=".4f"))

╒══════════════════╤════════════════════╤════════════════════╤═════════════════╤═════════════╕
│ LR7              │ Vanilla            │ RandomizedSearch   │ GridSearch      │   Fairlearn │
╞══════════════════╪════════════════════╪════════════════════╪═════════════════╪═════════════╡
│ 0.5994 [+/-0.02] │ 0.6064 [+/-0.0236] │ 0.6263 [+/-0.0]    │ 0.6321 [+/-0.0] │      0.6565 │
╘══════════════════╧════════════════════╧════════════════════╧═════════════════╧═════════════╛


Quellen:

RegenerativeToday. (18.05.2022). Step by Step Tutorial on Logistic Regression in Python | sklearn |Jupyter Notebook [Video]. Youtube. https://www.youtube.com/watch?v=bSXIbCZNBw0

Dressel, A. & Farid, H. (2018). The accuracy, fairness, and limits of predicting recidivism. Science Advances, 4 (1), eaao5580. https://doi.org/10.1126/sciadv.aao5580

Ryan Nolan Data. (28.08.2023). A Comprehensive Guide to Cross-Validation with Scikit-Learn and Python [Video]. Youtube. https://www.youtube.com/watch?v=glLNo1ZnmPA&list=PLcQVY5V2UY4LNmObS0gqNVyNdVfXnHwu8&index=14

Scikit-Learn. (n.d.). User Guide. https://scikit-learn.org/stable/user_guide.html

Scikit-Learn. (n.d.). API Reference. https://scikit-learn.org/stable/api/index.html

Geeks for Geeks. (2022). How to make a table in Python?. https://www.geeksforgeeks.org/how-to-make-a-table-in-python/

Stackoverflow. (2016). Python's tabulate number of decimal. https://stackoverflow.com/questions/37079957/pythons-tabulate-number-of-decimal

Stackoverflow. (2018). How can I standardize only numeric variables in an sklearn pipeline?. https://stackoverflow.com/questions/48673402/how-can-i-standardize-only-numeric-variables-in-an-sklearn-pipeline

Velarde, G., Weichert, M., Deshmunkh, A., Deshmane, S., Sudhir, A., Sharma, K. & Joshi, V. (2024). Tree boosting methods for balanced and imbalanced classification and their robustness over time in risk assessment. Intelligent Systems with Applications. 22, 200354. https://doi.org/10.1016/j.iswa.2024.200354

Weerts, H. (19.06.2024). An Introduction to Responsible Machine Learning. GitHub. https://hildeweerts.github.io/responsiblemachinelearning/index.html
