In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.utils import resample

## Dataset Bias Mitigation: Resampling 
1. based on RFS, the top ranking feature that is the most important in predicting violent Recividism Score produced by COMPAS in the logistic regression are: 
-           Feature    Weight
- 0             sex  0.717787
- 2  juv_misd_count  0.563509

2. based on RFS, the two important feature in prediciting two year recividism collected by Prolific are:
- Selected Features: ['sex', 'age_cat']
- Accuracy: 0.8343881856540084

In [2]:
compas = pd.read_csv('data/compas-scores-two-years-violent.csv')
compas.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid,two_year_recid.1
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1,1
2,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,Medium,2013-01-13,,,1,0,1174,0,0,0
3,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,Low,2013-03-26,,,2,0,1102,0,0,0
4,7,marsha miles,marsha,miles,2013-11-30,Male,1971-08-22,44,25 - 45,Other,...,Low,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0,0


In [3]:
compas['c_jail_in'] = pd.to_datetime(compas['c_jail_in'], format='mixed')
compas['c_jail_out'] = pd.to_datetime(compas['c_jail_out'], format='mixed')

compas['v_screening_date'] = pd.to_datetime(compas['v_screening_date'], format='mixed')
compas['vr_offense_date'] = pd.to_datetime(compas['vr_offense_date'], format='mixed')

compas['c_offense_date'] = pd.to_datetime(compas['c_offense_date'], format='mixed')
compas['c_arrest_date'] = pd.to_datetime(compas['c_arrest_date'], format='mixed')

compas['compas_screening_date'] = pd.to_datetime(compas['compas_screening_date'], format='mixed')

In [4]:
compas = compas.drop(['first','last','c_case_number','c_charge_degree', 'c_charge_desc', 'r_case_number','r_charge_degree','vr_case_number','vr_charge_degree','start','end','event'],axis=1)

#remove row with nan scores 
compas = compas.loc[compas['score_text'].notnull()]
#clean out the "-1" decile score from dataset 
compas = compas.loc[compas['v_decile_score'] != -1]

#label encodimg
compas['sex'] = compas['sex'].replace({'Male': 1, 'Female': 0})
compas['score_text'] = compas['score_text'].replace({'Low':0, 'Medium':1, 'High': 2})

#days in jail
compas['jail_out- jail_in'] = (compas['c_jail_out'] - compas['c_jail_in']).dt.days

In [5]:

#compas['reoffend'] = compas['v_decile_score'].apply(lambda x: 1 if x > 5 else 0)
compas.loc[:,'age_cat'] = compas['age_cat'].replace({'Greater than 45': 2, '25 - 45': 1, 'Less than 25': 0})
# Selecting features and target
features = compas[['sex', 'age_cat']]
target = compas['is_violent_recid']

# Splitting the data into training and testing sets
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size=0.2, random_state=42)

# Initializing and training the logistic regression model
model = LogisticRegression()
model.fit(train_features, train_target)

# Making predictions
predicted_classes = model.predict(test_features)

# Evaluating the model
accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
conf_matrix = confusion_matrix(test_target, predicted_classes)

report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.83      1.00      0.91       785\n           1       0.00      0.00      0.00       163\n\n    accuracy                           0.83       948\n   macro avg       0.41      0.50      0.45       948\nweighted avg       0.69      0.83      0.75       948\n'

In [6]:
# target = compas['two_year_recid']
#single factor 
'''
               Feature  Ranking
0                  sex        1
2              age_cat        2
5       juv_misd_count        3
9       priors_count.1        4
6      juv_other_count        5
3                 race        6
4        juv_fel_count        7
7         priors_count        8
1                  age        9
10   jail_out- jail_in       10
8   c_days_from_compas       11
Selected Features: ['sex']
Accuracy: 0.8343881856540084
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       791
           1       0.00      0.00      0.00       157

    accuracy                           0.83       948
   macro avg       0.42      0.50      0.45       948
weighted avg       0.70      0.83      0.76       948

Confusion Matrix:
 [[791   0]
 [157   0]]
'''
# two factor 
'''
               Feature  Ranking
0                  sex        1
2              age_cat        1
5       juv_misd_count        2
9       priors_count.1        3
6      juv_other_count        4
3                 race        5
4        juv_fel_count        6
7         priors_count        7
1                  age        8
10   jail_out- jail_in        9
8   c_days_from_compas       10
Selected Features: ['sex', 'age_cat']
Accuracy: 0.8343881856540084
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       791
           1       0.00      0.00      0.00       157

    accuracy                           0.83       948
   macro avg       0.42      0.50      0.45       948
weighted avg       0.70      0.83      0.76       948

Confusion Matrix:
 [[791   0]
 [157   0]]
'''

"\n               Feature  Ranking\n0                  sex        1\n2              age_cat        1\n5       juv_misd_count        2\n9       priors_count.1        3\n6      juv_other_count        4\n3                 race        5\n4        juv_fel_count        6\n7         priors_count        7\n1                  age        8\n10   jail_out- jail_in        9\n8   c_days_from_compas       10\nSelected Features: ['sex', 'age_cat']\nAccuracy: 0.8343881856540084\nClassification Report:\n               precision    recall  f1-score   support\n\n           0       0.83      1.00      0.91       791\n           1       0.00      0.00      0.00       157\n\n    accuracy                           0.83       948\n   macro avg       0.42      0.50      0.45       948\nweighted avg       0.70      0.83      0.76       948\n\nConfusion Matrix:\n [[791   0]\n [157   0]]\n"

## Resampling: Oversampling the Minority Races to match with the top race 

In [7]:
compas_race = compas['race'].value_counts()
compas_race

race
African-American    2266
Caucasian           1697
Hispanic             458
Other                279
Asian                 27
Native American       11
Name: count, dtype: int64

In [11]:
compas_race = compas['race'].value_counts()
target_count = compas_race.max()

african_american = compas[compas['race'] == 'African-American']
caucasian = compas[compas['race'] == 'Caucasian']
hispanic = compas[compas['race'] == 'Hispanic']
other = compas[compas['race'] == 'Other']
asian = compas[compas['race'] == 'Asian']
native_american = compas[compas['race'] == ' Native American']


african_american_r = resample(african_american, replace=True, n_samples=target_count)
caucasian_r = resample(caucasian, replace=True, n_samples=target_count)
#hispanic_r = resample(hispanic, replace=True, n_samples=target_count)

compas_resampled = pd.concat([african_american_r,caucasian_r, hispanic, other,asian, native_american])
compas_resampled = compas_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

compas_resampled['race'].value_counts()

race
African-American    2266
Caucasian           2266
Hispanic             458
Other                279
Asian                 27
Name: count, dtype: int64

In [14]:
compas_resampled.to_csv('oversample_t2.csv', index=False)

## Resampling: Undersample 

In [17]:
ret = compas['race'].value_counts()
ret['Caucasian']

458

In [20]:
compas_race = compas['race'].value_counts()
target_count = compas_race['Caucasian']
print(target_count)
african_american = compas[compas['race'] == 'African-American']
caucasian = compas[compas['race'] == 'Caucasian']
hispanic = compas[compas['race'] == 'Hispanic']
other = compas[compas['race'] == 'Other']
asian = compas[compas['race'] == 'Asian']
native_american = compas[compas['race'] == ' Native American']


african_american_r = resample(african_american, replace=True, n_samples=target_count)
#caucasian_r = resample(caucasian, replace=True, n_samples=target_count)
#hispanic_r = resample(hispanic, replace=True, n_samples=target_count)

compas_resampled = pd.concat([african_american_r,caucasian, hispanic, other,asian, native_american])
compas_resampled = compas_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

compas_resampled['race'].value_counts()

1697


race
African-American    1697
Caucasian           1697
Hispanic             458
Other                279
Asian                 27
Name: count, dtype: int64

In [21]:
compas_resampled.to_csv('undersample_t2.csv', index=False)

### Resampling: Gender 


In [23]:
compas['sex'].value_counts()

sex
1    3743
0     995
Name: count, dtype: int64

In [34]:
compas_sex = compas['sex'].value_counts()
target = compas_sex.sum() //2

male = compas[compas['sex'] == 1]
female = compas[compas['sex'] == 0]

male_r = resample(male, replace=True, n_samples=target)
female_r = resample(female, replace=True, n_samples=target)
compas_sex_resample = pd.concat([male_r, female_r])
compas_sex_resample = compas_sex_resample.sample(frac=1, random_state=42).reset_index(drop=True)
compas_sex_resample['sex'].value_counts()

sex
1    2369
0    2369
Name: count, dtype: int64

In [35]:
compas_sex_resample.to_csv('comb_sample_sex.csv',index=False)

In [32]:
compas_sex.sum() //2

2369

In [36]:
pd.read_csv('comb_sample_sex.csv')

Unnamed: 0,id,name,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,...,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,two_year_recid,two_year_recid.1,jail_out- jail_in
0,10548,patrick doerfor,2014-02-27,1,1965-03-17,51,2,Caucasian,0,1,...,Risk of Violence,1,Low,2014-02-27,,,1,0,0,
1,4959,george rodriguez,2014-01-04,1,1959-05-14,56,2,Hispanic,0,2,...,Risk of Violence,1,Low,2014-01-04,2014-01-03,2014-01-16,10,1,1,13.0
2,8341,irving tepper,2013-11-13,1,1968-07-10,47,2,Caucasian,0,5,...,Risk of Violence,3,Low,2013-11-13,2013-11-12,2013-11-13,1,0,0,0.0
3,874,jorge fernandez,2013-03-06,1,1990-01-23,26,1,Hispanic,0,6,...,Risk of Violence,6,Medium,2013-03-06,2013-03-06,2013-03-06,4,0,0,-1.0
4,10888,kemisha douglas,2014-05-23,0,1989-12-19,26,1,African-American,0,8,...,Risk of Violence,4,Low,2014-05-23,2015-05-08,2015-05-15,4,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4733,1030,melissa harris,2013-04-26,0,1987-08-20,28,1,Caucasian,0,2,...,Risk of Violence,2,Low,2013-04-26,2014-04-25,2014-04-26,0,0,0,0.0
4734,1623,edward johnson,2013-10-22,1,1958-10-07,57,2,African-American,0,7,...,Risk of Violence,4,Low,2013-10-22,2013-10-21,2014-02-16,19,0,0,117.0
4735,7355,melissa becker,2013-08-05,0,1980-04-23,35,1,Caucasian,0,3,...,Risk of Violence,1,Low,2013-08-05,2013-09-18,2013-10-01,2,0,0,1.0
4736,10667,michelle st. surin,2013-05-21,0,1987-02-27,29,1,African-American,0,4,...,Risk of Violence,3,Low,2013-05-21,,,0,0,0,
