In [1]:
import os
import pandas as pd

In [2]:
# Location of the ProPublica COMPAS two-year recidivism CSV, relative to the
# directory the notebook is run from.
filepath = 'data/compas-scores-two-years.csv'

try:
    # Use the dataset's 'id' column as the row index.
    df = pd.read_csv(filepath, index_col='id')

except IOError as err:
    # The file is missing or unreadable: explain how to obtain it, then stop.
    print("IOError: {}".format(err))
    print("To run this notebook, please download the following file:")
    print("\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
    print("\nand place it, as-is, in the folder:")
    # `__file__` is normally undefined in a notebook kernel, so building the
    # hint from it raised NameError and hid these instructions. Report the
    # folder that `filepath` is actually read from instead.
    print("\n\t{}\n".format(os.path.abspath(os.path.dirname(filepath))))
    # Same effect as sys.exit(1), without an import in the middle of the notebook.
    raise SystemExit(1)


In [4]:
# Number of rows per value of the 'race' column, most frequent first.
race_counts = df['race'].value_counts()
race_counts

African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64

In [14]:
# Number of rows per value of the 'two_year_recid' column.
recid_counts = df['two_year_recid'].value_counts()
recid_counts

0    3963
1    3251
Name: two_year_recid, dtype: int64

In [9]:
# List every column available in the loaded frame.
column_index = df.columns
column_index

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age',
       'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'days_b_screening_arrest',
       'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date',
       'c_arrest_date', 'c_days_from_compas', 'c_charge_degree',
       'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree',
       'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
       'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [None]:
# Two things we could do:
# (1) post-process the scores COMPAS gave to make them fairer
# (2) train our own model, after the fact, using the actual recidivism outcomes.

In [18]:
# Columns that survive the clean-up step. 'Y' and 'P' are not columns of the
# raw CSV; they are added to `df` by separate assignments in this notebook.
features = [
    'name', 'sex', 'dob', 'age',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count',
    'c_charge_desc', 'c_charge_degree',
    'Y', 'P',
]
# Guiding question: can we predict two_year_recid in a way that is fair?

In [19]:
# Flag column 'P': True for every row whose 'race' is anything other than
# 'Caucasian' (presumably the protected-group indicator for the fairness
# analysis — confirm). Computed as a vectorized comparison on the column
# instead of a row-by-row `apply`, which yields the same booleans without a
# Python-level call per row.
df['P'] = df['race'] != 'Caucasian'

In [22]:
# Copy the `decile_score` column into a new column 'Y'.
# NOTE(review): the comment next to the `features` list talks about predicting
# `two_year_recid`, but 'Y' is taken from `decile_score` here — confirm which
# target is intended.
df['Y'] = df.loc[:, 'decile_score']

In [23]:
# Keep only the columns named in `features`: `Index.difference` yields every
# current column that is not in the list, and those are dropped in place.
# `axis` is passed by keyword because `DataFrame.drop` no longer accepts it
# positionally in pandas 2.0+ (the old `drop(labels, 1)` form raises TypeError).
df.drop(df.columns.difference(features), axis=1, inplace=True)

In [24]:
# Show the first five rows of the trimmed frame as a sanity check.
df.head(n=5)

Unnamed: 0_level_0,name,sex,dob,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,c_charge_desc,P,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,miguel hernandez,Male,1947-04-18,69,0,0,0,0,F,Aggravated Assault w/Firearm,True,1
3,kevon dixon,Male,1982-01-22,34,0,0,0,0,F,Felony Battery w/Prior Convict,True,3
4,ed philo,Male,1991-05-14,24,0,0,1,4,F,Possession of Cocaine,True,4
5,marcu brown,Male,1993-01-21,23,0,1,0,1,F,Possession of Cannabis,True,8
6,bouthy pierrelouis,Male,1973-01-22,43,0,0,0,2,F,arrest case no charge,True,1


In [25]:
# Write the trimmed frame to the working directory; index=False leaves the
# row index out of the file.
# NOTE(review): 'name' and 'dob' are among the kept columns, so the output
# file contains personal identifiers — confirm that is intended.
output_path = 'compas_clean.csv'
df.to_csv(output_path, index=False)