In [28]:
import os
import pandas as pd
import numpy as np

In [29]:
# Location of the ProPublica COMPAS extract, relative to the working directory.
filepath = 'data/compas-scores-two-years.csv'

try:
    # The 'id' column becomes the row index instead of a data column.
    df = pd.read_csv(filepath, index_col='id')

except IOError as err:
    # The file is missing or unreadable: explain how to obtain it, then stop.
    # The destination folder is derived from `filepath` itself. `__file__` is
    # not defined in a notebook kernel, so building the hint from it raised a
    # NameError before the instructions could be printed, and the folder it
    # described did not match `filepath` anyway.
    print("IOError: {}".format(err))
    print("To run this notebook, please download the following file:")
    print("\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
    print("\nand place it, as-is, in the folder:")
    print("\n\t{}\n".format(os.path.dirname(os.path.abspath(filepath))))
    # Same effect as sys.exit(1), without an import in the middle of the cell.
    raise SystemExit(1)


In [39]:
# Display the column labels of the loaded COMPAS frame.
df.columns

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age',
       'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'days_b_screening_arrest',
       'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date',
       'c_arrest_date', 'c_days_from_compas', 'c_charge_degree',
       'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree',
       'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
       'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [34]:
# Number of rows per distinct charge description, most frequent first.
df['c_charge_desc'].value_counts()

Battery                                           1156
arrest case no charge                             1137
Possession of Cocaine                              474
Grand Theft in the 3rd Degree                      425
Driving While License Revoked                      200
Driving Under The Influence                        135
Felony Battery (Dom Strang)                        100
Grand Theft (Motor Vehicle)                         98
Pos Cannabis W/Intent Sel/Del                       97
Possess Cannabis/20 Grams Or Less                   96
Felony Driving While Lic Suspd                      96
Burglary Unoccupied Dwelling                        84
Burglary Conveyance Unoccup                         77
Possession of Cannabis                              76
DUI Property Damage/Injury                          74
Poss3,4 Methylenedioxymethcath                      72
Felony Petit Theft                                  68
Driving License Suspended                           68
Aggrav Bat

In [17]:
# Number of rows per value of the 'race' column.
df['race'].value_counts()

African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64

In [18]:
# Number of rows per value of the two-year recidivism outcome (0 / 1).
df['two_year_recid'].value_counts()

0    3963
1    3251
Name: two_year_recid, dtype: int64

In [19]:
# Display the available column labels again.
df.columns

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age',
       'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'days_b_screening_arrest',
       'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date',
       'c_arrest_date', 'c_days_from_compas', 'c_charge_degree',
       'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree',
       'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
       'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [20]:
# two things we could do:
# (1) post-processing of the scores COMPAS gave to make them more fair
# (2) train our own model, after-the-fact, using the actual recidivism outcomes.

In [21]:
# Columns to retain in the cleaned frame. The goal is to ask whether
# two_year_recid can be predicted in a way that is fair.
# 'sex_new', 'c_charge_degree_new', 'P' and 'Y' do not exist in the raw file;
# they are derived columns added to df by separate cells.
features = [
    'sex_new',
    'age',
    'juv_fel_count',
    'juv_misd_count',
    'juv_other_count',
    'priors_count',
    'c_charge_desc',
    'c_charge_degree_new',
    'Y',
    'P',
]

In [22]:
# Boolean flag: True where 'sex' equals 'Male', False otherwise.
# The column-wise comparison yields the same values as the previous row-wise
# df.apply(..., axis=1) without calling a Python lambda once per row, and it
# also behaves sensibly on an empty frame.
df['sex_new'] = df['sex'] == 'Male'

In [23]:
# Boolean flag: True where 'c_charge_degree' equals 'F', False otherwise.
# The column-wise comparison replaces a row-wise df.apply(..., axis=1): same
# values, no per-row Python call.
df['c_charge_degree_new'] = df['c_charge_degree'] == 'F'

In [24]:
# Boolean flag 'P': True for every row whose 'race' is not 'Caucasian'.
# The column-wise comparison replaces a row-wise df.apply(..., axis=1): same
# values, no per-row Python call.
df['P'] = df['race'] != 'Caucasian'

In [25]:
# Copy the two-year recidivism outcome into a column named 'Y', the name
# listed in `features`.
df['Y'] = df['two_year_recid']

In [26]:
# Keep only the columns named in `features`; every other column is removed
# from df in place.
# The axis is chosen with the `columns=` keyword: the positional form
# drop(labels, 1) was deprecated in pandas 1.3 and raises TypeError from 2.0.
df.drop(columns=df.columns.difference(features), inplace=True)

In [27]:
# Preview the first five rows of df.
df.head()

Unnamed: 0_level_0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_desc,sex_new,c_charge_degree_new,P,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,69,0,0,0,0,Aggravated Assault w/Firearm,True,True,True,0
3,34,0,0,0,0,Felony Battery w/Prior Convict,True,True,True,1
4,24,0,0,1,4,Possession of Cocaine,True,True,True,1
5,23,0,1,0,1,Possession of Cannabis,True,True,True,0
6,43,0,0,0,2,arrest case no charge,True,True,True,0


In [25]:
# Write the frame to 'compas_clean.csv' in the working directory.
# index=False leaves the row index (the 'id' column chosen at load time) out
# of the written file.
df.to_csv('compas_clean.csv', index=False)

In [8]:
# Read the whitespace-delimited file line by line, converting every token to
# float, and stack the per-line lists into a NumPy array.
with open('data/german.numeric.processed') as handle:
    parsed_rows = [[float(token) for token in line.split()] for line in handle]
    data_raw = np.array(parsed_rows)

In [10]:
# Dimensions of the parsed array.
data_raw.shape

(1000, 25)

In [11]:
# 23 generic labels ('col_0' .. 'col_22') followed by 'P' and 'Y'.
col_names = ['col_{}'.format(idx) for idx in range(23)] + ['P', 'Y']

In [13]:
# Wrap the parsed array in a DataFrame labelled with col_names, then preview
# its first rows.
german_df = pd.DataFrame(data_raw, columns=col_names)
german_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,P,Y
0,1.0,6.0,4.0,12.0,5.0,5.0,3.0,4.0,67.0,3.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,2.0,48.0,2.0,60.0,1.0,3.0,2.0,2.0,22.0,3.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
2,4.0,12.0,4.0,21.0,1.0,4.0,3.0,3.0,49.0,3.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,42.0,2.0,79.0,1.0,4.0,3.0,4.0,45.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,24.0,3.0,49.0,1.0,3.0,3.0,4.0,53.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [14]:
# Write the frame to 'german_clean.csv' in the working directory.
# With the default index setting the row index is written as an extra,
# unnamed first column.
# NOTE(review): the COMPAS export passes index=False while this one does not;
# confirm that the extra index column is wanted by whatever reads this file.
german_df.to_csv('german_clean.csv')