In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Load the COMPAS two-year recidivism file, using its `id` column as the index.
filepath = 'data/compas-scores-two-years.csv'

try:
    df = pd.read_csv(filepath, index_col='id')

except IOError as err:
    # `__file__` does not exist inside a notebook kernel, so the folder shown
    # to the user is derived from `filepath` (resolved against the current
    # working directory), which is where read_csv actually looks.
    print("IOError: {}".format(err))
    print("To use this notebook, please download the following file:")
    print("\n\thttps://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
    print("\nand place it, as-is, in the folder:")
    print("\n\t{}\n".format(os.path.abspath(os.path.dirname(filepath))))
    # Re-raise so the cell fails visibly (with the original traceback) and a
    # "Run All" stops here instead of continuing without `df`.
    raise


In [3]:
# List the column names of the freshly loaded frame.
df.columns

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age',
       'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'days_b_screening_arrest',
       'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date',
       'c_arrest_date', 'c_days_from_compas', 'c_charge_degree',
       'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree',
       'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
       'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [4]:
# Pool rarely seen charge descriptions into a single 'other' category.
MIN_CHARGE_COUNT = 10  # descriptions occurring fewer times than this are pooled

# Frequency of each row's own description. Rows with a missing description map
# to NaN, fail the `<` comparison below, and are therefore left unchanged.
charge_counts = df['c_charge_desc'].map(df['c_charge_desc'].value_counts())
df['c_charge_desc'] = np.where(charge_counts < MIN_CHARGE_COUNT, 'other', df['c_charge_desc'])
df['c_charge_desc']

id
1             Aggravated Assault w/Firearm
3           Felony Battery w/Prior Convict
4                    Possession of Cocaine
5                   Possession of Cannabis
6                    arrest case no charge
7                                  Battery
8                Possession Burglary Tools
9                    arrest case no charge
10                                 Battery
13                                   other
14                 Poss 3,4 MDMA (Ecstasy)
15                                 Battery
16                                 Battery
18          Poss3,4 Methylenedioxymethcath
19                   arrest case no charge
20          Felony Driving While Lic Suspd
21                                 Battery
22           Grand Theft in the 3rd Degree
23           Driving While License Revoked
24                    Possession Of Heroin
25                   arrest case no charge
26           Battery on Law Enforc Officer
27           Possession Of Methamphetamine
28      

In [5]:
# One-hot encode the charge descriptions; each resulting column is named
# 'charge_<description>'.
one_hot = pd.get_dummies(df['c_charge_desc'], prefix='charge')
# Display (rows, columns) of the indicator frame.
one_hot.shape

(7214, 89)

In [6]:
# Append the charge indicator columns to the main frame. Indicator columns left
# over from an earlier run of this cell are dropped first, so re-running the
# cell does not produce duplicate column names.
df = pd.concat([df.drop(columns=one_hot.columns, errors='ignore'), one_hot], axis=1)

In [7]:
# Display the `decile_score` column.
df['decile_score']

id
1         1
3         3
4         4
5         8
6         1
7         1
8         6
9         4
10        1
13        3
14        4
15        6
16        1
18        4
19        1
20        3
21        1
22       10
23        5
24        3
25        6
26        9
27        2
28        4
30        4
32        1
33        3
37        3
38        3
39        7
         ..
10962     6
10963     4
10964     4
10965     4
10966     1
10967     2
10969     3
10971     2
10972     2
10975     2
10976     1
10977     7
10979     1
10980     2
10981     5
10982     8
10984     7
10985    10
10987     2
10988     3
10989     4
10990    10
10992     6
10994     2
10995     9
10996     7
10997     3
10999     1
11000     2
11001     4
Name: decile_score, Length: 7214, dtype: int64

In [8]:
# Number of rows for each value of `race`.
df['race'].value_counts()

African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64

In [9]:
# Number of rows for each value of `two_year_recid` (later copied into `Y`).
df['two_year_recid'].value_counts()

0    3963
1    3251
Name: two_year_recid, dtype: int64

In [10]:
# Column listing after the 'charge_*' indicator columns were appended.
df.columns

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age',
       'age_cat', 'race', 'juv_fel_count',
       ...
       'charge_Robbery Sudd Snatch No Weapon',
       'charge_Susp Drivers Lic 1st Offense',
       'charge_Tamper With Witness/Victim/CI',
       'charge_Tampering With Physical Evidence',
       'charge_Uttering a Forged Instrument',
       'charge_Viol Injunct Domestic Violence',
       'charge_Viol Pretrial Release Dom Viol',
       'charge_Viol Prot Injunc Repeat Viol', 'charge_arrest case no charge',
       'charge_other'],
      dtype='object', length=141)

In [11]:
# two things we could do:
# (1) post-processing of the scores COMPAS gave to make them more fair
# (2) train our own model, after-the-fact, using the actual recidivism outcomes.

In [12]:
# Columns kept for modelling: the hand-picked base fields, every charge
# indicator column, and finally 'P' and 'Y'.
base_columns = ['sex_new', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'c_charge_degree_new']
trailing_columns = ['P', 'Y']
features = np.concatenate((base_columns, one_hot.columns, trailing_columns))
# Guiding question: can we predict two_year_recid in a way that is fair?

In [13]:
# Show the assembled array of feature column names.
features

array(['sex_new', 'age', 'juv_fel_count', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'c_charge_degree_new',
       'charge_Agg Battery Grt/Bod/Harm',
       'charge_Aggrav Battery w/Deadly Weapon',
       'charge_Aggravated Assault W/Dead Weap',
       'charge_Aggravated Assault W/dead Weap',
       'charge_Aggravated Assault w/Firearm', 'charge_Aggravated Battery',
       'charge_Aggravated Battery / Pregnant', 'charge_Assault',
       'charge_Att Burgl Unoccupied Dwel', 'charge_Battery',
       'charge_Battery on Law Enforc Officer',
       'charge_Battery on a Person Over 65',
       'charge_Burglary Conveyance Unoccup',
       'charge_Burglary Dwelling Assault/Batt',
       'charge_Burglary Dwelling Occupied',
       'charge_Burglary Structure Unoccup',
       'charge_Burglary Unoccupied Dwelling',
       'charge_Carrying Concealed Firearm', 'charge_Child Abuse',
       'charge_Corrupt Public Servant',
       'charge_Crim Use of Personal ID Info',
       'charge_Cr

In [14]:
# binary-encode 'sex' and 'c_charge_degree' as the indicator columns 'sex_new' and 'c_charge_degree_new'

In [15]:
# Boolean indicator: True where `sex` equals 'Male'. The vectorized comparison
# yields the same values as a row-wise apply without one Python call per row.
df['sex_new'] = df['sex'] == 'Male'

In [16]:
# Boolean indicator: True where `c_charge_degree` equals 'F'
# (presumably "felony" -- confirm against the data dictionary).
# Vectorized comparison instead of a row-wise apply; same resulting values.
df['c_charge_degree_new'] = df['c_charge_degree'] == 'F'

In [17]:
# Integer codes for the two explicitly recognised `race` values.
_RACE_CODES = {'Caucasian': 0, 'African-American': 1}


def protected_race(row):
    """Translate the ``race`` entry of ``row`` into an integer group code.

    Returns 0 for 'Caucasian', 1 for 'African-American' and 3 for every
    other value.
    """
    return _RACE_CODES.get(row['race'], 3)

In [18]:
# Compute the group code for every row (protected_race is called once per row)
# and store it in column 'P'; then display the new column.
df['P'] = df.apply(protected_race, axis=1)
df['P']

id
1        3
3        1
4        1
5        1
6        3
7        3
8        0
9        3
10       0
13       0
14       0
15       1
16       0
18       1
19       0
20       1
21       3
22       1
23       0
24       0
25       1
26       1
27       0
28       0
30       3
32       0
33       3
37       1
38       0
39       1
        ..
10962    0
10963    0
10964    1
10965    0
10966    0
10967    1
10969    1
10971    0
10972    0
10975    1
10976    0
10977    1
10979    1
10980    1
10981    1
10982    0
10984    1
10985    1
10987    3
10988    3
10989    1
10990    0
10992    0
10994    1
10995    1
10996    1
10997    1
10999    3
11000    1
11001    3
Name: P, Length: 7214, dtype: int64

In [19]:
# Discard the rows whose group code `P` is 3. `.copy()` makes the result an
# independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[df['P'] != 3].copy()
df.shape

(6150, 144)

In [20]:
# Add 'Y' as a copy of `two_year_recid`. `assign` returns a new frame rather
# than writing into `df`, which at this point is the result of a boolean
# filter; this avoids a SettingWithCopyWarning. The new column is appended last.
df = df.assign(Y=df['two_year_recid'])

In [21]:
# Keep only the columns listed in `features` (the frame's existing column order
# is preserved) and store every value as a 32-bit integer.
# The `columns=` keyword replaces the positional axis argument, which
# pandas >= 2.0 rejects, and the non-inplace form avoids mutating a filtered frame.
df = df.drop(columns=df.columns.difference(features)).astype('int32')

In [22]:
# Preview the first rows of the reduced, integer-typed frame.
df.head()

Unnamed: 0_level_0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,charge_Agg Battery Grt/Bod/Harm,charge_Aggrav Battery w/Deadly Weapon,charge_Aggravated Assault W/Dead Weap,charge_Aggravated Assault W/dead Weap,charge_Aggravated Assault w/Firearm,...,charge_Uttering a Forged Instrument,charge_Viol Injunct Domestic Violence,charge_Viol Pretrial Release Dom Viol,charge_Viol Prot Injunc Repeat Viol,charge_arrest case no charge,charge_other,sex_new,c_charge_degree_new,P,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,34,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
4,24,0,0,1,4,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
5,23,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
8,41,0,0,0,14,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
10,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Mean of `P`; with rows coded 3 removed, `P` is 0 or 1, so this is the share
# of rows with P == 1.
df['P'].mean()

0.6009756097560975

In [55]:
# Write the processed frame to CSV; index=False leaves the row index (`id`)
# out of the file.
df.to_csv('compas_new.csv', index=False)

In [None]:
# Read the file line by line, split each line on whitespace and convert every
# token to float; the per-line lists are then stacked into a numpy array.
with open('data/german.numeric.processed') as f:
    data_raw = np.array([[float(token) for token in line.split()] for line in f])

In [None]:
# Display the dimensions of the parsed array.
data_raw.shape

In [None]:
# Column labels: 'col_0' ... 'col_22' for the first 23 columns, then 'P' and 'Y'.
col_names = ['col_' + str(i) for i in range(23)] + ['P', 'Y']

In [None]:
# Wrap the parsed array in a DataFrame labelled with `col_names`, then preview
# the first rows.
german_df = pd.DataFrame(data=data_raw, columns=col_names)
german_df.head()

In [None]:
# Write the German credit frame to CSV. The frame has only the default integer
# row index, which carries no information; index=False keeps it from being
# written as an extra unnamed first column, matching how compas_new.csv is
# exported.
german_df.to_csv('german_clean.csv', index=False)