In [11]:
# Increase the page width by overriding the `.container` CSS width to 100%.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

<h3>1. Dataset exploration</h3>
In this part we will explore the ProPublica COMPAS dataset. The main goal is to identify which features we are going to use to build the classifier.

<h5>1.1 ProPublica COMPAS dataset</h5>
We will first present the dataset and observe the features.

In [3]:
import os,sys
import urllib
import urllib.request
from random import seed, shuffle

import numpy as np
import pandas as pd
from sklearn import feature_extraction
from sklearn import preprocessing
#from __future__ import division
#from collections import defaultdict
#import utils as ut

# Seed NumPy's global generator and Python's `random` module with the
# same fixed value.
SEED = 1234
np.random.seed(SEED)
seed(SEED)

def check_data_file(fname, addr="https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"):
    """Make sure the data file ``fname`` exists locally, downloading it if not.

    Parameters
    ----------
    fname : str
        Path of the file to look for (relative paths are resolved against
        the current working directory).
    addr : str, optional
        URL the file is fetched from when it is missing. Defaults to the
        ProPublica ``compas-scores-two-years.csv`` file on GitHub.

    Returns
    -------
    None
        The function only prints progress messages and, if needed, writes
        the downloaded bytes to ``fname``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` if the download fails.
    """
    # Interpolate the file name into the message; passing it as a second
    # argument to print() would leave a literal '%s' in the output.
    print("Looking for file '%s' in the current directory..." % fname)

    if os.path.isfile(fname):
        print("File found in current directory..")
        return

    print("'%s' not found! Downloading from GitHub..." % fname)
    # Read the whole payload before opening the target, so a failed
    # download does not leave an empty file that would later be "found".
    with urllib.request.urlopen(addr) as response:
        data = response.read()
    with open(fname, "wb") as file_out:
        file_out.write(data)
    print("'%s' downloaded and saved locally.." % fname)
    
# Local file name of the COMPAS two-year recidivism scores; the call below
# checks for it in the current directory and downloads it when absent.
COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
check_data_file(fname=COMPAS_INPUT_FILE)

Looking for file '%s' in the current directory... compas-scores-two-years.csv
File found in current directory..


In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Read the same file that check_data_file() verified/downloaded, instead of
# repeating the file name as a second literal that could drift out of sync.
df=pd.read_csv(COMPAS_INPUT_FILE)
df

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_offense_date,c_arrest_date,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,r_case_number,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,2013-08-13,,1.0,F,Aggravated Assault w/Firearm,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,2013-01-26,,1.0,F,Felony Battery w/Prior Convict,1,13009779CF10A,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,13009779CF10A,(F3),2013-07-05,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,2013-04-13,,1.0,F,Possession of Cocaine,1,13011511MM10A,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,,,,,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,2013-01-12,,1.0,F,Possession of Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,,2013-01-09,76.0,F,arrest case no charge,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,0,7,0,0,0,-1.0,2013-11-22 05:18:27,2013-11-24 02:59:20,13016249CF10A,2013-11-22,,1.0,F,Deliver Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,7,Medium,2013-11-23,Risk of Violence,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,0,3,0,0,0,-1.0,2014-01-31 07:13:54,2014-02-02 04:03:52,14001422CF10A,2014-01-31,,1.0,F,Leaving the Scene of Accident,0,,,,,,,,,0,,,,,Risk of Recidivism,3,Low,2014-02-01,Risk of Violence,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,0,1,0,0,0,-1.0,2014-01-13 05:48:01,2014-01-14 07:49:46,14000581CF10A,2014-01-13,,1.0,F,Aggravated Battery / Pregnant,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2014-01-14,Risk of Violence,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,0,2,0,0,3,-1.0,2014-03-08 08:06:02,2014-03-09 12:18:04,14003308CF10A,2014-03-08,,1.0,M,Battery on Law Enforc Officer,0,,,,,,,,,0,,,,,Risk of Recidivism,2,Low,2014-03-09,Risk of Violence,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


As we can see, the dataset contains a lot of columns (53), and we need to decide whether all of these features are needed. To do this, we will first look at which columns have missing values.

In [5]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

id                            0
name                          0
first                         0
last                          0
compas_screening_date         0
sex                           0
dob                           0
age                           0
age_cat                       0
race                          0
juv_fel_count                 0
decile_score                  0
juv_misd_count                0
juv_other_count               0
priors_count                  0
days_b_screening_arrest     307
c_jail_in                   307
c_jail_out                  307
c_case_number                22
c_offense_date             1159
c_arrest_date              6077
c_days_from_compas           22
c_charge_degree               0
c_charge_desc                29
is_recid                      0
r_case_number              3743
r_charge_degree            3743
r_days_from_arrest         4898
r_offense_date             3743
r_charge_desc              3801
r_jail_in                  4898
r_jail_o

We can see from the data above that some columns, such as r_case_number and r_charge_degree, have empty values. We can't use these parts of the data to build our classifier. Also, by direct observation, we can see that some of the columns, such as id and name, are not relevant.

<h5>1.2 Filtered Dataset</h5>
We will first remove all columns which contain null (NaN) values, and then list how many unique values each remaining column has.

In [12]:
# Keep only the columns in which every value is present, i.e. drop any
# column that contains at least one NaN.
fully_populated = df.notna().all()
df = df.loc[:, fully_populated[fully_populated].index]
# Count the distinct values in each remaining column.
df.nunique()

id                       7214
name                     7158
first                    2800
last                     3950
compas_screening_date     690
sex                         2
dob                      5452
age                        65
age_cat                     3
race                        6
juv_fel_count              11
decile_score               10
juv_misd_count             10
juv_other_count            10
priors_count               37
c_charge_degree             2
is_recid                    2
is_violent_recid            2
type_of_assessment          1
decile_score.1             10
score_text                  3
screening_date            690
v_type_of_assessment        1
v_decile_score             10
v_score_text                3
v_screening_date          690
priors_count.1             37
start                     237
end                      1115
event                       2
two_year_recid              2
dtype: int64

We can see from above that some columns have a large number of unique values. This may indicate that they are not relevant, which means we can drop them. We can also see that some columns have only 1 unique value; we will not use these columns either.

In [14]:
# Restrict the frame to the columns retained for the rest of the analysis.
df = df.loc[:, [
    'sex',
    'age_cat',
    'race',
    'juv_fel_count',
    'decile_score',
    'juv_misd_count',
    'priors_count',
    'c_charge_degree',
    'is_recid',
    'is_violent_recid',
    'decile_score.1',
    'score_text',
    'v_score_text',
    'priors_count.1',
    'event',
    'two_year_recid',
]]
df

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,priors_count,c_charge_degree,is_recid,is_violent_recid,decile_score.1,score_text,v_score_text,priors_count.1,event,two_year_recid
0,Male,Greater than 45,Other,0,1,0,0,F,0,0,1,Low,Low,0,0,0
1,Male,25 - 45,African-American,0,3,0,0,F,1,1,3,Low,Low,0,1,1
2,Male,Less than 25,African-American,0,4,0,4,F,1,0,4,Low,Low,4,0,1
3,Male,Less than 25,African-American,0,8,1,1,F,0,0,8,High,Medium,1,0,0
4,Male,25 - 45,Other,0,1,0,2,F,0,0,1,Low,Low,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,Male,Less than 25,African-American,0,7,0,0,F,0,0,7,Medium,Medium,0,0,0
7210,Male,Less than 25,African-American,0,3,0,0,F,0,0,3,Low,Medium,0,0,0
7211,Male,Greater than 45,Other,0,1,0,0,F,0,0,1,Low,Low,0,0,0
7212,Female,25 - 45,African-American,0,2,0,3,M,0,0,2,Low,Low,3,0,0


<h5>1.3 Performance Metrics</h5>
In this part we will compute basic performance metrics of....