In [1]:
# package imports
import pandas as pd
import numpy as np
# show up to 500 columns when a DataFrame is displayed (the frame gets wide after dummy encoding)
pd.options.display.max_columns = 500

In [2]:
# Load the raw COMPAS scores export into `rawdf` and report its (rows, columns) shape
rawdf = pd.read_csv(filepath_or_buffer='data/compas-scores-raw.csv')
print('size of df: ', rawdf.shape)

size of df:  (60843, 28)


In [3]:
## FROM METHODOLOGY PIECE
# "Through a public records request, ProPublica obtained two years worth of
# COMPAS scores from the Broward County Sheriff’s Office in Florida.
# We received data for all 18,610 people who were scored in 2013 and 2014."
#
# Sanity check: the number of distinct people in the raw file should match
# the 18,610 quoted above.
rawdf['Person_ID'].nunique()

18610

In [4]:
## Drop rows whose COMPAS decile score is negative. Per the original note the
## value being removed is -1, which presumably marks a missing/invalid score
## (TODO: confirm against the COMPAS data dictionary).
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
rawdf = rawdf[rawdf['DecileScore'] >= 0].copy()

# show the remaining (rows, columns) and confirm how many distinct people are left
rawdf.shape, rawdf['Person_ID'].nunique()

((60798, 28), 18610)

In [5]:
# Rewrite the abbreviated 'African-Am' label to 'African-American' so the
# category is spelled one way only before it is dummy-encoded
rawdf.loc[
    rawdf['Ethnic_Code_Text'] == 'African-Am',
    'Ethnic_Code_Text',
] = 'African-American'

In [6]:
# convert date columns to datetime
rawdf['Screening_Date'] = pd.to_datetime(rawdf['Screening_Date'])
rawdf['DateOfBirth'] = pd.to_datetime(rawdf['DateOfBirth'])

# fix issue with years before 1970 becoming coded as '20' instead of '19':
# a birth date later than the screening date is impossible, so those rows
# were parsed into the wrong century and are moved back 100 years.
# NOTE(review): a wrongly-parsed birth date that still falls before the
# screening date is not caught by this rule.
born_after_screening = rawdf['DateOfBirth'] > rawdf['Screening_Date']
rawdf.loc[born_after_screening, 'DateOfBirth'] = (
    rawdf.loc[born_after_screening, 'DateOfBirth'] - pd.DateOffset(years=100)
)

# new col for age in whole years, measured at the screening date (the column
# keeps its existing name 'AgeAtArrest').
# Count calendar birthdays instead of dividing elapsed days by 365: the day
# count includes leap days, so days/365 rolls over to the next age a few days
# before the actual birthday.
screening_date = rawdf['Screening_Date']
birth_date = rawdf['DateOfBirth']
birthday_not_reached = (screening_date.dt.month < birth_date.dt.month) | (
    (screening_date.dt.month == birth_date.dt.month)
    & (screening_date.dt.day < birth_date.dt.day)
)
rawdf['AgeAtArrest'] = (
    screening_date.dt.year - birth_date.dt.year - birthday_not_reached.astype(int)
)

In [7]:
# make categorical variables dummies: each listed column is replaced by one
# indicator column per observed category, named '<column>_<category>'.
# dtype is pinned to uint8 so the indicators are stored (and later saved) as
# 0/1 on every pandas version; pandas >= 2.0 would otherwise default to bool
# and produce True/False.
rawdf = pd.get_dummies(
    rawdf,
    columns=[
        'Sex_Code_Text',
        'Ethnic_Code_Text',
        'LegalStatus',
        'Language',
        'CustodyStatus',
        'MaritalStatus',
        'RecSupervisionLevelText',  # recommended supervision level
        'DisplayText',
    ],
    dtype=np.uint8,
)

In [8]:
# preview the first five rows of the cleaned, dummy-encoded frame
# (note: the rendered output includes people's names and birth dates)
rawdf.head(n=5)

Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,DateOfBirth,ScaleSet_ID,ScaleSet,AssessmentReason,Screening_Date,RecSupervisionLevel,Scale_ID,RawScore,DecileScore,ScoreText,AssessmentType,IsCompleted,IsDeleted,AgeAtArrest,Sex_Code_Text_Female,Sex_Code_Text_Male,Ethnic_Code_Text_African-American,Ethnic_Code_Text_Arabic,Ethnic_Code_Text_Asian,Ethnic_Code_Text_Caucasian,Ethnic_Code_Text_Hispanic,Ethnic_Code_Text_Native American,Ethnic_Code_Text_Oriental,Ethnic_Code_Text_Other,LegalStatus_Conditional Release,LegalStatus_Deferred Sentencing,LegalStatus_Other,LegalStatus_Parole Violator,LegalStatus_Post Sentence,LegalStatus_Pretrial,LegalStatus_Probation Violator,Language_English,Language_Spanish,CustodyStatus_Jail Inmate,CustodyStatus_Parole,CustodyStatus_Pretrial Defendant,CustodyStatus_Prison Inmate,CustodyStatus_Probation,CustodyStatus_Residential Program,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Separated,MaritalStatus_Significant Other,MaritalStatus_Single,MaritalStatus_Unknown,MaritalStatus_Widowed,RecSupervisionLevelText_High,RecSupervisionLevelText_Low,RecSupervisionLevelText_Medium,RecSupervisionLevelText_Medium with Override Consideration,DisplayText_Risk of Failure to Appear,DisplayText_Risk of Recidivism,DisplayText_Risk of Violence
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,Intake,2013-01-01,1,7,-2.08,4,Low,New,1,0,20,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,Intake,2013-01-01,1,8,-1.06,2,Low,New,1,0,20,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,1992-12-05,22,Risk and Prescreen,Intake,2013-01-01,1,18,15.0,1,Low,New,1,0,20,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,1984-09-16,22,Risk and Prescreen,Intake,2013-01-01,1,7,-2.84,2,Low,New,1,0,28,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,1984-09-16,22,Risk and Prescreen,Intake,2013-01-01,1,8,-1.5,1,Low,New,1,0,28,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0


In [9]:
# Write the cleaned frame to disk as CSV. The DataFrame index is written too
# (pandas default), as the first, unnamed column of the file.
rawdf.to_csv(path_or_buf='data/clean_rawdata.csv')

In [10]:
# list every column name of the frame after dummy encoding
rawdf.columns

Index(['Person_ID', 'AssessmentID', 'Case_ID', 'Agency_Text', 'LastName',
       'FirstName', 'MiddleName', 'DateOfBirth', 'ScaleSet_ID', 'ScaleSet',
       'AssessmentReason', 'Screening_Date', 'RecSupervisionLevel', 'Scale_ID',
       'RawScore', 'DecileScore', 'ScoreText', 'AssessmentType', 'IsCompleted',
       'IsDeleted', 'AgeAtArrest', 'Sex_Code_Text_Female',
       'Sex_Code_Text_Male', 'Ethnic_Code_Text_African-American',
       'Ethnic_Code_Text_Arabic', 'Ethnic_Code_Text_Asian',
       'Ethnic_Code_Text_Caucasian', 'Ethnic_Code_Text_Hispanic',
       'Ethnic_Code_Text_Native American', 'Ethnic_Code_Text_Oriental',
       'Ethnic_Code_Text_Other', 'LegalStatus_Conditional Release',
       'LegalStatus_Deferred Sentencing', 'LegalStatus_Other',
       'LegalStatus_Parole Violator', 'LegalStatus_Post Sentence',
       'LegalStatus_Pretrial', 'LegalStatus_Probation Violator',
       'Language_English', 'Language_Spanish', 'CustodyStatus_Jail Inmate',
       'CustodyStatus_Parole