# Recidivism prediction model, optimized for fairness

First, load the appropriate libraries for the study: numpy, pandas, matplotlib, sklearn, nltk, re, scipy, tensorflow and keras to name a few.

In [2]:
#!{sys.executable} -m pip install keras
#!{sys.executable} -m pip install -U keras-tuner
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
%matplotlib inline
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import chi2_contingency, kruskal, f_oneway, normaltest, bartlett
import plotly.express as px

import sys

import kerastuner
from kerastuner import RandomSearch
import tensorflow as tf
from tensorflow import keras
from keras import layers

# Fix the NumPy and TensorFlow random seeds to stabilize variation in training and its evaluation
from numpy.random import seed
seed(1)
tf.random.set_seed(2)

# Download the NLTK 'stopwords' and 'wordnet' corpora.
# Optional on re-runs: these two calls can be commented out to save time.
nltk.download('stopwords')
nltk.download('wordnet')


ModuleNotFoundError: No module named 'kerastuner'

Then load the data. There are two dataset choices; this report works with the dataset of defendants assessed for the risk of reoffending within 2 years.

In [None]:
#DEFINE CONSTANTS and NAMES.
# Alternative dataset file, currently disabled:
# file1 = "compas-scores-raw.csv"
# Name of the CSV file for the two-year recidivism dataset.
file2 = "compas-scores-two-years.csv"

In [None]:
def create_dataframe_from_file(filename):
    '''
        Read a CSV file and return its rows as a shuffled dataframe.

        The shuffle samples 100% of the rows with a fixed random_state (32),
        so the same file always comes back in the same order.
    '''
    return pd.read_csv(filename).sample(frac=1, random_state=32)

Once the dataset is loaded into a dataframe, shuffle it. Take a look at the first few rows to understand the columns.

In [None]:
# Create the complete, shuffled dataframe from the file named by file2 and preview its first rows
full_df = create_dataframe_from_file(file2)
display(full_df.head())

Then use domain knowledge to determine which features are not useful and should be dropped, and which features would be more useful once processed. For example, turn the jail and custody in/out dates into durations. Then determine which of the two durations is longer and use that instead of both.

In [None]:
#Feature engineering
#Calculate duration between start and end date

def dateDiff(s_date, e_date):
    '''
        Return the number of days from s_date to e_date.

        Both arguments are strings that start with "YYYY-MM-DD"; anything
        after the first space in the day field (such as a time of day) is
        ignored. The result is negative when e_date is before s_date.
    '''
    start_parts = s_date.split("-")
    end_parts = e_date.split("-")

    def _as_date(parts):
        # The day field may carry a trailing " HH:MM:SS"; keep the day only.
        year, month = int(parts[0]), int(parts[1])
        day = int(parts[2].split(" ")[0])
        return date(year, month, day)

    start = _as_date(start_parts)
    end = _as_date(end_parts)
    return (end - start).days

def duration(id_c, s_date, e_date):
    '''
        Compute the duration in days between paired start and end dates.

        A row counts as incomplete when either its start or its end date is
        missing (string form "nan"). Incomplete rows are filled with the mean
        duration of the complete rows, truncated to an int (0 when no row is
        complete).

        Parameters:
            id_c:   row identifiers, indexable with 0 .. len(id_c) - 1.
            s_date: start dates as strings accepted by dateDiff, or NaN;
                    indexable with the same keys as id_c.
            e_date: end dates, same format and indexing as s_date.

        Returns:
            A list of [id, duration_in_days] pairs, one per row, in key
            order 0 .. len(id_c) - 1.
    '''
    def _is_missing(value):
        # A date is treated as missing when its string form is "nan".
        return str(value) == "nan"

    # Size the loops from the arguments rather than from the global dataframe.
    n_rows = len(id_c)

    # Single pass over the rows: a day count where both dates exist, None
    # where either is missing. Both dates are checked so that a row with only
    # one missing date is imputed instead of being passed to dateDiff.
    day_counts = []
    for row in range(n_rows):
        if _is_missing(s_date[row]) or _is_missing(e_date[row]):
            day_counts.append(None)
        else:
            day_counts.append(dateDiff(s_date[row], e_date[row]))

    # Average over the complete rows only, so the missing rows do not pull the
    # fill value towards zero.
    known = [days for days in day_counts if days is not None]
    fill_value = int(sum(known) / len(known)) if known else 0

    return [
        [id_c[row], fill_value if days is None else days]
        for row, days in enumerate(day_counts)
    ]

def max_duration(id_c, jail_duration, custody_duration):
    '''
        Pair each id with the longer of its jail and custody durations.

        Parameters:
            id_c:             row identifiers, indexable with
                              0 .. len(id_c) - 1.
            jail_duration:    jail durations, indexable with the same keys.
            custody_duration: custody durations, indexable with the same keys.

        Returns:
            A list of [id, max_duration] pairs, one per row, in key order
            0 .. len(id_c) - 1.
    '''
    # The row count comes from the arguments, not from the global dataframe,
    # so the function also works on inputs other than full_df's columns.
    return [
        [id_c[row], max([jail_duration[row], custody_duration[row]])]
        for row in range(len(id_c))
    ]

In [None]:
# Custody duration per id, computed from the in_custody / out_custody columns
cus_df = pd.DataFrame(
    duration(full_df['id'], full_df['in_custody'], full_df['out_custody']),
    columns=['id', 'custody_duration'],
)
display(cus_df.head())

In [None]:
# Jail duration per id, computed from the c_jail_in / c_jail_out columns
jail_df = pd.DataFrame(
    duration(full_df['id'], full_df['c_jail_in'], full_df['c_jail_out']),
    columns=['id', 'c_jail_duration'],
)
display(jail_df.head())

In [None]:
# Attach both durations to the main dataframe, matching rows on the id column
full_df['custody_duration'] = full_df['id'].map(
    cus_df.set_index('id')['custody_duration']
)
full_df['c_jail_duration'] = full_df['id'].map(
    jail_df.set_index('id')['c_jail_duration']
)
display(full_df.head())

In [None]:
# Longer of the jail and custody durations per id
max_df = pd.DataFrame(
    max_duration(full_df['id'], full_df['c_jail_duration'], full_df['custody_duration']),
    columns=['id', 'max_duration'],
)
display(max_df.head())

In [None]:
# Attach the maximum duration to the main dataframe, matching rows on the id column
full_df['max_duration'] = full_df['id'].map(
    max_df.set_index('id')['max_duration']
)
display(full_df.head())

Once this is done, drop all the columns that clearly aren’t useful and start the analysis on the numeric as well as the categorical features.

In [None]:
# Columns judged (from domain knowledge) to be unrelated to the prediction task.
# The two intermediate duration columns are listed last: only max_duration is kept.
out_features = [
    "id", "name", "first", "last", "age", "dob",
    "c_case_number", "decile_score", "decile_score.1",
    "priors_count", "priors_count.1", "score_text",
    "v_decile_score", "v_score_text", "violent_recid",
    "compas_screening_date", "c_jail_in", "c_jail_out",
    "c_offense_date", "c_arrest_date", "screening_date", "v_screening_date",
    "in_custody", "out_custody",
    "vr_case_number", "vr_charge_degree", "vr_offense_date", "vr_charge_desc",
    "r_case_number", "r_charge_degree", "r_days_from_arrest",
    "r_offense_date", "r_charge_desc",
    "type_of_assessment", "v_type_of_assessment",
    "r_jail_in", "r_jail_out",
    "days_b_screening_arrest", "c_days_from_compas",
    "event", "end",
    "custody_duration", "c_jail_duration",
]

full_df = full_df.drop(columns=out_features)

Using .describe(), we get a good overview of the numerical data and can quickly see the distribution. Use box plots to identify outliers. From this, we can also determine which columns are numerical features. Next, choose an appropriate quantile as the cut-off for outliers and then normalize the data. We choose to keep the outlier rows, since they are sensible, valid data, and cap their values at the cut-off instead of removing them. Then convert categorical features into one-hot vectors.

In [None]:
# Summary statistics of the dataframe, used to decide how to normalize the numerical features
full_df.describe()

In [None]:
# Columns handled as numeric features in the plots and the scaling step.
# NOTE(review): is_violent_recid looks like an outcome column rather than an
# input known at screening time - confirm it does not leak the label.
NUMERIC_FEATURES = [
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "is_violent_recid",
    "start",
    "max_duration",
]

# Column used as the prediction target and as the grouping axis of the plots.
LABEL = "two_year_recid"

# One box plot per numeric feature, split by label value, to inspect outliers.
for feature in NUMERIC_FEATURES:
    box_by_label = px.box(data_frame=full_df, x=LABEL, y=feature, title=feature)
    box_by_label.show()

In [None]:
def rescale(orig_val, min, max):
    '''
    Rescale a value to the 0 to 1 range relative to [min, max].

    Values above max are treated as outliers and capped at max, so they map
    to 1. Values below min are not capped and map to a negative number.

    When max equals min there is no range to scale by (for example when the
    chosen quantile of a mostly-zero column equals its minimum); 0.0 is
    returned in that case instead of dividing by zero.

    Parameters:
        orig_val: the value to rescale.
        min:      lower bound of the range (maps to 0).
        max:      upper bound of the range (maps to 1).

    Returns:
        The rescaled value.
    '''
    # `min` and `max` shadow the builtins; the names are kept because they
    # are part of the function's signature.
    if orig_val > max:
        orig_val = max
    span = max - min
    if span == 0:
        return 0.0
    return (orig_val - min) / span

# # This is a big transformation so make a new deep copy of the dataframe
# df_scaled = full_df.copy(deep=True)

# Min / max values determination.
# The lower bound of each column is its true minimum. The upper bound is its
# .99 or .95 quantile rather than its true maximum; rescale() caps larger
# values at that bound, so outlier rows are kept but their values are limited.
MIN_FEL_COUNT= full_df["juv_fel_count"].min()
MAX_FEL_COUNT = full_df["juv_fel_count"].quantile(0.99)
MIN_MISD_COUNT = full_df["juv_misd_count"].min()
MAX_MISD_COUNT = full_df["juv_misd_count"].quantile(0.95)
MIN_OTHER_COUNT = full_df["juv_other_count"].min()
MAX_OTHER_COUNT = full_df["juv_other_count"].quantile(0.95)
MIN_VIO_RECID = full_df["is_violent_recid"].min()
MAX_VIO_RECID = full_df["is_violent_recid"].quantile(0.95)
MIN_START = full_df["start"].min()
MAX_START = full_df["start"].quantile(0.95)
MIN_DURATION = full_df["max_duration"].min()
MAX_DURATION = full_df["max_duration"].quantile(0.95)

# Convert column by column: each numeric column is overwritten in place with
# its rescaled values, using the bounds computed above.
full_df["juv_fel_count"] = full_df["juv_fel_count"].apply(lambda x: rescale(x, MIN_FEL_COUNT, MAX_FEL_COUNT))
full_df["juv_misd_count"] = full_df["juv_misd_count"].apply(lambda x: rescale(x, MIN_MISD_COUNT, MAX_MISD_COUNT))
full_df["juv_other_count"] = full_df["juv_other_count"].apply(lambda x: rescale(x, MIN_OTHER_COUNT, MAX_OTHER_COUNT))
full_df["is_violent_recid"] = full_df["is_violent_recid"].apply(lambda x: rescale(x, MIN_VIO_RECID, MAX_VIO_RECID))
full_df["start"] = full_df["start"].apply(lambda x: rescale(x, MIN_START, MAX_START))
full_df["max_duration"] = full_df["max_duration"].apply(lambda x: rescale(x, MIN_DURATION, MAX_DURATION))

# Sanity check: preview the rows and recompute the summary statistics after scaling
display(full_df.head())
print("\n")
display(full_df.describe())

After investigation, we can see that the Asian and Native American populations are too small for effective training, and it would be hard to achieve fairness for these groups, so we make a conscious choice to remove them altogether.

In [None]:
# Drop the rows whose 'race' is 'Asian' or 'Native American' because of
# their smaller counts.
full_df = full_df[~full_df['race'].isin(['Asian', 'Native American'])]

# Split into two dataframes, one per candidate label: each copy keeps one
# label column and drops the other.
full_df_is_recid = full_df.drop(columns=["two_year_recid"])
full_df_two_year_recid = full_df.drop(columns=["is_recid"])

In [None]:
# Preview the dataframe that keeps is_recid as the label (two_year_recid dropped)
display(full_df_is_recid.head())

In [None]:
# Preview the dataframe that keeps two_year_recid as the label (is_recid dropped).
# We will work with this label since it yields a better relationship.
display(full_df_two_year_recid.head())

The most important feature is c_charge_desc, which describes the offense the defendants were charged with. Although this is very important data, it is written in an unprocessed, non-machine-friendly manner, in natural, abbreviated English. Attempts to process this feature programmatically using algorithms yield poor results and produce more edge cases, since there is no actual rule by which these phrases are structured. Manual work with elbow grease was applied here to turn a not-so-useful feature into a useful one.

In [None]:
# Create a dictionary to convert all the charge descriptions.
# Converts abbreviated or unintelligible phrases to understandable ones.
# All drug-related offenses are generalized to just "Drug" for better grouping.
convert = {'Possession of Cocaine': 'Possession of Drug',
'Battery': 'Battery',
'Felony DUI (level 3)': 'Felony DUI (level 3)',
'Criminal Mischief' : 'Criminal Mischief' ,
'Possession Of Heroin' : 'Possession Of drug',
'Felony Driving While Lic Suspd' : 'Felony Driving While License Suspended',
'Driving While License Revoked' : 'Driving While License Revoked',
'Grand Theft in the 3rd Degree' :'Grand Theft in the third Degree' ,
'arrest case no charge' : 'arrest case no charge',
'Possession Of Alprazolam' : 'Possession Of Drug',
'Pos Cannabis W/Intent Sel/Del': 'Possession of Drug',
'Resist/Obstruct W/O Violence' : 'Resist Officer Without Violence',
'DUI Level 0.15 Or Minor In Veh' : 'DUI Level 0.15 Or Minor In Vehicle',
'Aggravated Assault W/Dead Weap' : 'Aggravated Assault With Deadly Weapon',
'Susp Drivers Lic 1st Offense' : 'Operating With Suspended Driving License first Offense',
'Aggrav Battery w/Deadly Weapon' : 'Aggravated Assault With Deadly Weapon',
'Felony Petit Theft' : 'Felony Petit Theft', 
'Tampering With Physical Evidence' : 'Tampering With Physical Evidence' ,
'Burglary Structure Unoccup' : 'Burglary Unoccupied Dwelling',
'Unlaw LicTag/Sticker Attach' : 'Unlawful Sticker Attachment',
'Offer Agree Secure For Lewd Act' : 'Lewdness Violation',
'Burglary Unoccupied Dwelling' : 'Burglary Unoccupied Dwelling',
'Poss3,4 Methylenedioxymethcath' : 'Possession of drug',
'Driving License Suspended' : 'Driving License Suspended',
'Aggravated Assault w/Firearm' : 'Aggravated Assault With Deadly Weapon',
'False Imprisonment' : 'Falsely Imprisonment',
'Poss Of RX Without RX' : 'Possession of Controlled Substance Without Prescription',
'Defrauding Innkeeper $300/More' : 'Defrauding Innkeeper',
'Ride Tri-Rail Without Paying' : 'Ride Railroad Without Paying',
'Possession Of Methamphetamine' : 'Possession Of Drug',
'Petit Theft $100- $300' : 'Petit Theft $100- $300',
'Aggravated Battery / Pregnant' : 'Aggravated Battery with Pregnant',
'Leaving the Scene of Accident' : 'Leaving the Scene of Accident',
'Aggravated Assault W/dead Weap' : 'Aggravated Assault With Deadly Weapon',
'Resist Officer w/Violence' : 'Resist Officer With Violence',
'Grand Theft (Motor Vehicle)' : 'Grand Theft (Motor Vehicle)',    
'Stalking' : 'Stalking',                                          
'Felony Battery (Dom Strang)' : 'Felony Battery',
'Possess Cannabis/20 Grams Or Less': 'Possession of Drug',
'Driving Under The Influence' : 'Driving Under The Influence',    
'Carrying Concealed Firearm': 'Carrying Concealed Firearm',       
'Battery on Law Enforc Officer' : 'Battery on Law Enforcement Officer',
'Deliver Cannabis' : 'Delivery Drug',
'Burglary Conveyance Occupied' : 'Burglary Conveyance Occupied',
'Del Morphine at/near Park' : 'Delivery Drug',
'Leave Acc/Attend Veh/More $50' :  'Leaving Accident and Attended Vehicle with More $50',
'Poss Pyrrolidinovalerophenone' : 'Possession of Drug' ,
'Viol Injunct Domestic Violence' : 'Violation Injunction Domestic Violence', 
'Operating W/O Valid License' : 'Operating without Valid License' ,
'Disorderly Conduct' : 'Resist Officer Without Violence',
'Battery on a Person Over 65' : 'Battery one an elderly',
'Aggravated Battery' : 'Aggravated Battery' ,
'Trespass Struct/Conveyance' : 'Trespassing Conveyance',
'Possession of Cannabis' : 'Possession of Drug',
'Burglary Conveyance Assault/Bat' : 'Burglary Conveyance Assault or Battery', 
'Felony Battery w/Prior Convict' : 'Felony Battery with Prior Convicted',
'Petit Theft' : 'Petit Theft',
'Burglary Dwelling Occupied' : 'Burglary Dwelling Occupied',
'DUI Property Damage/Injury' : 'DUI Property Damage or Injury',
'Gambling/Gamb Paraphernalia' : 'Gambling Drug',
'Fighting/Baiting Animals' : 'Animal Abuse',
'False Ownership Info/Pawn Item' : 'Falsely Ownership Info or Pawn Item',
'Assault' : 'Assault',
'Manufacture Cannabis' : 'Manufacture Drug' ,
'Agg Battery Grt/Bod/Harm' : 'Aggravated Battery',
'Poss Of Controlled Substance' : 'Possession Of Controlled Substance',
'Poss of Cocaine W/I/D/S 1000FT Park' : 'Selling Drug',
'Att Burgl Unoccupied Dwel' : 'Burglary Unoccupied Dwelling',
'Del Cannabis For Consideration' : 'Possession of Drug',
'Aggravated Assault' : 'Aggravated Assault' ,
'Forging Bank Bills/Promis Note' : 'Uttering a Forged Instrument', 
'Burglary Structure Assault/Batt' : 'Burglary Conveyance Assault or Battery',
'Opert With Susp DL 2nd Offens' : 'Operating With Suspended Driving License second Offense',
'Robbery W/Firearm' : 'Robbery with Deadly Weapon',
'Cruelty Toward Child' : 'Child Abuse', 
'Fleeing or Eluding a LEO' : 'Fleeing or Eluding a Law Enforcement Officer',
'Disorderly Intoxication' : 'Disorderly Intoxication',
'Burglary Conveyance Unoccup' : 'Burglary Conveyance Unoccupied',
'Crim Use of Personal ID Info' : 'Criminal Use of Driver License Info',
'Poss Wep Conv Felon' : 'Possession of Weapon with convicted felony' ,
'Burglary Dwelling Armed' : 'Burglary Dwelling Armed',
'Possession Firearm School Prop' :  'Possession Firearm School Property', 
'Possession of Benzylpiperazine' : 'Possession of Drug',
'Cash Item w/Intent to Defraud' : 'Cash Item With Intention to Defrauding',
'Crimin Mischief Damage $1000+' : 'Criminal Mischief Damage $1000+',
'Unauth Poss ID Card or DL' : 'Unauthorized Possession of ID Card or Driving License',
'Trespass Private Property': 'Trespassing Private Property',
'Assault Law Enforcement Officer' : 'Assault Law Enforcement Officer',
'Fraudulent Use of Credit Card' : 'Fraudulent Use of Credit Card',
'Littering' : 'Littering',
'Poss Contr Subst W/o Prescript' : 'Possession of Controlled Substance Without Prescription',
'Restraining Order Dating Viol' : 'Restraining Order Dating Violence',
'Possession Burglary Tools' : 'Possession of Burglary Tools',
'Grand Theft of a Fire Extinquisher' : 'Grand Theft of a Fire Extinquisher',
'Fleeing Or Attmp Eluding A Leo' : 'Fleeing or Eluding a Law Enforcement Officer',
'Traffick Amphetamine 28g><200g' : 'Trafficking Drug 28g><200g',
'Agg Fleeing and Eluding' : 'Aggravated Fleeing and Eluding',
'Trespassing/Construction Site' : 'Trespassing Construction Site',
'Reckless Driving' : 'Reckless Driving' ,
'Agg Abuse Elderlly/Disabled Adult' : 'Aggravated Abuse Elderly or disabled Adult', 
'Dealing in Stolen Property' : 'Dealing in Stolen Property',
'Defrauding Innkeeper' : 'Defrauding Innkeeper',
'DUI/Property Damage/Persnl Inj' : 'DUI with Property Damage and Person Injury',
'Grand Theft Firearm' : 'Grand Theft with deadly weapon',
'Kidnapping / Domestic Violence' : 'Kidnapping and Domestic Violence',
'DUI - Enhanced' : 'DUI - Enhanced',
'Failure To Return Hired Vehicle' : 'Failure To Return Hired Property',
'Exposes Culpable Negligence' : 'Exposes Culpable Negligence',
'Opert With Susp DL 2ND Offense' : 'Operating With Suspended Driving License second Offense',
'Use of Anti-Shoplifting Device' : 'Possession of Anti-Shoplifting Device',
'Possession of Hydromorphone' : 'Possession of Drug',
'Uttering a Forged Instrument': 'Uttering a Forged Instrument',
'Stalking (Aggravated)' : 'Aggravated Stalking',
'Purchase Cannabis' : 'Purchase Drug',
'DWI w/Inj Susp Lic / Habit Off' :  'DUI with Injury Suspended Driver License Habit Offense',
'Lve/Scen/Acc/Veh/Prop/Damage' : 'DUI with Property Damage and Person Injury',
'Child Abuse' : 'Child Abuse',
'Tamper With Witness/Victim/CI' : 'Tampering With Witness or Victim or Confidential Informant', 
'Imperson Public Officer or Emplyee' : 'Impersonate Public Officer or Employee',
'Felony Batt(Great Bodily Harm)' : 'Aggravated Battery',
'Possession of Butylone' : 'Possession of Drug',
'Attempted Robbery  No Weapon' : 'Attempted Robbery Without Weapon',
'Felony Battery' : 'Felony Battery',
'Retail Theft $300 2nd Offense' : 'Retail Theft $300 second Offense',
'Burglary Dwelling Assault/Batt' : 'Burglary Dwelling Assault or Battery',
'Prowling/Loitering' : 'Prowling and Loitering', 
'Unlawful Conveyance of Fuel' : 'Unlawful Conveyance of Fuel',
'Del Cannabis At/Near Park' : 'Delivery Drug',
'Deliver Cocaine 1000FT Store' : 'Delivery Drug',
'Possession Of 3,4Methylenediox' : "Possession of drug",
'Deliver Cocaine' : 'Delivery Drug',
'Possession Of Amphetamine' : 'Possession Of Drug',
'Fabricating Physical Evidence' : 'Fabricating Physical Evidence',
'Conspiracy to Deliver Cocaine' : 'Delivery Drug',
'Possession of Morphine' : 'Possession of Drug',
'Neglect Child / Bodily Harm' : 'Child Abuse',
'Sex Offender Fail Comply W/Law' : 'Sexual Offender Failure to Comply with Law officer',
'Fail To Redeliver Hire Prop' : 'Failure to Return Hired Property',
'Simulation of Legal Process' : 'Simulation of Legal Process', 
'Robbery / Weapon' : 'Robbery with Deadly Weapon',
'Robbery Sudd Snatch No Weapon' : 'Robbery',
'Disrupting School Function' : 'Unlawful Disturb Education or Institute',
'Poss/Sell/Del Cocaine 1000FT Sch' : "Selling Drug",
'Lewd or Lascivious Molestation' : 'Lewdness Violation',
'Poss of Methylethcathinone' : 'Possession of drug',
'Robbery / No Weapon' : "Robbery",
'Grand Theft of the 2nd Degree' : 'Grand Theft of the second Degree',
'Possession Of Buprenorphine' : 'Possession Of Drug',
'Possession of Codeine' : 'Possession Of Drug',
'Criminal Mischief Damage <$200' : 'Criminal Mischief Damage <$200',
'Unlaw Use False Name/Identity' : 'Criminal Use of Falsely Identification',
'Viol Prot Injunc Repeat Viol' : "Violation Protect Injunction Repeat Violence",
'Theft' : 'Theft',
'Viol Pretrial Release Dom Viol' : "Domestic Violence",
'Corrupt Public Servant' : 'Corrupt Public officer',
'Burglary With Assault/battery' : 'Burglary With Assault or battery',
'Harm Public Servant Or Family' : 'Harm Public officer Or Family',
'Aggrav Stalking After Injunctn' : 'Aggravated Stalking After Injunction',
'Possess Drug Paraphernalia' : 'Possession of Drug',
'DUI- Enhanced' : 'DUI- Enhanced',
'Lewdness Violation' : 'Lewdness Violation',
'Possession of Hydrocodone' : 'Possession of Drug',       
'Lewd Act Presence Child 16-': 'Lewdness Violation', 
'Poss Cocaine/Intent To Del/Sel' : 'Selling Drug',
'Possession of Oxycodone' : 'Possession of Drug',
'Video Voyeur-<24Y on Child >16' : 'Voyeurism',
'Shoot Into Vehicle' : 'Aggravated Assault',
'Live on Earnings of Prostitute' : 'Live on Earnings of Prostitution',
'Armed Trafficking in Cannabis' : 'Armed Trafficking in Drug',
'Retail Theft $300 1st Offense' : 'Retail Theft $300 first Offense',
'Viol Injunction Protect Dom Vi' : 'Violation Injunction Protect Domestic Violence',
'Fel Drive License Perm Revoke' : 'Felony Driving License Permanently Revoked',
'Possess w/I/Utter Forged Bills': 'Uttering a Forged Instrument',
'Possession Of Carisoprodol' : 'Possession Of Drug',
'Unlicensed Telemarketing' : 'Unlicensed Telemarketing',
'Posses/Disply Susp/Revk/Frd DL' : 'Possession of Suspended Driving License',
'Att Burgl Struc/Conv Dwel/Occp' : 'Attempted Burglary Conveyance Occupied', 
'Possession Of Cocaine': 'Possession Of Drug',
'Obstruct Fire Equipment' : 'Obstruct Fire Equipment',
'Possession of Alcohol Under 21' : 'Possession of Alcohol Under 21',
'Possession of Methadone' : 'Possession of drug',
'Sale/Del Counterfeit Cont Subs' : 'Sale delivery Counterfeit Controlled Substance',
'Sex Battery Deft 18+/Vict 11-' : 'Sexual Battery',
'DWLS Canceled Disqul 1st Off' : 'Driving while license suspended canceled disqualified first offense',
'Purchase Of Cocaine' : 'Purchase Of Drug',
'Delivery of Heroin' : 'Delivery of drug',
'Crim Attempt/Solicit/Consp' : 'Criminal Attempted and Solicitation Conspiracy',
'Violation Of Boater Safety Id' : 'Violation Of Boater Safety Id',
'Flee/Elude LEO-Agg Flee Unsafe' : 'Fleeing or Eluding a Law Enforcement Officer',
'Poss Oxycodone W/Int/Sell/Del': 'Selling Drug',
'Grand Theft (motor Vehicle)' : 'Grand Theft (motor Vehicle)',
'Offer Agree Secure/Lewd Act' : 'Offer Agree Secure Lewdness Act',
'Depriv LEO of Protect/Communic' : 'Depriving Law Enforcement officer of means of protect or communication',
'Fraud Obtain Food or Lodging' : 'Fraudulent Obtain Food or Lodging',
'Sex Batt Faml/Cust Vict 12-17Y' : 'Sexual Battery family custodial Victim 12-17Y',
'Intoxicated/Safety Of Another' : 'Intoxication and Safety Of Another',
'DWLS Susp/Cancel Revoked' : 'Driving while license suspended canceled revoked',
'Aide/Abet Prostitution Lewdness' : 'Aiding and Abet Prostitution Lewdness',
'Solic to Commit Battery' : 'Solicitation to Commit Battery',
'Battery On Fire Fighter' : 'Battery On Fire Fighter',
'Shoot In Occupied Dwell' : 'Shoot In Occupied Dwelling',
'Voyeurism' :'Voyeurism' , 
'Compulsory Attendance Violation' : 'Compulsory Attended Violation', 
'Poss Of 1,4-Butanediol' : 'Possession Of Drug',
'Burgl Dwel/Struct/Convey Armed' : 'Burglary conveyance with Deadly Weapon',
'Poss Pyrrolidinovalerophenone W/I/D/S' : 'Possession Pyrrolidinovalerophenone with intention to distribute schedule',
'Interference with Custody' : 'Interference with Custody',
'Deliver Alprazolam' : 'Delivery Drug',
'Attempted Robbery Firearm' : "Attempted Robbery With Deadly Weapon",
'Trespass Other Struct/Conve' : 'Trespassing Other Conveyance',
'Carjacking with a Firearm' : 'Grand Theft with Deadly weapon',
'False 911 Call' : 'Falsely 911 Call',
'Conspiracy Dealing Stolen Prop' : 'Conspiracy Dealing Stolen Property',
'Failure To Pay Taxi Cab Charge' : 'Failure To Paying Taxi Cab Charge',
'Tresspass in Structure or Conveyance' : 'Trespassing in Conveyance',
'Sexual Performance by a Child' : "Child Pornography",
'Trespass Struct/Convey Occupy' : 'Trespassing Conveyance Occupied',
'Violation of Injunction Order/Stalking/Cyberstalking' : 'Violation of Injunction Order or Stalking or Cyberstalking',
'Sell/Man/Del Pos/w/int Heroin' : 'selling manufacture delivery possession with intention drug',
'Exhibition Weapon School Prop' : 'Exhibition Weapon School Property',
'Prostitution/Lewdness/Assign' : 'Prostitution or Lewdness or Assignation',
'Solicit To Deliver Cocaine' : 'Solicitation To Delivery Drug',
'Escape' : 'Escape',
'Carjacking w/o Deadly Weapon' : 'Carjacking Without Deadly Weapon',
'Aggravated Battery On 65/Older' : 'Aggravated Battery On Elderly',
'Crim Attempt/Solic/Consp' : 'Criminal Attempted, Solicitation Conspiracy',
'Traffic Counterfeit Cred Cards' : 'Trafficking Counterfeit Credit Card',
'Trans/Harm/Material to a Minor' : 'Transmission material harmful to a minor',
'Giving False Crime Report' : 'Contradict Statement',
'Abuse Without Great Harm' : 'Abuse Without Great Harm', 
'Poss Alprazolam W/int Sell/Del' : 'Selling Drug',
'Structuring Transactions' : 'Structuring Transactions',
'Purchase/P/W/Int Cannabis' : 'purchase with intention Drug',
'False Name By Person Arrest' : 'Contradict Statement',
'Unauth C/P/S Sounds>1000/Audio' : 'Unauthorized C/P/S',
'Possession Of Clonazepam' : 'Possession Of Drug',
'False Info LEO During Invest' : 'Falsely Info law enforcement officer During Investigation',
'Money Launder 100K or More Dols' : 'Money Launder 100K or More Dollars',
'Carry Open/Uncov Bev In Pub' : 'Carrying Open Uncovered Beverage In Public',
'Possession Of Anabolic Steroid' : 'Possession Of Anabolic Steroid',
'Crim Use Of Personal Id Info' : 'Criminal Use Of Driver License Info',
'Criminal Attempt 3rd Deg Felon' : 'Criminal Attempted third Degree felony',
'Sell Cannabis' : 'Selling Drug',
'Possession of Cannabis' : 'Possession of Drug',
'Unlaw Lic Use/Disply Of Others' : 'Unlawful Driver License Use and Display Of Others',
'Use Computer for Child Exploit' : 'Child Pornography',
'Attempted Deliv Control Subst' : 'Attempted and Delivery Controlled Substance',
'Tampering with a Victim' : 'Tampering with a Victim',
'Obstruct Officer W/Violence' : 'Resist Officer with Violence',
'Consume Alcoholic Bev Pub' : 'Consume Alcohol Beverage Public',
'Sexual Battery / Vict 12 Yrs +' : 'Sexual Battery with Victim 12Y',
'Possession Of Fentanyl' : 'Possession Of Drug',
'Del 3,4 Methylenedioxymethcath' : 'Delivery drug',
'Unlawful Use Of Police Badges' : 'Unlawful Use Of Police Badges',
'Battery Spouse Or Girlfriend' : 'Battery Spouse Or Girlfriend',
'Deliver Cocaine 1000FT School' : 'Delivery Drug School',
'False Bomb Report' : 'Falsely Bomb Report',
'Computer Pornography' : 'Computer Pornography',
'Use Of 2 Way Device To Fac Fel' : 'Use Of 2 Way Device To Face Felony',
'Possession Of Lorazepam' : 'Possession Of Drug',
'Robbery W/Deadly Weapon' : 'Robbery With Deadly Weapon',
'Attempt Armed Burglary Dwell' : 'Attempted Armed Burglary Dwelling',
'Insurance Fraud' : 'Insurance Fraudulent',
'Possess Mot Veh W/Alt Vin #' : 'Possession of Motor Vehicle With Alternative Number',
'Delivery of 5-Fluoro PB-22' : 'Delivery of Drug',
'Deliver Cocaine 1000FT Park' : 'Delivery Drug',
'Arson II (Vehicle)' : 'Arson (Vehicle)',
'Possession Of Paraphernalia' : 'Possession Of Drug',
'Contradict Statement' : 'Contradict Statement', 
'Consp Traff Oxycodone 28g><30k' : 'Conspire Trafficking Drug',
'Possession of XLR11' : 'Possession of XLR11', 
'Unauthorized Interf w/Railroad' : 'Unauthorized Interference with Railroad',
'Counterfeit Lic Plates/Sticker' : 'Counterfeit License Plates or Sticker',
'Possess Cannabis 1000FTSch' : 'Possession of Drug',
'Aggress/Panhandle/Beg/Solict' : 'Aggressive Panhandling', 
'Poss Trifluoromethylphenylpipe' : 'Possession of Drug',
'Murder In 2nd Degree W/firearm' : 'Murder In Second Degree With Firearm',
'Traffick Hydrocodone   4g><14g' : 'Trafficking Drug',
'Principal In The First Degree' : 'Principal In The first Degree',
'Deliver Cocaine 1000FT Church' : 'Delivery Drug',
'Att Burgl Conv Occp' : 'Attempted Burglary Conveyance Occupied',
'Unl/Disturb Education/Instui' : 'Unlawful Disturb Education or Institute',
'Possession Of Diazepam' : 'Possession Of Diazepam',
'Interfere W/Traf Cont Dev RR' : 'Interference With Trafficking Controlled Device railroad',
'Deliver Cannabis 1000FTSch' : 'Delivery Drug School',
'Possession of LSD' : 'Possession of LSD', 
'Lewd/Lasciv Molest Elder Persn' : 'Lewdness Violation',
'Trespass On School Grounds':  'Trespassing On School Grounds',
'Throw In Occupied Dwell': 'Throw In Occupied Dwelling',
'Fail Sex Offend Report Bylaw' : 'Failure Sexual Offend Report By Law',
'Manslaughter W/Weapon/Firearm' : 'Manslaughter With Deadly Weapon', 
'Throw Missile Into Pub/Priv Dw' :'Throw Missile Into Public or Private Driveway',
'Present Proof of Invalid Insur' : 'Present Proof of Invalid Insurance',
'Theft/To Deprive' : 'Theft',
'Poss Unlaw Issue Driver Licenc' : 'Possession of Unlawful Issuing Driver License' ,
'Extradition/Defendants' : 'Extradition Defendants',
'Tamper With Victim' : 'Tampering With Victim',
'Neglect Child / No Bodily Harm' : 'Child Abuse',
'Poss 3,4 MDMA (Ecstasy)' : 'Possession of drug', 
'Violation License Restrictions' : 'Violation License Restrictions' , 
'Criminal Mischief>$200<$1000' : 'Criminal Mischief>$200<$1000',
'Felon in Pos of Firearm or Amm' : 'Possession of Deadly Weapon',
'Culpable Negligence' : 'Culpable Negligence' ,
'Uttering Forged Bills' : 'Uttering Forged Bills', 
'Possess Countrfeit Credit Card' : 'Possession of Counterfeit Credit Card',
'Leaving Acc/Unattended Veh' : 'Leaving Accident with Unattended Vehicle', 
'Hiring with Intent to Defraud' : 'Defrauding Hired',
'Sell or Offer for Sale Counterfeit Goods' : 'Selling Counterfeit Goods' ,
'Refuse to Supply DNA Sample' :'Refuse to Supply DNA Sample',
'Felony/Driving Under Influence' :'Felony and Driving Under Influence',
'DUI Blood Alcohol Above 0.20' : 'DUI Blood Alcohol Above 0.20' ,
'Aggravated Battery (Firearm/Actual Possession)' : 'Aggravated Battery',
'Exploit Elderly Person 20-100K' : 'Exploit Elderly Person 20-100K', 
'Soliciting For Prostitution' : 'Solicitation For Prostitution',
'Battery Emergency Care Provide': 'Battery Emergency Care Provide',
'Attempted Burg/struct/unocc' : 'Attempted Burglary Conveyance unoccupied', 
'Drivg While Lic Suspd/Revk/Can' : 'Driving while driver license Suspended', 
'Aggravated Assault W/o Firearm' : 'Aggravated Assault Without Weapon',
'Fail To Obey Police Officer' : 'Resist Officer Without Violence',
'Poss F/Arm Delinq' : 'Possession of Firearm Delinquency',
'Open Carrying Of Weapon' : 'Open Carrying Of Weapon',
'Aggrav Child Abuse-Agg Battery' : 'Aggravated Child Abuse',
'D.U.I. Serious Bodily Injury' : 'D.U.I. Serious Bodily Injury',
'Strong Armed  Robbery' : 'Strong Armed  Robbery', 
'Accessory After the Fact' : 'Accessory After the Fact',
'Burglary Assault/Battery Armed' : 'Burglary or Assault or Battery with Deadly Weapon',
'Deliver 3,4 Methylenediox' : 'Delivery drug',
'Att Tamper w/Physical Evidence' : 'Attempted Tampering with Physical Evidence',
'Lewd/Lasc Battery Pers 12+/<16' : 'Lewdness violation',
'Expired DL More Than 6 Months' :"Operating with invalid Driver license",
'Discharge Firearm From Vehicle' :'shoot Firearm From Vehicle', 
'Solicitation On Felony 3 Deg' : 'Solicitation On Felony third Degree',  
'Poss/Sell/Deliver Clonazepam' : 'Selling Clonazepam',
'Traffick Oxycodone     4g><14g' : 'Trafficking Drug 4g><14g' ,
'Fail Register Vehicle' : 'Failure Register Vehicle', 
'Grand Theft Dwell Property' : 'Grand Theft Dwelling Property',
'Felony Committing Prostitution' : 'Prostitution',
'Prostitution/Lewd Act Assignation' : 'Prostitution/Lewdness Act Assignation', 
'Solicit Deliver Cocaine' : 'Delivery Drug',
'Poss of Vessel w/Altered ID NO' : 'Possession of Vessel with Alternative ID number',
'Threat Public Servant' : 'Threat Public officer',
'Poss/Sell/Del/Man Amobarbital' : 'Selling Drug', 
'Use Scanning Device to Defraud' : 'Use Scanning Device to Defrauding', 
'Poss Drugs W/O A Prescription' : 'Possession of Controlled Substance Without Prescription',
'False Motor Veh Insurance Card' : 'Falsely Motor Vehicle Insurance Card' ,
'Poss Meth/Diox/Meth/Amp (MDMA)' : 'Possession of drug', 
'Burglary Conveyance Armed' :'Burglary Conveyance with Deadly Weapon',
'Aiding Escape' : 'Aiding Escape' , 
'PL/Unlaw Use Credit Card' : 'Unlawful Use Credit Card',
'Carrying A Concealed Weapon' : 'Carrying A concealed weapon',
'Introduce Contraband Into Jail' : 'Introduce Contraband Into Jail',
'Lease For Purpose Trafficking' : 'Lease For Purpose Trafficking',
'Grand Theft in the 1st Degree' : 'Grand Theft in the first Degree',
'Grand Theft on 65 Yr or Older' : 'Grand Theft on Elderly' ,
'Trespass Structure w/Dang Weap' : 'Trespassing Conveyance with Deadly Weapon',
'Murder in 2nd Degree' : 'Murder in second Degree',
'Poss Anti-Shoplifting Device' : 'Possession of Anti-Shoplifting Device',
'Attempt Burglary (Struct)' : 'Attempted Burglary ',
'Attempted Robbery  Weapon' : 'Attempted Robbery with Weapon', 
'Agg Assault Law Enforc Officer' : 'Aggravated Assault Law Enforcement Officer',
'Tamper With Witness' : 'Tampering With Witness',
'Aggravated Battery (Firearm)' : 'Aggravated Battery with Deadly Weapon',
'Traff In Cocaine <400g>150 Kil' : 'Trafficking In Drug',
'Tresspass Struct/Conveyance' : 'Trespassing Conveyance' ,
'Poss Firearm W/Altered ID#' : 'Possession of Firearm with Alternative ID number',
'Throw Deadly Missile Into Veh' : 'Throw Deadly Missile Into Vehicle',
'Poss Unlaw Issue Id' : 'Possession of Unlawful Issuing Driver License',
'Fail To Redeliv Hire/Leas Prop' : 'Failure to return hired property',
'Cruelty to Animals' : 'Animal abuse', 
'nan' : 'Unknown',
'Misuse Of 911 Or E911 System' : "Falsely 911 call",
'Crlty Twrd Child Urge Oth Act' : "Child Abuse",
'Possession of Ethylone' : 'Possession of Drug',
'Attempted Burg/Convey/Unocc' : 'Attempted Burglary Conveyance Unoccupied', 
'Poss of Firearm by Convic Felo' : 'Possession of Firearm by Convicted Felony',
'Obtain Control Substance By Fraud' : 'Obtain Controlled Substance By Fraudulent',
'Fail Sex Offend Report Bylaw' : 'Failure Sexual Offender Report By law',
'DOC/Cause Public Danger' : 'DOC and Cause Public Dangerous',
'Contribute Delinquency Of A Minor' : 'Contribute Delinquency Of A Minor',
'Trespass Structure/Conveyance': 'Trespassing Conveyance', 
'Poss Counterfeit Payment Inst' : 'Possession of Counterfeit Paying Instrument',
'Poss Cntrft Contr Sub w/Intent' : 'Possession of Counterfeit Controlled Substance',
'Pos Methylenedioxymethcath W/I/D/S' : 'Selling drug', 
'Poss Tetrahydrocannabinols' : "Possession of Drug",
'License Suspended Revoked' : 'License Suspended Revoked',
'Battery On A Person Over 65' : 'Battery On A Person Over 65',
'Trespass Property w/Dang Weap' : 'Trespassing Property With Dangerous Weapon', 
'Consp Traff Oxycodone  4g><14g' : 'Trafficking Drug  4g><14g',
'Agg Fleeing/Eluding High Speed' : 'Aggravated Fleeing or Eluding High Speed',
'Aggr Child Abuse-Torture,Punish' : 'Aggravated Child Abuse',
'Bribery Athletic Contests' : 'Bribery Athletic Contests',
'Purchasing Of Alprazolam' : 'Purchase Drug',
'Del of JWH-250 2-Methox 1-Pentyl' : 'Delivery Drug',
'Dealing In Stolen Property' : 'Dealing In Stolen Property',
'Pos Cannabis For Consideration' : 'Possession of Drug',
'Sel Etc/Pos/w/Int Contrft Schd' : 'Selling controlled substance',
'Murder in the First Degree' : 'Murder in the First Degree',
'Alcoholic Beverage Violation-FL' : 'Alcohol Beverage Violation',
'Uttering Worthless Check +$150' : 'Uttering Worthless Check +$150',
'Burglary Structure Occupied' : 'Burglary Conveyance Occupied',
'Battery On Parking Enfor Speci' : 'Battery',
'Refuse Submit Blood/Breath Test' : 'Refuse Submit Blood/Breath Test',
'Oper Motorcycle W/O Valid DL' : 'Operating Motor Without Valid Driver License',
'Possession Of Phentermine' : 'Possession Of Drug',
'Possession Child Pornography' : 'Child Pornography',
'DUI - Property Damage/Personal Injury' : 'DUI - Property Damage and Person Injury' ,
'Unemployment Compensatn Fraud' : 'Unemployment Compensation Fraudulent',
'Felony DUI - Enhanced' : 'DUI - Enhanced',
'Fail Obey Driv Lic Restrictions' : 'Failure to Obey Driver License Restrictions',
'Issuing a Worthless Draft' : 'Issuing a Worthless Draft',
'Sel/Pur/Mfr/Del Control Substa' : 'Selling Controlled substance', 
'Grand Theft In The 3Rd Degree' : 'Grand Theft In The third Degree',
'Harass Witness/Victm/Informnt' : 'Harass Witness or Victim of Confidential Informant',
'Uttering Forged Credit Card' : 'Uttering Forged Credit Card',
'Leave Accd/Attend Veh/Less $50' : 'Leaving accident with attended Vehicle less than $50',
'Lewd/Lasc Exhib Presence <16yr' : 'Lewdness Violation',
'Poss Pyrrolidinobutiophenone': 'Possession of drug',
'Offn Against Intellectual Prop': 'Offense against Intellectual Property',
'Manage Busn W/O City Occup Lic' : 'Manage Business without city occupation license', 
'Arson in the First Degree' : 'Arson first degree',
'Sound Articles Over 100' : 'Sound Articles Over 100',
'Falsely Impersonating Officer' :'Falsely Impersonate Officer',
'Poss Similitude of Drivers Lic' : 'Possession of unlawful Driver License', 
'Sale/Del Cannabis At/Near Scho' : 'Selling Drug',
'Poss/pur/sell/deliver Cocaine' : 'Selling Drug',
'Fail Register Career Offender' : 'Failure Register Career Offender',
'Sell Conterfeit Cont Substance' : 'Selling Counterfeit Controlled Substance',
'Aggrav Child Abuse-Causes Harm' : 'Aggravated Child Abuse',
'Possess Controlled Substance' : 'Possession of Controlled Substance',
'Neglect/Abuse Elderly Person' : "Elderly Abuse",
'Fail To Secure Load' : 'Failure To Secure Load',
'Compulsory Sch Attnd Violation' : 'Compulsory School Attended Violation',
'Solicit Purchase Cocaine' : 'Solicitation Purchase Drug',
'Possess Weapon On School Prop' : 'Possession Weapon On School Property',
'Possess/Use Weapon 1 Deg Felon' : 'Possession of Weapon first Degree Felony',
'Cause Anoth Phone Ring Repeat' : 'Cause Another Phone Ring Repeat',
'Prostitution' : 'Prostitution',
'Possess Tobacco Product Under 18' : 'Possession of Tobacco Product Under 18',
'Agg Assault W/int Com Fel Dome' : 'Aggravated Assault With intention to commit Domestic Violence'}

The normalized descriptions are then processed as natural language: non-alphabetic characters and stop words are removed and the text is lower-cased. Each short description is then tokenized into a list of keywords for one-hot encoding.

In [None]:
# Create a dictionary of key words from charge description
# NOTE(review): dict_desc and convert_values are not used in this cell; they are
# kept in case other cells reference them.
dict_desc = []
convert_values = list(convert.values())
# One vectorizer is enough: every fit() call replaces its vocabulary_.
cv = CountVectorizer(stop_words='english')
for key, item in convert.items():
    # After a first run the values are already keyword lists; skipping them lets
    # the cell be re-run without re.sub() failing on a list.
    if not isinstance(item, str):
        continue
    # Keep letters only, lower-case, and split into one-word "documents".
    item1 = re.sub('[^a-zA-Z]', ' ', item)
    tmp = item1.lower().split()
    # fit() learns the vocabulary (English stop words removed); the
    # document-term matrix itself is never needed here.
    cv.fit(tmp)
    vocab = list(cv.vocabulary_.keys())
    # Only the value of an existing key is replaced, which is safe while iterating.
    convert[key] = vocab
print(convert)


In [None]:
# Swap every raw charge description for its keyword list, then discard the rows
# whose description has no entry in `convert` (those map to NaN).
def _tokenize_charge_desc(frame):
    '''
        Map c_charge_desc through `convert` in place and return the frame
        without the rows that could not be mapped.
    '''
    frame['c_charge_desc'] = frame['c_charge_desc'].map(convert)
    return frame.dropna(subset=['c_charge_desc'])


full_df_is_recid = _tokenize_charge_desc(full_df_is_recid)

full_df_two_year_recid = _tokenize_charge_desc(full_df_two_year_recid)

In [None]:
#TWO_YEAR_RECID
# One-hot encode the keyword lists: one sparse indicator column per keyword,
# placed next to the two_year_recid label.
mlb = MultiLabelBinarizer(sparse_output=True)

# pop() also removes the keyword-list column from the main frame.
charge_tokens = full_df_two_year_recid.pop('c_charge_desc')
label_frame = full_df_two_year_recid["two_year_recid"].to_frame()

token_matrix = mlb.fit_transform(charge_tokens)
token_frame = pd.DataFrame.sparse.from_spmatrix(
    token_matrix,
    index=label_frame.index,
    columns=mlb.classes_)
charge_label = label_frame.join(token_frame)

display(charge_label.head())

Run a chi-square test of independence on each column of this one-hot matrix to determine which features/words to keep. The significance threshold is 0.05, although a looser threshold of up to 0.1 could also be used. With the 0.05 threshold, we found 29 features worth training on.

In [None]:
# Chi-square screening of the charge-description keywords: a keyword is kept
# when its contingency table against the label gives p < SIG, and put on the
# "maybe" list when SIG <= p < MOD_SIG.
LABEL = "two_year_recid"
SIG = 0.05
MOD_SIG = 0.1

cols = charge_label.columns.to_list()
cols.remove(LABEL)

feat_keep = []
feat_maybe = []

for word in cols:
    contingency = pd.crosstab(charge_label[LABEL], charge_label[word])
    display(contingency)
    _chi2, p, _dof, _expected = chi2_contingency(contingency)

    if p < SIG:
        feat_keep.append(word)
        print(word, "and label are not independent - keep, p =", p)
        continue

    if p < MOD_SIG:
        feat_maybe.append(word)
        print(word, "and label may have some relationship - maybe keep, p =", p)
        continue

    print(word, "and label are independent - drop, p =", p)

In [None]:
# Number of keywords that passed the 0.05 threshold, then the keywords themselves.
print(len(feat_keep), feat_keep, sep="\n")

In [None]:
# Number of borderline keywords (SIG <= p < MOD_SIG), then the keywords themselves.
print(len(feat_maybe), feat_maybe, sep="\n")

In [None]:
# feat_keep = feat_keep + feat_maybe + ['two_year_recid']
# Add the label to the selected columns. The membership guard keeps the cell
# re-runnable: without it every extra run appends the label again and the
# column selection that follows would return duplicate label columns.
if 'two_year_recid' not in feat_keep:
    feat_keep = feat_keep + ['two_year_recid']
print(feat_keep)

In [None]:
# Keep only the selected keyword columns (plus the label) and show the new shape.
charge_label = charge_label.loc[:, feat_keep]
charge_label.shape

In [None]:
# Copy every retained charge-keyword column (and the label) from charge_label
# into full_df_two_year_recid; rows are aligned on the shared index.
for name, values in charge_label.items():
    full_df_two_year_recid[name] = values
display(full_df_two_year_recid.head())

In [None]:
# Print the (rows, columns) shape of the two-year frame.
print(full_df_two_year_recid.shape)

In [None]:
# Apply one hot encoding for the categorical features: sex, age_cat, race and
# c_charge_degree.
# A list is used instead of a set: get_dummies indexes the frame with `columns`,
# and a set gives a run-dependent column order (recent pandas releases also
# reject set indexers outright).
categorical = ["sex", "age_cat", "race", "c_charge_degree"]
full_df_two_year_recid = pd.get_dummies(full_df_two_year_recid, columns=categorical)
# Show the frame that was just encoded (the original displayed full_df instead).
display(full_df_two_year_recid.head())

In [None]:
# Print the (rows, columns) shape of the two-year frame.
print(full_df_two_year_recid.shape)

In [None]:
#full_df_two_year_recid.corr(method="spearman")

Then use Spearman's rank correlation to measure how the remaining features correlate with the label; none of them is correlated strongly enough to be considered redundant and dropped.

In [None]:
# Spearman rank correlation of every column with the two_year_recid label.
spearman_matrix = full_df_two_year_recid.corr(method="spearman")
list_corr = spearman_matrix['two_year_recid']
print(list_corr)

In [None]:
# Every column currently in the frame (the label column included).
two_year_feat = list(full_df_two_year_recid.columns)
print(two_year_feat)

Then check the features against the assumptions of ANOVA. None of the features is normally distributed, so the non-parametric Kruskal-Wallis test is the appropriate way to determine which features to keep; the one-way ANOVA (`f_oneway`) is also run for comparison. Both analyses suggest keeping all current features.

In [None]:
# Check the ANOVA assumptions per column: normality of both outcome groups
# (normaltest) and, when both look normal, equal variances (bartlett).
LABEL = "two_year_recid"
SIG = 0.05
MOD_SIG = 0.1

# Rows with label 0 go to df_fail_two_year, rows with label 1 to df_pass_two_year.
label_values = full_df_two_year_recid[LABEL]
df_fail_two_year = full_df_two_year_recid[label_values == 0]
df_pass_two_year = full_df_two_year_recid[label_values == 1]

for feature in two_year_feat:
    group0 = df_fail_two_year[feature]
    group1 = df_pass_two_year[feature]
    p1 = normaltest(group0).pvalue
    p2 = normaltest(group1).pvalue

    # Either group failing the normality test rules out ANOVA for this column.
    if not (p1 > SIG and p2 > SIG):
        print(feature, "--> Kruskal-Wallis, not normally distributed:", p1, p2)
        continue

    p = bartlett(group0, group1).pvalue
    if p > SIG:
        print(feature, "meets ANOVA assumptions")
    else:
        print(feature, "--> Kruskal-Wallis, variance is unequal:", p)

In [None]:
# Kruskal-Wallis test per column between the label-0 and label-1 groups; the
# printed verdict depends on where p falls relative to SIG and MOD_SIG.
for feature in two_year_feat:
    p = kruskal(df_fail_two_year[feature], df_pass_two_year[feature]).pvalue

    if p <= SIG:
        verdict = "and label are not independent - keep, p ="
    elif p <= MOD_SIG:
        verdict = "and label may have some relationship - maybe keep, p ="
    else:
        verdict = "and label are independent - drop, p ="

    print(feature, verdict, p)

In [None]:
# One-way ANOVA (f_oneway) per column between the label-0 and label-1 groups.
# Columns with p <= SIG are collected in new_two_year_feat.
# NOTE(review): this selection relies on ANOVA although the normality check did
# not support its assumptions - confirm the list matches the Kruskal-Wallis result.
new_two_year_feat = []
for feature in two_year_feat:
    p = f_oneway(df_fail_two_year[feature], df_pass_two_year[feature]).pvalue

    if p <= SIG:
        new_two_year_feat.append(feature)
        verdict = "and label are not independent - keep, p ="
    elif p <= MOD_SIG:
        verdict = "and label may have some relationship - maybe keep, p ="
    else:
        verdict = "and label are independent - drop, p ="

    print(feature, verdict, p)

In [None]:
# Number of columns kept by the f_oneway screening, then their names.
print(len(new_two_year_feat), new_two_year_feat, sep="\n")

We then split the data into features and label, holding out 20% as the test set and splitting the remainder 80/20 into training and validation sets, in preparation for training and evaluation.

In [None]:
# Separate the predictors from the target.
# Filtering (instead of list.remove) keeps the cell re-runnable: remove() raises
# ValueError as soon as the label is no longer in the list.
new_two_year_feat = [feat for feat in new_two_year_feat if feat != "two_year_recid"]
df_1_x = full_df_two_year_recid[new_two_year_feat]
df_1_y = full_df_two_year_recid[LABEL]

In [None]:
# TWO_YEAR_RECID
# Positional split: the first 20% of the rows are the test set; of the
# remaining rows, the first 20% are validation and the rest is training.
#df_shuffled_two_year_recid = full_df_two_year_recid.sample(frac=1, random_state=32).reset_index(drop=True)
df_shuffled_two_year_recid = full_df_two_year_recid

test_size_two_year_recid = int(len(df_shuffled_two_year_recid) * 0.2)
df_test_two_year_recid, df_train_val_two_year_recid = (
    df_shuffled_two_year_recid[:test_size_two_year_recid],
    df_shuffled_two_year_recid[test_size_two_year_recid:],
)

val_size_two_year_recid = int(len(df_train_val_two_year_recid) * 0.2)
df_val_two_year_recid, df_train_two_year_recid = (
    df_train_val_two_year_recid[:val_size_two_year_recid],
    df_train_val_two_year_recid[val_size_two_year_recid:],
)


def _features_and_label(frame):
    '''
        Return (selected feature columns, label column) of the given frame.
    '''
    return frame[new_two_year_feat], frame[LABEL]


df_1_x, df_1_y = _features_and_label(df_train_two_year_recid)
x_val, y_val = _features_and_label(df_val_two_year_recid)
x_test, y_test = _features_and_label(df_test_two_year_recid)

To determine the hyperparameters for our data, we use Keras Tuner to search over combinations of the number of hidden layers and the number of nodes in each layer.

Define our search space to be max of 2 layers and between 4 and 140 nodes each.

In [None]:
def tune_model1(hp):
    '''
        Model-building function for Keras Tuner.
        Samples 1 or 2 hidden ReLU layers, each with 4 to 140 units (step 4),
        adds a single sigmoid output unit and compiles the model with adam and
        binary cross-entropy. Accuracy and the four confusion-matrix counts
        are tracked as metrics.
    '''
    tracked_metrics = ["accuracy", "TruePositives", "TrueNegatives",
                       "FalsePositives", "FalseNegatives"]

    model = keras.Sequential()
    depth = hp.Int("num_layers", min_value=1, max_value=2, step=1)
    for idx in range(depth):
        width = hp.Int(f"units_{idx}", min_value=4, max_value=140, step=4)
        model.add(layers.Dense(units=width, activation="relu"))

    model.add(layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=tracked_metrics)
    return model

In [None]:
# Random search over tune_model1's search space: 10 trials, each trained 3
# times, ranked by validation accuracy.
tuner_acc1 = RandomSearch(
    tune_model1,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=3,
    project_name="objective_1",
)

# tuner_acc1.search_space_summary()

# Stop a run once val_loss has not improved for 3 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

tuner_acc1.search(
    df_1_x.values,
    df_1_y.values,
    validation_data=(x_val.values, y_val.values),
    epochs=100,
    batch_size=512,
    callbacks=[early_stop],
)

In [None]:
# Print the tuner's summary of the search results.
tuner_acc1.results_summary()

In [None]:
def best_model(layer_info):
    '''
        Build and compile a binary classifier.
        layer_info is a list with the number of nodes per hidden layer; every
        entry becomes one ReLU Dense layer, followed by a single sigmoid output
        unit. The model is compiled with adam, binary cross-entropy, accuracy
        and the four confusion-matrix counts.
    '''
    tracked_metrics = ["accuracy",
                       "TruePositives", "TrueNegatives",
                       "FalsePositives", "FalseNegatives"]

    model = keras.Sequential()
    for width in layer_info:
        model.add(layers.Dense(width, activation="relu"))

    model.add(layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=tracked_metrics)
    return model

We found that 84 nodes in the first hidden layer and 136 nodes in the second hidden layer work best.

Compile model then train.

In [None]:
# Train the selected architecture (84 and 136 hidden units); training stops
# early once val_loss has not improved for 3 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

best_model_acc1 = best_model([84, 136])
best_model_acc1.fit(
    df_1_x.values,
    df_1_y.values,
    validation_data=(x_val.values, y_val.values),
    epochs=100,
    batch_size=512,
    callbacks=[early_stop],
)

In [None]:
# Evaluate the trained model on the full held-out test split.
best_model_acc1.evaluate(x_test.values, y_test.values)

In [None]:
# Evaluate on the African-American rows of the test split only.
black_rows = df_test_two_year_recid["race_African-American"] == 1
y_test_b = df_test_two_year_recid.loc[black_rows, LABEL]

x_test_b = x_test[x_test["race_African-American"] == 1]
best_model_acc1.evaluate(x_test_b.values, y_test_b.values)

In [None]:
# Evaluate on the Caucasian rows of the test split only.
white_rows = df_test_two_year_recid["race_Caucasian"] == 1
y_test_w = df_test_two_year_recid.loc[white_rows, LABEL]

x_test_w = x_test[x_test["race_Caucasian"] == 1]
best_model_acc1.evaluate(x_test_w.values, y_test_w.values)

In [None]:
# Evaluate on the Hispanic rows of the test split only.
hispanic_rows = df_test_two_year_recid["race_Hispanic"] == 1
y_test_h = df_test_two_year_recid.loc[hispanic_rows, LABEL]

x_test_h = x_test[x_test["race_Hispanic"] == 1]
best_model_acc1.evaluate(x_test_h.values, y_test_h.values)

In [None]:
# Evaluate on the rows of the test split whose race is recorded as "Other".
other_rows = df_test_two_year_recid["race_Other"] == 1
y_test_o = df_test_two_year_recid.loc[other_rows, LABEL]

x_test_o = x_test[x_test["race_Other"] == 1]
best_model_acc1.evaluate(x_test_o.values, y_test_o.values)

The overall accuracy is about 0.7, which exceeds the 0.65 requirement. Accuracy varies between roughly 0.69 and 0.72 across the 4 race groups, which is acceptable.

In [None]:
# Store the evaluation results of the African-American and Caucasian test
# subsets so their metric values can be compared afterwards.
x_test_b = x_test[x_test["race_African-American"] == 1]
x_test_w = x_test[x_test["race_Caucasian"] == 1]

his_b = best_model_acc1.evaluate(x_test_b.values, y_test_b.values)
his_w = best_model_acc1.evaluate(x_test_w.values, y_test_w.values)

We then go on to inspect the mathematical fairness between the Black and white subgroups, using classification parity as the criterion and precision (PPV) as the metric.

In [None]:
def calculate_ppv(tp, tn, fp, fn):
    '''
        Return the positive predictive value (precision): tp / (tp + fp).
        tp, tn, fp and fn are the confusion-matrix counts; tn and fn are
        accepted so the whole matrix can be passed, but they are not used.
        When there are no positive predictions (tp + fp == 0) the precision is
        undefined and NaN is returned instead of raising ZeroDivisionError.
    '''
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return float("nan")
    precision = tp / predicted_positive
    return precision

# Positions 2..5 of each evaluation result are read as TP, TN, FP, FN.
# NOTE(review): assumes evaluate() returns the loss followed by the compiled
# metrics in their declared order - confirm against the model's metric list.
tp_b, tn_b, fp_b, fn_b = his_b[2:6]
tp_w, tn_w, fp_w, fn_w = his_w[2:6]

ppv_b = calculate_ppv(tp_b, tn_b, fp_b, fn_b)
ppv_w = calculate_ppv(tp_w, tn_w, fp_w, fn_w)

for heading, value in (
    ("PPV of African American:", ppv_b),
    ("PPV of Caucasian:", ppv_w),
    ("PPV Difference", abs(ppv_b - ppv_w)),
):
    print(heading)
    print(value)

We consistently get a precision between 0.78 and 0.80 for both subgroups, a difference of less than 0.02, which is well within the required 0.05.