# Predicting Recidivism with Machine Learning

## Importing Data

### Import Statements

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

import time
import pathlib

### Loading Data

In [32]:
# Read the three Broward CSV files.
#   whole_dataframe  -> complete data set, kept for data exploration
#   train_dataframe  -> training part of the pre-made train/test split
#   test_dataframe   -> test part of the same split
# The split is the one used in the "XYZ paper" (TODO: replace the placeholder
# with the actual citation).
whole_dataframe, train_dataframe, test_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/broward_data.csv",
        "data/broward_train.csv",
        "data/broward_test.csv",
    )
)

In [33]:

# Columns removed from every frame:
#   person_id, screening_date -> not helpful to our analyses
#   current_violence20        -> redundant with 2 other columns
# Each frame name is rebound to the reduced copy returned by `drop`, so this
# cell expects the freshly loaded frames (the columns must still be present).
columns_to_drop = ['person_id', "screening_date", "current_violence20"]

whole_dataframe = whole_dataframe.drop(columns=columns_to_drop)
train_dataframe = train_dataframe.drop(columns=columns_to_drop)
test_dataframe = test_dataframe.drop(columns=columns_to_drop)


In [34]:
# Integer codes used to encode the categorical `race` column.
race_mapping = {
    "African-American": 0,
    "Caucasian": 1,
    "Hispanic": 2,
    "Other": 3,
    "Asian": 4,
    "Native American": 5,
}

# Encode `race` in all three frames.
# `Series.map` turns every value that is not a key of the dict into NaN, so
# running this cell a second time on already-encoded (integer) data would
# silently wipe the column. The numeric-dtype guard makes the cell idempotent:
# a frame whose `race` column is already numeric is left untouched.
for race_frame in (whole_dataframe, train_dataframe, test_dataframe):
    if not is_numeric_dtype(race_frame['race']):
        race_frame['race'] = race_frame['race'].map(race_mapping)

In [35]:
# Every column that is a candidate prediction target; all remaining columns
# are treated as features (X).
label_column_names = [
    'six_month', 'one_year', 'three_year', 'five_year',
    'general_two_year', 'general_six_month',
    'drug_two_year', 'property_two_year', 'misdemeanor_two_year',
    'felony_two_year', 'violent_two_year',
    'drug_six_month', 'property_six_month', 'misdemeanor_six_month',
    'felony_six_month', 'violent_six_month',
]

# Target actually modelled below. Any other entry of `label_column_names` can
# be turned into a numpy label vector the same way: `.astype(int).values`.
target_label = "general_six_month"

# --- WHOLE data set: feature frame, label frame, feature matrix ---
whole_dataframe_label_choices = whole_dataframe[label_column_names]
whole_dataframe_X = whole_dataframe.drop(columns=label_column_names)
whole_data_X = whole_dataframe_X.values

# --- TRAIN split: feature frame / matrix and integer label vector ---
train_dataframe_label_choices = train_dataframe[label_column_names]
train_dataframe_X = train_dataframe.drop(columns=label_column_names)
train_data_X = train_dataframe_X.values
train_data_y = train_dataframe_label_choices[target_label].astype(int).values

# --- TEST split: feature frame / matrix and integer label vector ---
test_dataframe_label_choices = test_dataframe[label_column_names]
test_dataframe_X = test_dataframe.drop(columns=label_column_names)
test_data_X = test_dataframe_X.values
test_data_y = test_dataframe_label_choices[target_label].astype(int).values

In [37]:
# Seed value for the notebook; NumPy's global random generator is seeded
# with it right away.
state_num = 816
np.random.seed(state_num)

# Models Import

import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from gosdt_model.threshold_guess import compute_thresholds
from gosdt import GOSDT

# Metrics Import
from sklearn.metrics import roc_auc_score


In [38]:
# GBDT parameters for threshold and lower bound guesses
n_est = 40
max_depth = 1

# Label frame for the training split. It must exist BEFORE the call to
# compute_thresholds below: the original cell defined it one line too late,
# which raises NameError on a fresh kernel (Restart & Run All).
y_train = pd.DataFrame(train_data_y)

# guess thresholds
# NOTE(review): compute_thresholds is a project helper; presumably X_train is
# the thresholded version of the training features and `header` its column
# names -- confirm against gosdt_model.threshold_guess.
X_train, thresholds, header, threshold_guess_time = compute_thresholds(train_dataframe_X, y_train, n_est, max_depth)

# guess lower bound: fit a boosted model with the same GBDT parameters and use
# its training-set predictions as the warm-start labels; the fit + predict
# wall time is kept in lb_time.
start_time = time.perf_counter()
# random_state ties this estimator to the notebook seed so the warm labels do
# not depend on how much of the global NumPy random stream was already used.
clf = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, random_state=state_num)
clf.fit(X_train, y_train.values.flatten())
warm_labels = clf.predict(X_train)
elapsed_time = time.perf_counter() - start_time
lb_time = elapsed_time

# save the labels from lower bound guesses as a tmp file; `labelpath` (a str)
# is the path handed to the GOSDT configuration.
labelsdir = pathlib.Path('/tmp/warm_lb_labels')
labelsdir.mkdir(exist_ok=True, parents=True)
labelpath = labelsdir / 'warm_label.tmp'
labelpath = str(labelpath)
pd.DataFrame(warm_labels, columns=["class_labels"]).to_csv(labelpath, header="class_labels",index=None)


In [39]:
# train GOSDT model
# The configuration enables the warm lower bound ("warm_LB") and points
# "path_to_labels" at the label file written by the previous cell.
# NOTE(review): the meaning/units of the remaining keys (regularization,
# depth_budget, time_limit, similar_support) come from the GOSDT package --
# confirm against its documentation.
config = {
            "regularization": 0.001,
            "depth_budget": 5,
            "warm_LB": True,
            "path_to_labels": labelpath,
            "time_limit": 60,
            "similar_support": False
        }

model = GOSDT(config)

model.fit(X_train, y_train)

print("evaluate the model, extracting tree and scores", flush=True)

# get the results
train_acc = model.score(X_train, y_train)
n_leaves = model.leaves()
n_nodes = model.nodes()
# Kept under its own name: the original `time = model.utime` rebound the
# imported `time` module, so re-running the lower-bound cell afterwards failed
# on `time.perf_counter()`.
training_time = model.utime

print("Model training time: {}".format(training_time))
print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(model.tree)

gosdt reported successful execution
training completed. 0.000/0.000/0.003 (user, system, wall), mem=0 MB
bounds: [0.217473..0.217473] (0.000000) loss=0.215473, iterations=0
evaluate the model, extracting tree and scores
Model training time: 0.0
Training accuracy: 0.7845268542199488
# of leaves: 2
if p_drug<=26.0 = 1 then:
    predicted class: 0
    misclassification penalty: 0.215
    complexity penalty: 0.001

else if p_drug<=26.0 != 1 then:
    predicted class: 1
    misclassification penalty: 0.0
    complexity penalty: 0.001


In [56]:
# Report the ROC AUC computed from the second column of the classifier's
# predict_proba output (assumed to be the positive class -- confirm) against
# the test labels.
# NOTE(review): y_test, X_test and rf_clf are not defined in the cells visible
# here, and the printed label says "Logistic Regression" while the estimator
# is called rf_clf -- confirm which model this score belongs to.
print("AUC of Logistic Regression for Best Model")
positive_class_scores = rf_clf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, positive_class_scores))


AUC of Logistic Regression for Best Model
0.6128833172613308
