# Predicting Recidivism with Machine Learning

## Importing Data

### Import Statements

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

### Loading Data

In [38]:
# Load the three Broward CSV files.
#   whole_dataframe : the complete data set, used for data exploration
#   train_dataframe / test_dataframe : the train/test split taken from the
#       reference paper (cited as "XYZ" in the original note), used for training our models
whole_dataframe, train_dataframe, test_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/broward_data.csv",
        "../../data/broward_train.csv",
        "../../data/broward_test.csv",
    )
)

In [39]:

# Columns removed from every frame:
#   person_id, screening_date -> not helpful to our analyses
#   current_violence20        -> redundant with 2 other columns
unused_columns = ['person_id', "screening_date", "current_violence20"]

whole_dataframe = whole_dataframe.drop(columns=unused_columns)
test_dataframe = test_dataframe.drop(columns=unused_columns)
train_dataframe = train_dataframe.drop(columns=unused_columns)


In [40]:
# Integer code assigned to each race label.
race_mapping = {
    "African-American": 0,
    "Caucasian": 1,
    "Hispanic": 2,
    "Other": 3,
    "Asian": 4,
    "Native American": 5,
}

# Replace the race labels with their integer codes in all three frames.
# Note: Series.map yields NaN for any value that is not a key of race_mapping.
for frame in (whole_dataframe, train_dataframe, test_dataframe):
    frame['race'] = frame['race'].map(race_mapping)

In [41]:
# All outcome (label) columns present in the data; everything else is treated as a feature.
label_column_names = [
    'six_month', 'one_year', 'three_year', 'five_year',
    'general_two_year', 'general_six_month',
    'drug_two_year', 'property_two_year', 'misdemeanor_two_year',
    'felony_two_year', 'violent_two_year',
    'drug_six_month', 'property_six_month', 'misdemeanor_six_month',
    'felony_six_month', 'violent_six_month',
]

# --- Whole data set: features plus all candidate labels (no target picked here) ---
whole_dataframe_label_choices = whole_dataframe[label_column_names]
whole_dataframe_X = whole_dataframe.drop(columns=label_column_names)
whole_data_X = whole_dataframe_X.values
# To get numpy y-labels: append {.astype(int).values} to end of label series

# --- Test split: features and the general_six_month target as an int array ---
test_dataframe_label_choices = test_dataframe[label_column_names]
test_dataframe_X = test_dataframe.drop(columns=label_column_names)
test_data_X = test_dataframe_X.values
test_data_y = test_dataframe_label_choices["general_six_month"].astype(int).values

# --- Train split: same layout as the test split ---
train_dataframe_label_choices = train_dataframe[label_column_names]
train_dataframe_X = train_dataframe.drop(columns=label_column_names)
train_data_X = train_dataframe_X.values
train_data_y = train_dataframe_label_choices["general_six_month"].astype(int).values

In [42]:
# Short aliases for the feature matrices and target vectors used by the models below.
X_train, y_train = train_data_X, train_data_y
X_test, y_test = test_data_X, test_data_y

In [43]:
# Seed passed as `random_state` to the estimators in this notebook.
state_num = 816

### Baseline Models

In [44]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
# Seed NumPy's global RNG from the shared seed instead of repeating the literal,
# so the notebook has a single place where the seed is set.
np.random.seed(state_num)

In [47]:
# Metrics Import
from sklearn.metrics import roc_auc_score

Metric: AUC
Our evaluation metric is AUC, which is a rank statistic and considers relative risk rather than absolute risk.

https://datascience.stackexchange.com/questions/30881/when-is-precision-more-important-over-recall#:~:text=Thus%2C%20precision%20will%20be%20more,having%20any%20action%20at%20all%22.

Metric: Precision
We picked this metric on the assumption that a false negative (predicting that a person will not be charged when they will be) is preferable to a false positive.

Metric: F1-Score:
A good metric because our dataset is imbalanced and the F1-score combines the power of precision and recall.

Metric: Log-Loss:
Useful for assessing the uncertainty of our predictions, which matters because the consequences of our decisions affect real people.


### Model Creation and Evaluation

#### Logistic Regression

In [48]:
# Hyperparameter grid for logistic regression.
# Only C (inverse regularization strength) varies: 1e-5 through 1e-2;
# every other setting is pinned to a single value.
lreg_grid = dict(
    penalty=['l2'],
    solver=['liblinear'],
    C=[10**exponent for exponent in range(-5, -1)],
    class_weight=['balanced'],
    max_iter=[10000],
)

In [49]:
%%capture
lreg_cv = GridSearchCV(estimator = LogisticRegression(random_state=state_num), 
                                    param_grid = lreg_grid, 
                                    cv = 5
                                    )
lreg_cv.fit(X_train, y_train)


In [50]:
# Show the hyperparameter combination selected by the logistic-regression grid search.
print(lreg_cv.best_params_)

{'C': 1e-05, 'class_weight': 'balanced', 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}


In [51]:
# Rebuild logistic regression with the C chosen by the grid search
# (all other settings match the single values in lreg_grid) and fit on the training split.
lreg_best_C = lreg_cv.best_params_.get('C')
lreg_clf = LogisticRegression(
    C=lreg_best_C,
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    max_iter=10000,
    random_state=state_num,
)
lreg_clf.fit(X_train, y_train)

LogisticRegression(C=1e-05, class_weight='balanced', max_iter=10000,
                   random_state=816, solver='liblinear')

In [52]:
# Report test-set AUC for the tuned logistic regression,
# ranking samples by their decision-function scores.
print("AUC of Logistic Regression for Best Model")
lreg_test_scores = lreg_clf.decision_function(X_test)
print(roc_auc_score(y_test, lreg_test_scores))


AUC of Logistic Regression for Best Model
0.6200964320154292


#### Random Forest

In [53]:
# Hyperparameter grid for the random forest cross-validation:
# forest size, tree depth (shallow trees only), and the minimum
# impurity decrease required for a split (0.001 through 0.010).
rf_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[1, 2, 3],
    min_impurity_decrease=[
        0.001, 0.002, 0.003, 0.004, 0.005,
        0.006, 0.007, 0.008, 0.009, 0.010,
    ],
)



In [54]:
# 5-fold grid search over rf_grid for the random forest.
# scoring='roc_auc' makes model selection use the same metric this notebook
# reports on the test set; without it GridSearchCV uses the estimator's
# default score (accuracy for classifiers).
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=state_num),
    param_grid=rf_grid,
    scoring='roc_auc',
    cv=5,
)
rf_cv.fit(X_train, y_train)
print(rf_cv.best_params_)

{'max_depth': 1, 'min_impurity_decrease': 0.001, 'n_estimators': 50}


In [55]:
# Rebuild the random forest from the grid-search winner and fit on the training split.
rf_best = rf_cv.best_params_
rf_clf = RandomForestClassifier(
    random_state=state_num,
    n_estimators=rf_best.get('n_estimators'),
    max_depth=rf_best.get('max_depth'),
    min_impurity_decrease=rf_best.get('min_impurity_decrease'),
)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=1, min_impurity_decrease=0.001,
                       n_estimators=50, random_state=816)

In [56]:
# Report test-set AUC for the tuned random forest, scored on the predicted
# probability in column 1 of predict_proba (the second class in rf_clf.classes_).
# The label previously said "Logistic Regression" (copy-paste error).
print("AUC of Random Forest for Best Model")
print(roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:,1]))


AUC of Logistic Regression for Best Model
0.6128833172613308


#### Linear SVM

In [57]:
# Hyperparameter grid for the linear SVM cross-validation:
# only the regularization parameter C varies, from 1e-5 through 1e-2.
svm_grid = dict(
    C=[10**exponent for exponent in range(-5, -1)],
)

In [60]:
%%capture
svm_cv = GridSearchCV(estimator = LinearSVC(random_state=state_num), 
                                    param_grid = svm_grid, 
                                    cv = 5
                                    )
svm_cv.fit(X_train, y_train)

In [61]:
# Rebuild the linear SVM with the C chosen by the grid search and fit on the training split.
svm_best_C = svm_cv.best_params_.get('C')
svm_clf = LinearSVC(random_state=state_num, C=svm_best_C)
svm_clf.fit(X_train, y_train)

LinearSVC(C=1e-05, random_state=816)

In [62]:
# Report test-set AUC for the tuned linear SVM, ranking samples by their
# decision-function scores.
# The label previously said "Logistic Regression" (copy-paste error).
print("AUC of Linear SVM for Best Model")
print(roc_auc_score(y_test, svm_clf.decision_function(X_test)))


AUC of Logistic Regression for Best Model
0.6006943105110896


#### Boosted Decision Trees (ADA Boost)

In [63]:
# Hyperparameter grid for AdaBoost: the learning rate is fixed at 0.05,
# only the number of boosting rounds varies.
ada_grid = dict(
    learning_rate=[0.05],
    n_estimators=[50, 100, 200],
)

In [64]:
# 5-fold grid search over ada_grid for AdaBoost.
# scoring='roc_auc' makes model selection use the same metric this notebook
# reports on the test set; without it GridSearchCV uses the estimator's
# default score (accuracy for classifiers).
ada_cv = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=state_num),
    param_grid=ada_grid,
    scoring='roc_auc',
    cv=5,
)
ada_cv.fit(X_train, y_train)
print(ada_cv.best_params_)

{'learning_rate': 0.05, 'n_estimators': 50}


In [65]:
# Rebuild AdaBoost from the grid-search winner and fit on the training split.
ada_best = ada_cv.best_params_
ada_clf = AdaBoostClassifier(
    random_state=state_num,
    learning_rate=ada_best.get('learning_rate'),
    n_estimators=ada_best.get('n_estimators'),
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(learning_rate=0.05, random_state=816)

In [66]:
# Report test-set AUC for the tuned AdaBoost model, scored on the predicted
# probability in column 1 of predict_proba (the second class in ada_clf.classes_).
# The label previously said "Logistic Regression" (copy-paste error).
print("AUC of AdaBoost for Best Model")
print(roc_auc_score(y_test, ada_clf.predict_proba(X_test)[:,1]))

AUC of Logistic Regression for Best Model
0.6275988428158148
