# Predicting Recidivism with Machine Learning

## Importing Data

### Import Statements

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

### Loading Data

In [2]:
"Import data"
# Whole data set for data-exploration
whole_dataframe = pd.read_csv("../data/broward_data.csv")
# Test train split used in XYZ paper for training our models. 
train_dataframe = pd.read_csv("../data/broward_train.csv")
test_dataframe = pd.read_csv("../data/broward_test.csv")

# whole_dataframe.columns

In [3]:
"""
Removing person_id, screening_date because they are not helpful to our analyses
Removing current_violence20 because redundant with 2 other columns. 
"""
whole_dataframe = whole_dataframe.drop(['person_id', "screening_date", "current_violence20"], axis=1)
test_dataframe = test_dataframe.drop(['person_id', "screening_date", "current_violence20"], axis=1)
train_dataframe = train_dataframe.drop(['person_id', "screening_date", "current_violence20"], axis=1)


In [4]:
race_mapping = {"African-American" : 0, "Caucasian" : 1, "Hispanic" : 2, "Other" : 3, "Asian" : 4, "Native American" : 5, }
whole_dataframe['race'] = whole_dataframe['race'].map(race_mapping)
train_dataframe['race'] = train_dataframe['race'].map(race_mapping)
test_dataframe['race'] = test_dataframe['race'].map(race_mapping)

In [5]:
label_column_names = ['six_month', 'one_year', 'three_year', 'five_year', 'general_two_year',
       'general_six_month', 'drug_two_year', 'property_two_year',
       'misdemeanor_two_year', 'felony_two_year', 'violent_two_year',
       'drug_six_month', 'property_six_month', 'misdemeanor_six_month',
       'felony_six_month', 'violent_six_month']

whole_dataframe_X = whole_dataframe.drop(label_column_names, axis = 1)
whole_dataframe_label_choices = whole_dataframe[label_column_names]
whole_data_X = whole_dataframe_X.values
"To get numpy y-labels: append {.astype(int).values} to end of label series"

test_dataframe_X = test_dataframe.drop(label_column_names, axis = 1)
test_dataframe_label_choices = test_dataframe[label_column_names]
test_data_X = test_dataframe_X.values
test_data_y = test_dataframe_label_choices["general_six_month"].astype(int).values

train_dataframe_X = train_dataframe.drop(label_column_names, axis = 1)
train_dataframe_label_choices = train_dataframe[label_column_names]
train_data_X = train_dataframe_X.values
train_data_y = train_dataframe_label_choices["general_six_month"].astype(int).values

In [6]:
X_train = train_data_X
y_train = train_data_y
X_test = test_data_X
y_test = test_data_y

### Baseline Models

In [7]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
np.random.seed(671)

In [26]:
"Metrics"
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss


Metrics: AUC
Additionally, our evaluation metric is AUC, which is a rank statistic, and considers relative risk rather than absolute risk;

https://datascience.stackexchange.com/questions/30881/when-is-precision-more-important-over-recall#:~:text=Thus%2C%20precision%20will%20be%20more,having%20any%20action%20at%20all%22.

Metric: Precision
Picked this metric on the assumption that a false negative (predicting a person will not be charged when they will be) is better than false positives.

Metric: F1-Score:
A good metric because our dataset is imbalanced and combines the power of precision and recall. 

Metric: Log-Loss:
Useful for assessing the uncertainty of our predictions, which is useful, considering the decisions of our decisions affect real people. 


#### Logistic Regression

In [22]:
lreg_grid = {
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'fit_intercept': [True, False],
    'C':[10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**2],
    'tol': [.00001, .0001, .001, .01, .1, 1],
    'max_iter': [1000]
}

In [23]:
lreg_cv = GridSearchCV(estimator = LogisticRegression(), 
                                    param_grid = lreg_grid, 
                                    cv = 5
                                    )
lreg_cv.fit(X_train, y_train)
print(lreg_cv.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 0.0001, 'fit_intercept': True, 'max_iter': 1000, 'penalty': 'l2', 'tol': 1e-05}


In [24]:
lreg_clf = LogisticRegression(
    penalty = lreg_cv.best_params_.get('penalty'), 
    fit_intercept = lreg_cv.best_params_.get('fit_intercept'), 
    C = lreg_cv.best_params_.get('C'),
    tol = lreg_cv.best_params_.get('tol')
    )
lreg_clf.fit(X_train, y_train)

LogisticRegression(C=0.0001, tol=1e-05)

In [28]:
print("AUC of Logistic Regression for Best Model")
print(roc_auc_score(y_test, lreg_clf.decision_function(X_test)))

print("Precision of Logistic Regression for Best Model")
#print(precision_score(y_test, lreg_clf.decision_function(X_test)))

print("F1-Score of Logistic Regression for Best Model")
#print(f1_score(y_test, lreg_clf.decision_function(X_test)))

# Used only for model comparion
print("Log Loss of Logistic Regression for Best Model")
print(log_loss(y_test, lreg_clf.decision_function(X_test)))

AUC of Logistic Regression for Best Model
0.6104918032786886
Precision of Logistic Regression for Best Model
F1-Score of Logistic Regression for Best Model
Log Loss of Logistic Regression for Best Model
7.4454016965471475


#### Lasso

In [None]:
lasso_grid = {
    'alpha': [2, 4, 8, 16, 32, 64, 128],
    'fit_intercept': [True, False],
    'tol': [.00001, .0001, .001, .01, .1, 1],
}

In [None]:
lasso_cv = GridSearchCV(estimator = Lasso(), 
                                    param_grid = lasso_grid, 
                                    cv = 5
                                    )
lasso_cv.fit(X_train, y_train)
print(lasso_cv.best_params_)

In [None]:
lasso_clf = Lasso(
    alpha = lasso_cv.best_params_.get('alpha'), 
    fit_intercept = lasso_cv.best_params_.get('fit_intercept'), 
    tol = lasso_cv.best_params_.get('tol'), 
    )
lasso_clf.fit(X_train, y_train)

#### Random Forest

In [None]:
#Cross Fold Validation for Random Forest

rf_grid = {
    'n_estimators': [2, 4, 8, 16, 32, 64, 128],
    'max_features': [1, 2, 3, 4, 5, 6],
    'max_depth': list(range(1, 32+1)),
    'criterion': ['gini', 'entropy', 'log_loss'], 
    'random_state' : [0]
}



In [None]:
rf_cv = GridSearchCV(estimator = RandomForestClassifier(), 
                                    param_grid = rf_grid, 
                                    cv = 5
                                    )
rf_cv.fit(X_train, y_train)
print(rf_cv.best_params_)

In [None]:
rf_clf = RandomForestClassifier(
    n_estimators = rf_cv.best_params_.get('n_estimators'), 
    max_depth = rf_cv.best_params_.get('max_depth'), 
    max_features = rf_cv.best_params_.get('max_features'), 
    criterion = rf_cv.best_params_.get('criterion')
    )
rf_clf.fit(X_train, y_train)

#### Boosted Decision Trees (ADA Boost)

In [None]:
ada_grid = {
    'n_estimators': [2, 4, 8, 16, 32, 64, 128],
    'learning_rate': [0.1, 0.25, 0.5, 0.75, 0.9, 1.0, 1.25, 1.5, 2],
    'algorithm': ['SAMME', 'SAMME.R'],
    'random_state' : [0]
}

In [None]:
ada_cv = GridSearchCV(
    estimator = AdaBoostClassifier(),
    param_grid = ada_grid,
    cv = 5
)
ada_cv.fit(X_train, y_train)
print(ada_cv.best_params_)

In [None]:
ada_clf= AdaBoostClassifier(
    n_estimators = ada_cv.best_params_.get('n_estimators'), 
    learning_rate = ada_cv.best_params_.get('learning_rate'), 
    algorithm = ada_cv.best_params_.get('algorithm')
    )
ada_clf.fit(X_train, y_train)