In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import warnings

# Hide FutureWarning messages raised from sklearn modules so they do not
# clutter the cell output.
warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module='sklearn',
)

# Load the dataset that the rest of the notebook works from.
compas = pd.read_csv('data/compas-scores-two-years-violent.csv')


In [26]:
# Baseline: logistic regression on two features (age and priors_count)

# Label and predictors
target = compas['two_year_recid']
features = compas[['age', 'priors_count']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
train_features, test_features, train_target, test_target = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training portion (fit() returns the estimator)
model = LogisticRegression().fit(train_features, train_target)

# Predict labels for the held-out rows
predicted_classes = model.predict(test_features)

# Evaluation on the held-out rows
conf_matrix = confusion_matrix(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
accuracy = accuracy_score(test_target, predicted_classes)

print(report)

              precision    recall  f1-score   support

           0       0.83      0.99      0.90       785
           1       0.50      0.07      0.12       164

    accuracy                           0.83       949
   macro avg       0.67      0.53      0.51       949
weighted avg       0.78      0.83      0.77       949



In [27]:
# Confusion matrices: two-feature model vs. COMPAS

# Our model.
# NOTE(review): `features` holds every row of the dataset, so these counts
# include the rows the model was fitted on, not only the held-out split.
predicted_reoffend = model.predict(features)
conf_matrix_model = confusion_matrix(
    compas['two_year_recid'],
    predicted_reoffend,
)

print("Confusion Matrix for Your Model:")
print(conf_matrix_model)

# COMPAS: a v_decile_score above 5 is counted as a positive prediction.
compas_predictions = compas['v_decile_score'].gt(5).astype(int)
conf_matrix_compas = confusion_matrix(
    compas['two_year_recid'],
    compas_predictions,
)

print("Confusion Matrix for COMPAS:")
print(conf_matrix_compas)


Confusion Matrix for Your Model:
[[3925   43]
 [ 719   56]]
Confusion Matrix for COMPAS:
[[3341  627]
 [ 433  342]]


In [28]:
# Logistic Regression Model using features accumulated from recursive feature selection

# Work from a clean 0..n-1 index without mutating the frame in place.
compas = compas.reset_index(drop=True)

# One-hot encode the categorical variables, dropping the first level of each.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while its replacement `sparse_output=` does not exist before
# 1.2, so this form is the one that runs on every version.
categorical_columns = ['sex', 'age_cat']
encoder = OneHotEncoder(drop='first')
categorical_features = encoder.fit_transform(compas[categorical_columns]).toarray()

# Give the encoded frame the same index as `compas` so the column-wise concat
# below lines rows up one-to-one.
encoded_columns = list(encoder.get_feature_names_out(categorical_columns))
encoded_df = pd.DataFrame(categorical_features, columns=encoded_columns, index=compas.index)

# Concatenate the encoded features onto the dataframe
compas_encoded = pd.concat([compas, encoded_df], axis=1)

# New features include age category, sex, juvenile misdemeanor count, other juvenile misconduct count, prior convictions count
feature_names = encoded_columns + ['juv_misd_count', 'juv_other_count', 'priors_count.1']
features = compas_encoded[feature_names]
target = compas_encoded['two_year_recid']

# Same 80/20 split and seed as the two-feature model
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

model = LogisticRegression()
model.fit(train_features, train_target)

predicted_classes = model.predict(test_features)

accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)

print(report)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       785
           1       0.58      0.09      0.15       164

    accuracy                           0.83       949
   macro avg       0.71      0.54      0.53       949
weighted avg       0.79      0.83      0.78       949



In [29]:
# Confusion matrices: 5-feature model vs. COMPAS

# NOTE(review): `features` holds every row of the dataset, so these counts
# include the rows the model was fitted on, not only the held-out split.
predicted_reoffend = model.predict(features)
conf_matrix_model = confusion_matrix(
    compas['two_year_recid'],
    predicted_reoffend,
)

print("Confusion Matrix for Your Model:")
print(conf_matrix_model)

# COMPAS: a v_decile_score above 5 is counted as a positive prediction.
compas_predictions = compas['v_decile_score'].gt(5).astype(int)
conf_matrix_compas = confusion_matrix(
    compas['two_year_recid'],
    compas_predictions,
)

print("Confusion Matrix for COMPAS:")
print(conf_matrix_compas)



Confusion Matrix for Your Model:
[[3925   43]
 [ 717   58]]
Confusion Matrix for COMPAS:
[[3341  627]
 [ 433  342]]


In [30]:
# Now using resampled data
compas_1 = pd.read_csv('combined_resample.csv')

# Encode gender in compas dataframe so that both compas_1 and compas have same categories for sex.
# Values that are not 'Male'/'Female' are passed through unchanged, so running
# this cell a second time leaves the already-encoded 1/0 column intact. A plain
# Series.map(dict) would turn every value missing from the dict into NaN on
# re-run.
sex_codes = {'Male': 1, 'Female': 0}
compas['sex'] = compas['sex'].map(lambda value: sex_codes.get(value, value))

In [31]:
# Two-feature logistic regression trained on the resampled data

two_feature_columns = ['age', 'priors_count']

# Training set: compas_1 (resampled)
train_target = compas_1['two_year_recid']
train_features = compas_1[two_feature_columns]

model = LogisticRegression().fit(train_features, train_target)

# Test set: compas (original data)
test_target = compas['two_year_recid']
test_features = compas[two_feature_columns]

predicted_classes = model.predict(test_features)

# Evaluation against the original data
conf_matrix = confusion_matrix(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
accuracy = accuracy_score(test_target, predicted_classes)

print(report)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      3968
           1       0.55      0.07      0.12       775

    accuracy                           0.84      4743
   macro avg       0.70      0.53      0.51      4743
weighted avg       0.80      0.84      0.78      4743



In [32]:
# Confusion matrices: resampled two-feature model vs. COMPAS

# Our model, scored on the test features built from the original data
predicted_reoffend = model.predict(test_features)
conf_matrix_model = confusion_matrix(
    compas['two_year_recid'],
    predicted_reoffend,
)

print("Confusion Matrix for Your Model:")
print(conf_matrix_model)

# COMPAS: a v_decile_score above 5 is counted as a positive prediction.
compas_predictions = compas['v_decile_score'].gt(5).astype(int)
conf_matrix_compas = confusion_matrix(
    compas['two_year_recid'],
    compas_predictions,
)

print("Confusion Matrix for COMPAS:")
print(conf_matrix_compas)


Confusion Matrix for Your Model:
[[3926   42]
 [ 724   51]]
Confusion Matrix for COMPAS:
[[3341  627]
 [ 433  342]]


In [33]:
# Function to preprocess a dataframe (original method of encoding didn't work)
def preprocess_df(df, encoder=None):
    """Build the five-feature matrix and the target from a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sex', 'age_cat', 'juv_misd_count',
        'juv_other_count', 'priors_count.1' and 'two_year_recid'.
        The frame is not modified.
    encoder : OneHotEncoder, optional
        An already-fitted encoder to reuse. When omitted, a new encoder
        (``drop='first'``) is fitted on ``df``. Pass the encoder returned for
        the training data when preparing test data so both sets get the same
        columns in the same order.

    Returns
    -------
    tuple
        ``(features, target, encoder)``: the feature frame (one-hot columns
        followed by the three count columns), the 'two_year_recid' series,
        and the encoder that produced the one-hot columns. Both pandas
        objects carry the same index as ``df``.

    Raises
    ------
    ValueError
        Raised by the encoder when a reused encoder meets a category it was
        not fitted on (scikit-learn's default ``handle_unknown='error'``).
    """
    categorical_columns = ['sex', 'age_cat']
    count_columns = ['juv_misd_count', 'juv_other_count', 'priors_count.1']

    if encoder is None:
        # Default (sparse) output is used because the `sparse=` keyword was
        # removed in scikit-learn 1.4; the result is densified below.
        encoder = OneHotEncoder(drop='first')
        encoded = encoder.fit_transform(df[categorical_columns])
    else:
        encoded = encoder.transform(df[categorical_columns])

    # A reused encoder may already be configured for dense output.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    encoded_columns = list(encoder.get_feature_names_out(categorical_columns))
    # Reuse df's index so the column-wise concat lines rows up one-to-one
    # without resetting the caller's index in place.
    encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=df.index)
    df_encoded = pd.concat([df, encoded_df], axis=1)

    features = df_encoded[encoded_columns + count_columns]
    return features, df_encoded['two_year_recid'], encoder

# Preprocess the training data (compas_1); this fits the encoder
train_features, train_target, encoder = preprocess_df(compas_1)

model = LogisticRegression()
model.fit(train_features, train_target)

# Preprocess the testing data (compas) using the same encoder that was fitted
# on the training data, so the test columns match what the model was trained on
test_features, test_target, _ = preprocess_df(compas, encoder=encoder)

predicted_classes = model.predict(test_features)

accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
conf_matrix = confusion_matrix(test_target, predicted_classes)

print(report)


              precision    recall  f1-score   support

           0       0.85      0.99      0.91      3968
           1       0.57      0.08      0.14       775

    accuracy                           0.84      4743
   macro avg       0.71      0.53      0.53      4743
weighted avg       0.80      0.84      0.79      4743



In [34]:
# Confusion matrices: 5-feature model trained on resampled data vs. COMPAS

# Our model, scored on the preprocessed original data
test_predicted_reoffend = model.predict(test_features)
conf_matrix_model = confusion_matrix(
    test_target,
    test_predicted_reoffend,
)

print("Confusion Matrix for Your Model:")
print(conf_matrix_model)

# COMPAS: look up v_decile_score for the same rows as test_features; a score
# above 5 is counted as a positive prediction.
compas_test_predictions = (
    compas.loc[test_features.index, 'v_decile_score']
    .gt(5)
    .astype(int)
)
conf_matrix_compas = confusion_matrix(
    test_target,
    compas_test_predictions,
)

print("Confusion Matrix for COMPAS:")
print(conf_matrix_compas)

Confusion Matrix for Your Model:
[[3921   47]
 [ 712   63]]
Confusion Matrix for COMPAS:
[[3341  627]
 [ 433  342]]
