In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
compas = pd.read_csv('data/cox-violent-parsed.csv')
compas.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event
0,1.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,0,327,0
1,2.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,334,961,0
2,3.0,michael ryan,michael,ryan,31/12/2014,Male,06/02/1985,31,25 - 45,Caucasian,...,Risk of Violence,2,Low,31/12/2014,30/12/2014,03/01/2015,0,3,457,0
3,4.0,kevon dixon,kevon,dixon,27/01/2013,Male,22/01/1982,34,25 - 45,African-American,...,Risk of Violence,1,Low,27/01/2013,26/01/2013,05/02/2013,0,9,159,1
4,5.0,ed philo,ed,philo,14/04/2013,Male,14/05/1991,24,Less than 25,African-American,...,Risk of Violence,3,Low,14/04/2013,16/06/2013,16/06/2013,4,0,63,0


In [3]:
compas['c_jail_in'] = pd.to_datetime(compas['c_jail_in'], format='mixed')
compas['c_jail_out'] = pd.to_datetime(compas['c_jail_out'], format='mixed')

compas['v_screening_date'] = pd.to_datetime(compas['v_screening_date'], format='mixed')
compas['vr_offense_date'] = pd.to_datetime(compas['vr_offense_date'], format='mixed')

compas['c_offense_date'] = pd.to_datetime(compas['c_offense_date'], format='mixed')
compas['c_arrest_date'] = pd.to_datetime(compas['c_arrest_date'], format='mixed')

compas['compas_screening_date'] = pd.to_datetime(compas['compas_screening_date'], format='mixed')

ValueError: time data '13/08/2013 6:03' does not match format 'mixed' (match)

In [None]:
compas = compas.drop(['first','last','c_case_number','c_charge_degree', 'c_charge_desc', 'r_case_number','r_charge_degree','vr_case_number','vr_charge_degree','start','end','event'],axis=1)

#remove row with nan scores 
compas = compas.loc[compas['score_text'].notnull()]
#clean out the "-1" decile score from dataset 
compas = compas.loc[compas['v_decile_score'] != -1]

#label encodimg
compas['sex'] = compas['sex'].replace({'Male': 1, 'Female': 0})
compas['score_text'] = compas['score_text'].replace({'Low':0, 'Medium':1, 'High': 2})

#days in jail
compas['jail_out- jail_in'] = (compas['c_jail_out'] - compas['c_jail_in']).dt.days

In [None]:
compas.info()

In [None]:
compas.describe(include=["object","category"]).T

In [None]:
compas.describe().T

In [None]:
compas_race = compas['race'].value_counts()
compas_race

In [None]:
compas['race'].value_counts().plot(kind='bar')
plt.xticks(rotation=45)

In [None]:
compas['age_cat'].value_counts().plot(kind='bar')
plt.xticks(rotation=45)

In [None]:
compas.info()

In [None]:
corr_matrix = compas[['sex', 'age','juv_fel_count', 'decile_score', 
                    'juv_misd_count','juv_other_count','priors_count', 
                    'days_b_screening_arrest', 'c_days_from_compas', 'r_days_from_arrest', 'v_decile_score', 'jail_out- jail_in','score_text' ]].corr()
sns.heatmap(corr_matrix, annot=False)
plt.show()

In [None]:
corr_matrix.style.background_gradient(cmap='coolwarm')

In [None]:
compas.head()

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


compas['reoffend'] = compas['v_decile_score'].apply(lambda x: 1 if x > 5 else 0)

# Selecting features and target
features = compas[['age', 'priors_count']]
target = compas['reoffend']

# Splitting the data into training and testing sets
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size=0.2, random_state=42)

# Initializing and training the logistic regression model
model = LogisticRegression()
model.fit(train_features, train_target)

# Making predictions
predicted_classes = model.predict(test_features)

# Evaluating the model
accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
conf_matrix = confusion_matrix(test_target, predicted_classes)

report

'              precision    recall  f1-score   support\n\n           0       0.85      0.90      0.87      2631\n           1       0.70      0.60      0.64      1033\n\n    accuracy                           0.81      3664\n   macro avg       0.77      0.75      0.76      3664\nweighted avg       0.81      0.81      0.81      3664\n'

In [8]:
# Reformatting the classification report for better readability
from tabulate import tabulate

# Converting the classification report into a dictionary
report_dict = classification_report(test_target, predicted_classes, output_dict=True)

# Preparing data for tabulation
report_data = []
for key, value in report_dict.items():
    if key == 'accuracy':
        report_data.append(['accuracy', '', '', value, report_dict['macro avg']['support']])
    elif key in ['macro avg', 'weighted avg']:
        report_data.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])
    else:
        report_data.append([f'Class {key}', value['precision'], value['recall'], value['f1-score'], value['support']])

# Creating a table with headers
headers = ["Metric", "Precision", "Recall", "F1-Score", "Support"]
table = tabulate(report_data, headers, tablefmt="pretty")

# Printing the formatted table
print(table)



+--------------+--------------------+--------------------+--------------------+---------+
|    Metric    |     Precision      |       Recall       |      F1-Score      | Support |
+--------------+--------------------+--------------------+--------------------+---------+
|   Class 0    | 0.8513172140021653 | 0.8966172557962752 | 0.873380229544613  | 2631.0  |
|   Class 1    | 0.6954087346024636 | 0.601161665053243  | 0.6448598130841121 | 1033.0  |
|   accuracy   |                    |                    | 0.8133187772925764 | 3664.0  |
|  macro avg   | 0.7733629743023145 | 0.7488894604247591 | 0.7591200213143625 | 3664.0  |
| weighted avg | 0.8073615755687887 | 0.8133187772925764 | 0.8089529396418571 | 3664.0  |
+--------------+--------------------+--------------------+--------------------+---------+


In [9]:
# Example data
new_data = pd.DataFrame({
    'age': [25, 40],
    'priors_count': [10, 0]
})

# Making predictions
predictions = model.predict(new_data)

# Interpreting predictions
prediction_labels = ['Likely to reoffend' if pred == 1 else 'Less likely to reoffend' for pred in predictions]

# Display results
for i, label in enumerate(prediction_labels):
    print(f"Individual {i+1}: {label}")


Individual 1: Likely to reoffend
Individual 2: Less likely to reoffend


In [11]:
# Assuming features and target have been prepared

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the neural network architecture
model = Sequential()
model.add(Dense(10, input_dim=2, activation='relu'))  # Input layer and first hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_features, train_target, epochs=100, batch_size=10, validation_split=0.1)

# Evaluate the model
accuracy = model.evaluate(test_features, test_target)

# Make predictions
predictions = model.predict(new_data)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [13]:
# Make predictions
predictions = model.predict(new_data)

predictions




array([[0.5547271 ],
       [0.02580232]], dtype=float32)