In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset used by the rest of the notebook.
compas = pd.read_csv('data/compas-scores-two-years-violent.csv')

# Binary label: 1 when v_decile_score is above 5, otherwise 0.
# A vectorised comparison replaces the row-wise apply/lambda; the resulting
# column holds the same 0/1 int64 values.
# NOTE(review): the label is derived from the score column itself rather than
# from an observed-outcome column -- confirm this is the intended target.
compas['reoffend'] = compas['v_decile_score'].gt(5).astype('int64')

# select features and target
features = compas[['age', 'priors_count']]
target = compas['reoffend']

# Splitting data: hold out 20% of the rows for testing; the fixed random_state
# keeps the split reproducible between runs.
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)
model = LogisticRegression()
model.fit(train_features, train_target)
predicted_classes = model.predict(test_features)

# Eval model
accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
conf_matrix = confusion_matrix(test_target, predicted_classes)

# `report` is a multi-line string; print it so the line breaks render instead
# of being shown as escaped "\n" sequences in the string repr.
print(report)

'              precision    recall  f1-score   support\n\n           0       0.87      0.95      0.91       759\n           1       0.70      0.45      0.55       190\n\n    accuracy                           0.85       949\n   macro avg       0.79      0.70      0.73       949\nweighted avg       0.84      0.85      0.84       949\n'

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tabulate import tabulate


# Rebuild the binary label (1 when v_decile_score > 5, otherwise 0).
compas['reoffend'] = compas['v_decile_score'].gt(5).astype('int64')

# --- "Old" model: logistic regression on age and priors_count ---------------
# LogisticRegression and accuracy_score are not imported in this cell; they
# must already be in scope from an earlier cell.
features_old = compas[['age', 'priors_count']]
target_old = compas['reoffend']
(
    train_features_old,
    test_features_old,
    train_target_old,
    test_target_old,
) = train_test_split(features_old, target_old, test_size=0.2, random_state=42)

model_old = LogisticRegression()
model_old.fit(train_features_old, train_target_old)
predicted_classes_old = model_old.predict(test_features_old)

accuracy_old = accuracy_score(test_target_old, predicted_classes_old)
print("Accuracy of the old model:", accuracy_old)

# --- "New" model: linear regression on age, priors_count and one-hot race ---
# NOTE(review): the new model differs from the old one in both its feature set
# and its estimator, so the accuracy gap below cannot be attributed to either
# change on its own.
target_new = compas['reoffend']
features_new = pd.get_dummies(
    compas[['age', 'priors_count', 'race']],
    columns=['race'],
    drop_first=True,  # drop one dummy column per encoded feature
)
(
    train_features_new,
    test_features_new,
    train_target_new,
    test_target_new,
) = train_test_split(features_new, target_new, test_size=0.2, random_state=42)

model_new = LinearRegression()
model_new.fit(train_features_new, train_target_new)
predicted_values_new = model_new.predict(test_features_new)
# Threshold the continuous regression output at 0.5 to obtain 0/1 labels.
predicted_classes_new = [int(val > 0.5) for val in predicted_values_new]

accuracy_new = accuracy_score(test_target_new, predicted_classes_new)
print("Accuracy of the new model:", accuracy_new)

# Report which of the two accuracies is higher (or that they tie).
print(
    "The new model performs better." if accuracy_new > accuracy_old
    else "The old model performs better." if accuracy_new < accuracy_old
    else "Both models have the same accuracy."
)


Accuracy of the old model: 0.8514225500526871
Accuracy of the new model: 0.8061116965226555
The old model performs better.


In [8]:
# Render the classification report as a bordered text table.
from tabulate import tabulate

# Request the metrics as a nested dict rather than a pre-formatted string.
report_dict = classification_report(test_target, predicted_classes, output_dict=True)

# Build one table row per entry of the report dict.
report_data = []
for label, metrics in report_dict.items():
    if label == 'accuracy':
        # The accuracy entry is used as a single value: it goes in the F1-Score
        # column, precision/recall stay blank, and the support figure is
        # borrowed from the 'macro avg' entry.
        row = ['accuracy', '', '', metrics, report_dict['macro avg']['support']]
    else:
        # Averaged rows keep their own name; every other key is shown as a class.
        row_name = label if label in ('macro avg', 'weighted avg') else f'Class {label}'
        row = [
            row_name,
            metrics['precision'],
            metrics['recall'],
            metrics['f1-score'],
            metrics['support'],
        ]
    report_data.append(row)

headers = ["Metric", "Precision", "Recall", "F1-Score", "Support"]
table = tabulate(report_data, headers=headers, tablefmt="pretty")
print(table)



+--------------+--------------------+--------------------+--------------------+---------+
|    Metric    |     Precision      |       Recall       |      F1-Score      | Support |
+--------------+--------------------+--------------------+--------------------+---------+
|   Class 0    | 0.8731884057971014 | 0.9525691699604744 | 0.9111531190926278 |  759.0  |
|   Class 1    | 0.7024793388429752 | 0.4473684210526316 | 0.5466237942122186 |  190.0  |
|   accuracy   |                    |                    | 0.8514225500526871 |  949.0  |
|  macro avg   | 0.7878338723200383 | 0.699968795506553  | 0.7288884566524232 |  949.0  |
| weighted avg | 0.8390106157852111 | 0.8514225500526871 | 0.8381704302335363 |  949.0  |
+--------------+--------------------+--------------------+--------------------+---------+


In [9]:
# Example data: two hypothetical individuals, one row each, with the two
# columns passed to the model.
new_data = pd.DataFrame(
    [[25, 10], [40, 0]],
    columns=['age', 'priors_count'],
)

# Making predictions
predictions = model.predict(new_data)

# Translate each predicted class (1 vs. anything else) into a readable label.
prediction_labels = []
for pred in predictions:
    if pred == 1:
        prediction_labels.append('Likely to reoffend')
    else:
        prediction_labels.append('Less likely to reoffend')

# Print one line per individual, numbered from 1.
for position, label in enumerate(prediction_labels, start=1):
    print(f"Individual {position}: {label}")


Individual 1: Likely to reoffend
Individual 2: Less likely to reoffend
