In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the COMPAS dataset (path is relative to the notebook's working directory).
compas = pd.read_csv('./data/compas.csv')

# Binary target: 1 when v_decile_score is above 5, otherwise 0.
# Vectorized comparison instead of a row-wise apply; missing scores compare
# as False and therefore map to 0, exactly as the lambda version did.
# NOTE(review): this label is derived from a risk *score* column, not from an
# observed outcome, so the model below learns to reproduce that score's
# high/low split -- confirm that "reoffend" is the intended name/meaning.
compas['reoffend'] = (compas['v_decile_score'] > 5).astype('int64')

# Selecting features and target
features = compas[['age', 'priors_count']]
target = compas['reoffend']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Initializing and training the logistic regression model
model = LogisticRegression()
model.fit(train_features, train_target)

# Predicted class (0 or 1) for every held-out row
predicted_classes = model.predict(test_features)

# Evaluating the model on the held-out rows
accuracy = accuracy_score(test_target, predicted_classes)
report = classification_report(test_target, predicted_classes)
conf_matrix = confusion_matrix(test_target, predicted_classes)

# print() renders the report's line breaks; a bare `report` expression would
# display the escaped one-line string repr instead.
print(report)

'              precision    recall  f1-score   support\n\n           0       0.85      0.91      0.88      2603\n           1       0.74      0.60      0.66      1056\n\n    accuracy                           0.82      3659\n   macro avg       0.79      0.76      0.77      3659\nweighted avg       0.82      0.82      0.82      3659\n'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

# Compare two classifiers that differ ONLY in their inputs:
#   old model: age + priors_count
#   new model: age + priors_count + one-hot encoded race
# Both are LogisticRegression so that any accuracy difference can be attributed
# to the added feature rather than to a change of estimator. (Previously the
# new model was a LinearRegression thresholded at 0.5, i.e. a regressor used as
# a classifier, which confounded the comparison.)
# Expects the `compas` DataFrame to have been loaded earlier in the notebook.

# Binary target: 1 when v_decile_score is above 5, otherwise 0.
# Re-assigning the column is idempotent, so re-running this cell is safe.
compas['reoffend'] = (compas['v_decile_score'] > 5).astype('int64')

# ---- Old model -------------------------------------------------------------
features_old = compas[['age', 'priors_count']]
target_old = compas['reoffend']

# Splitting the data into training and testing sets for the old model
train_features_old, test_features_old, train_target_old, test_target_old = train_test_split(
    features_old, target_old, test_size=0.2, random_state=42
)

# Initializing and training the old logistic regression model
model_old = LogisticRegression()
model_old.fit(train_features_old, train_target_old)

# Making predictions for the old model
predicted_classes_old = model_old.predict(test_features_old)

# Evaluating the old model
accuracy_old = accuracy_score(test_target_old, predicted_classes_old)
print("Accuracy of the old model:", accuracy_old)

# ---- New model (adds 'race') -----------------------------------------------
features_new = compas[['age', 'priors_count', 'race']]
target_new = compas['reoffend']

# One-hot encode 'race'. drop_first=True drops one category as the reference
# level; dtype=int yields 0/1 integer columns so the design matrix is purely numeric.
features_new = pd.get_dummies(features_new, columns=['race'], drop_first=True, dtype=int)

# Same test_size and random_state as the old model's split, so both models
# should be evaluated on the same held-out rows.
train_features_new, test_features_new, train_target_new, test_target_new = train_test_split(
    features_new, target_new, test_size=0.2, random_state=42
)

# Initializing and training the new logistic regression model
model_new = LogisticRegression()
model_new.fit(train_features_new, train_target_new)

# Predicted probability of class 1 for each held-out row
predicted_values_new = model_new.predict_proba(test_features_new)[:, 1]

# Converting predicted probabilities to binary classes
predicted_classes_new = [1 if val > 0.5 else 0 for val in predicted_values_new]

# Evaluating the new model
accuracy_new = accuracy_score(test_target_new, predicted_classes_new)
print("Accuracy of the new model:", accuracy_new)

# Comparing the models on held-out accuracy
if accuracy_new > accuracy_old:
    print("The new model performs better.")
elif accuracy_new < accuracy_old:
    print("The old model performs better.")
else:
    print("Both models have the same accuracy.")


In [None]:
# Reformatting the classification report for better readability
from tabulate import tabulate

# Ask scikit-learn for the report as a nested dict rather than a preformatted string
report_dict = classification_report(test_target, predicted_classes, output_dict=True)

# Metric keys, in the order they appear as table columns
metric_columns = ('precision', 'recall', 'f1-score', 'support')
# Aggregate rows keep their own name; every other dict entry is a class label
aggregate_rows = ('macro avg', 'weighted avg')

# One table row per entry of the report
report_data = []
for row_key, scores in report_dict.items():
    if row_key == 'accuracy':
        # The accuracy entry is a single number, not a dict of metrics: place it
        # in the F1-Score column and borrow the support from the macro average.
        row = ['accuracy', '', '', scores, report_dict['macro avg']['support']]
    else:
        row_label = row_key if row_key in aggregate_rows else f'Class {row_key}'
        row = [row_label] + [scores[column] for column in metric_columns]
    report_data.append(row)

# Column titles and the rendered table
headers = ["Metric", "Precision", "Recall", "F1-Score", "Support"]
table = tabulate(report_data, headers, tablefmt="pretty")

# Show the formatted table
print(table)



In [None]:
# Two illustrative individuals, one per row, with the same feature columns
# (and column order) the model was trained on
new_data = pd.DataFrame(
    [[25, 10], [40, 0]],
    columns=['age', 'priors_count'],
)

# Predicted class for each individual
predictions = model.predict(new_data)

# Translate each predicted class into a human-readable label
label_by_outcome = {
    True: 'Likely to reoffend',
    False: 'Less likely to reoffend',
}
prediction_labels = [label_by_outcome[bool(pred == 1)] for pred in predictions]

# Report the results, numbering individuals from 1
for position, label in enumerate(prediction_labels, start=1):
    print(f"Individual {position}: {label}")
