# COMPAS Dataset Audit

This notebook audits the COMPAS Recidivism dataset for racial bias using AI Fairness 360, pandas, and matplotlib.  
See the summary and ethical discussion in `Compas_Dataset_Audit.md`.

In [None]:
# Install required packages
%pip install aif360 pandas matplotlib scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from aif360.datasets import CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')

Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


## 1. Load the COMPAS Dataset

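`CompasDataset()` reads `compas-scores-two-years.csv` from AIF360's own data directory, not from the working directory. Download the file from [ProPublica](https://projects.propublica.org/datastore/#compas-recidivism-risk-score-data-and-analysis) and place it in `aif360/data/raw/compas/` inside the installed `aif360` package before running the cell below.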

In [None]:
# Build the AIF360 COMPAS dataset object, then pull out its pandas view.
# convert_to_dataframe() returns a (DataFrame, attributes) pair; only the
# DataFrame is used by the rest of the notebook.
compas = CompasDataset()
df, compas_attrs = compas.convert_to_dataframe()
df.head()

## 2. Preprocess Data

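Focus on race as the protected attribute. AIF360's default `CompasDataset` encoding binarizes race as 1 = Caucasian (privileged) and 0 = not Caucasian (unprivileged), so the unprivileged group used here includes, but is not limited to, African-American defendants.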

In [None]:
# Group definitions consumed by the AIF360 metric classes.
# With AIF360's default CompasDataset mappings, race == 1 means 'Caucasian'
# and race == 0 means 'Not Caucasian' (every other race, not only
# African-American).
privileged_groups = [dict(race=1)]    # Caucasian
unprivileged_groups = [dict(race=0)]  # Not Caucasian

# How many rows fall into each race group
df['race'].value_counts()

## 3. Compute Fairness Metrics

- Disparate impact ratio
- Equal opportunity difference
- False positive and false negative rates by race

In [None]:
# Compute dataset-level fairness metrics on the observed labels
metric = BinaryLabelDatasetMetric(
    compas,
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups
)

disparate_impact = metric.disparate_impact()
print(f'Disparate Impact Ratio: {disparate_impact:.3f}')

# For equal opportunity and confusion matrix, need predictions
# Here, use the original labels as 'predictions' for demonstration
# NOTE: y_pred is identical to y_true, so any group containing both classes
# gets FPR = FNR = 0 and TPR = 1 by construction. Replace y_pred with real
# model or COMPAS-score predictions before interpreting the rates below.
y_true = compas.labels.ravel()
y_pred = compas.labels.ravel()

# Split by race
# CompasDataset carries two protected attributes by default (sex and race),
# so protected_attributes is an (n_samples, n_attributes) array. Ravelling it
# would interleave both columns and give masks whose length does not match
# y_true; pick the race column explicitly instead.
race_col = compas.protected_attribute_names.index('race')
race_values = compas.protected_attributes[:, race_col]
mask_white = race_values == 1
mask_black = race_values == 0

def rates(y_true, y_pred, mask, pos_label=1):
    """Return (FPR, FNR, TPR) for the subset of samples selected by ``mask``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels, aligned with ``y_true``.
    mask : boolean array-like
        Selects the samples (e.g. one demographic group) to evaluate.
    pos_label : scalar, default 1
        Label value treated as the positive class; everything else is
        treated as negative.

    Returns
    -------
    tuple of float
        ``(fpr, fnr, tpr)``. A rate whose denominator is zero (the group has
        no actual negatives, or no actual positives) is returned as NaN
        instead of raising.
    """
    actual_pos = np.asarray(y_true)[mask] == pos_label
    predicted_pos = np.asarray(y_pred)[mask] == pos_label

    # Count the four confusion-matrix cells directly so the result always has
    # all four entries, even when the group contains only one class.
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    fp = int(np.count_nonzero(~actual_pos & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
    tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

    def _ratio(numerator, denominator):
        # An undefined rate is reported as NaN rather than dividing by zero.
        return numerator / denominator if denominator else float('nan')

    fpr = _ratio(fp, fp + tn)
    fnr = _ratio(fn, fn + tp)
    tpr = _ratio(tp, tp + fn)
    return fpr, fnr, tpr

# Error rates for each racial group, unpacked as (FPR, FNR, TPR)
fpr_white, fnr_white, tpr_white = rates(y_true=y_true, y_pred=y_pred, mask=mask_white)
fpr_black, fnr_black, tpr_black = rates(y_true=y_true, y_pred=y_pred, mask=mask_black)

# Equal opportunity difference is reported here as privileged minus unprivileged TPR
equal_opp_diff = tpr_white - tpr_black

# Emit the three summary lines in one call, one line per metric
print(
    f'Equal Opportunity Difference (TPR White - TPR Black): {equal_opp_diff:.3f}',
    f'False Positive Rate (White): {fpr_white:.3f}, (Black): {fpr_black:.3f}',
    f'False Negative Rate (White): {fnr_white:.3f}, (Black): {fnr_black:.3f}',
    sep='\n',
)

## 4. Visualizations

- Bar plot comparing observed two-year recidivism rates (`two_year_recid`) across groups
- Confusion matrix visualizations

In [None]:
# Bar plot: observed two-year recidivism rate by race
# 'two_year_recid' is the recorded outcome label, not a COMPAS risk
# classification, so the group mean is a recidivism rate. The axis labels say
# so explicitly to avoid presenting outcomes as model predictions.
# The name risk_by_race is kept in case other cells refer to it.
risk_by_race = df.groupby('race')['two_year_recid'].mean()

fig, ax = plt.subplots()
risk_by_race.plot(kind='bar', color=['#4F81BD', '#C0504D'], ax=ax)
ax.set_title('Two-Year Recidivism Rate by Race')
ax.set_ylabel('Proportion Recidivating Within Two Years')
# Race encoding follows AIF360's default CompasDataset mapping.
ax.set_xlabel('Race (0=Not Caucasian, 1=Caucasian)')
plt.show()

In [None]:
# Draw one confusion matrix per group, each on its own figure
group_masks = {'White': mask_white, 'Black': mask_black}
for group_name, group_mask in group_masks.items():
    group_cm = confusion_matrix(y_true[group_mask], y_pred[group_mask])
    ConfusionMatrixDisplay(confusion_matrix=group_cm).plot()
    plt.title(f'Confusion Matrix: {group_name}')
    plt.show()

---

For summary, ethical implications, and mitigation steps, see `Compas_Dataset_Audit.md`.