## Exploration of base dataset - Compas

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display

In [2]:
COMPAS_PATH = Path("/opt/anaconda3/lib/python3.11/site-packages/aif360/data/raw/compas/compas-scores-two-years.csv")
df = pd.read_csv(COMPAS_PATH)

In [3]:
pd.set_option('display.max_colwidth', None)
top_n = 5
rows = []
for col in df.columns:
    feature_label = 'Target: two_year_recid' if col == 'two_year_recid' else col
    ser = df[col]

    if pd.api.types.is_numeric_dtype(ser):
        rows.append({
            'Feature': feature_label,
            'Type': 'Continuous',
            'Min': ser.min(),
            'Max': ser.max(),
            'Mean': round(ser.mean(), 2),
            'Std': round(ser.std(), 2),
            'Values': ''
        })
    else:
        counts = ser.dropna().value_counts()
        if counts.size > top_n:
            vals = counts.index[:top_n].tolist() + ['Other']
        else:
            vals = counts.index.tolist()
        rows.append({
            'Feature': feature_label,
            'Type': 'Categorical',
            'Min': '',
            'Max': '',
            'Mean': '',
            'Std': '',
            'Values': ", ".join(map(str, vals))
        })

summary_df = pd.DataFrame(rows)[['Feature', 'Type', 'Min', 'Max', 'Mean', 'Std', 'Values']]

# Reset index to start at 1
summary_df.index = range(1, len(summary_df) + 1)
summary_df
## for appendix 

Unnamed: 0,Feature,Type,Min,Max,Mean,Std,Values
1,id,Continuous,1.0,11001.0,5501.26,3175.71,
2,name,Categorical,,,,,"anthony smith, christopher hamilton, carlos vasquez, leroy battie, roderick thomas, Other"
3,first,Categorical,,,,,"michael, christopher, james, anthony, robert, Other"
4,last,Categorical,,,,,"williams, johnson, brown, smith, jones, Other"
5,compas_screening_date,Categorical,,,,,"2013-02-20, 2013-03-20, 2013-02-07, 2013-04-20, 2013-01-03, Other"
6,sex,Categorical,,,,,"Male, Female"
7,dob,Categorical,,,,,"1987-12-21, 1990-02-22, 1990-05-02, 1994-07-15, 1989-04-27, Other"
8,age,Continuous,18.0,96.0,34.82,11.89,
9,age_cat,Categorical,,,,,"25 - 45, Greater than 45, Less than 25"
10,race,Categorical,,,,,"African-American, Caucasian, Hispanic, Other, Asian, Other"


In [4]:
# List of features actually used in the experiment 
cols_used = [
    'age', 'race', 'juv_fel_count', 'juv_misd_count',
    'juv_other_count', 'priors_count', 'sex=Male',
    'c_charge_degree=M', 'c_charge_desc=Alcohol_dui',
    'c_charge_desc=Drug', 'c_charge_desc=Property',
    'c_charge_desc=Violent', 'c_charge_desc=Weapons',
    'Target: two_year_recid'
]

filtered_df = summary_df[summary_df['Feature'].isin(cols_used)].copy()
filtered_df.index = range(1, len(filtered_df) + 1)
filtered_df
## for paper

Unnamed: 0,Feature,Type,Min,Max,Mean,Std,Values
1,age,Continuous,18.0,96.0,34.82,11.89,
2,race,Categorical,,,,,"African-American, Caucasian, Hispanic, Other, Asian, Other"
3,juv_fel_count,Continuous,0.0,20.0,0.07,0.47,
4,juv_misd_count,Continuous,0.0,13.0,0.09,0.49,
5,juv_other_count,Continuous,0.0,17.0,0.11,0.5,
6,priors_count,Continuous,0.0,38.0,3.47,4.88,
7,Target: two_year_recid,Continuous,0.0,1.0,0.45,0.5,


In [5]:
# Meta info 
n_instances   = len(df)
n_features    = df.shape[1] - 1  
positive_rate = df['two_year_recid'].mean() * 100

meta = pd.DataFrame({
    'Instances':          [n_instances],
    'Features':           [n_features],
    'Positive-class (%)': [round(positive_rate,1)]
})
display(meta)


# 1) Race counts & percentages
race_summary = pd.DataFrame({
    'Count':          df['race'].value_counts(),
    'Percentage (%)': df['race'].value_counts(normalize=True).mul(100).round(1)
})
race_summary = race_summary.sort_index()
display(race_summary)


# 2) Positive-class rate by sex
sex_rates = (
    df
    .groupby('sex')['two_year_recid']
    .mean()
    .mul(100)
    .round(1)
)
sex_rates = sex_rates.sort_index()
sex_rates_df = sex_rates.to_frame('Positive-class (%)')
display(sex_rates_df)


# 3) Positive-class rate by race
race_rates = (
    df
    .groupby('race')['two_year_recid']
    .mean()
    .mul(100)
    .round(1)
)
race_rates = race_rates.sort_index()
race_rates_df = race_rates.to_frame('Positive-class (%)')
display(race_rates_df)


# 4) Positive-class rate by race_group (Caucasian vs. Non-Caucasian)
df['race_group'] = np.where(df['race'] == 'Caucasian', 'Caucasian', 'Non-Caucasian')
racegrp_rates = (
    df
    .groupby('race_group')['two_year_recid']
    .mean()
    .mul(100)
    .round(1)
)
racegrp_rates = racegrp_rates.sort_index()
racegrp_rates_df = racegrp_rates.to_frame('Positive-class (%)')
display(racegrp_rates_df)


# 5) Intersectional 2×2 matrix
matrix = (
    df
    .assign(recid=df['two_year_recid'])
    .pivot_table(
        index='race_group',
        columns='sex',
        values='recid',
        aggfunc='mean'
    )
    .mul(100)
    .round(1)
)
matrix = matrix.sort_index().sort_index(axis=1)
matrix.index.name   = 'Race Group'
matrix.columns.name = 'Sex'
display(matrix)


Unnamed: 0,Instances,Features,Positive-class (%)
0,7214,52,45.1


Unnamed: 0_level_0,Count,Percentage (%)
race,Unnamed: 1_level_1,Unnamed: 2_level_1
African-American,3696,51.2
Asian,32,0.4
Caucasian,2454,34.0
Hispanic,637,8.8
Native American,18,0.2
Other,377,5.2


Unnamed: 0_level_0,Positive-class (%)
sex,Unnamed: 1_level_1
Female,35.7
Male,47.3


Unnamed: 0_level_0,Positive-class (%)
race,Unnamed: 1_level_1
African-American,51.4
Asian,28.1
Caucasian,39.4
Hispanic,36.4
Native American,55.6
Other,35.3


Unnamed: 0_level_0,Positive-class (%)
race_group,Unnamed: 1_level_1
Caucasian,39.4
Non-Caucasian,48.0


Sex,Female,Male
Race Group,Unnamed: 1_level_1,Unnamed: 2_level_1
Caucasian,35.1,40.6
Non-Caucasian,36.1,50.5
