## Exploratory Preprocessing
- compas for 'sex' and 'race'

In [None]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, project_root)

In [None]:
from aif360.datasets import CompasDataset
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

pd.set_option('display.max_columns', None)

In [None]:

def load_compas_race(custom_preprocessing=None):
    ds = CompasDataset(
        protected_attribute_names=['race'],
        privileged_classes=[['Caucasian']],
        features_to_drop=[''],
        categorical_features=[
            'sex', 'age_cat', 'c_charge_degree', 'c_charge_desc'
        ],
        custom_preprocessing=custom_preprocessing
    )
    df = pd.DataFrame(ds.features, columns=ds.feature_names)
    df['label'] = ds.labels.ravel()
    df['race']  = ds.protected_attributes[:, 0]
    return ds, df


In [None]:
# 1) Retrieve data
cd, df = load_compas_race()
df

- the 29 removed have missing recidivism, need to be dropped

# add preprocessor
-group categoricals
-remoce age cat

In [None]:
def preprocessing_compas(df):


    return df

def load_compas_race(custom_preprocessing=preprocessing_compas):
    ds = CompasDataset(
        protected_attribute_names=['race'],
        privileged_classes=[['Caucasian']],
        features_to_drop=[''],
        categorical_features=[
            'sex', 'age_cat', 'c_charge_degree', 'c_charge_desc'
        ],
        custom_preprocessing=custom_preprocessing
    )
    df = pd.DataFrame(ds.features, columns=ds.feature_names)
    df['label'] = ds.labels.ravel()
    df['race']  = ds.protected_attributes[:, 0]
    return ds, df

In [None]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, project_root)

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from src.modeling import train_and_predict
from src.metrics import compute_metrics

In [None]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_race()
feature_cols = ds.feature_names

In [None]:
protected = 'race'   

variants = {
    'continuous_age': [
        c for c in df.columns
        if c not in ('label', protected)
        and not c.startswith('age_cat=')
    ],
    'binned_age': [
        c for c in df.columns
        if c not in ('label', protected, 'age')
    ]
}

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)
results = {}
for name, cols in variants.items():
    res = []
    for train_idx, test_idx in sss.split(df, df['label']):
        test_df, y_test, y_pred = train_and_predict(df, cols, train_idx, test_idx)
        res.append(compute_metrics(
             test_df, y_test, y_pred,
             protected, privileged_value, unprivileged_value
        ))
    metrics_df        = pd.DataFrame(res)
    results[name]     = metrics_df.agg(['mean','std'])

print("Continuous-age results:\n", results['continuous_age'])
print("\nBinned-age results:\n",    results['binned_age'])

#### remove binned age categorical

In [None]:
df

In [None]:
def preprocessing_compas(df):


    return df

def load_compas_race(custom_preprocessing=preprocessing_compas):
    ds = CompasDataset(
        protected_attribute_names=['race'],
        privileged_classes=[['Caucasian']],
        features_to_drop=['age_cat'],
        categorical_features=[
            'sex', 'c_charge_degree', 'c_charge_desc'
        ],
        custom_preprocessing=custom_preprocessing
    )
    df = pd.DataFrame(ds.features, columns=ds.feature_names)
    df['label'] = ds.labels.ravel()
    df['race']  = ds.protected_attributes[:, 0]
    return ds, df

In [None]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_race()
feature_cols = ds.feature_names

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

results = []
for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = train_and_predict(
        df, feature_cols, train_idx, test_idx
    )
    m = compute_metrics(test_df, y_test, y_pred, protected, privileged_value, unprivileged_value)
    results.append(m)

# 3) Aggregate results
compas_race_metrics = pd.DataFrame(results)
compas_race_metrics_agg = compas_race_metrics.agg(['mean', 'std'])

In [None]:
compas_race_metrics_agg

#### check charges for grouping

In [None]:
# check charges 
charge_desc_cols = [c for c in df.columns if c.startswith('c_charge_desc=')]
counts = df[charge_desc_cols].sum().sort_values(ascending=False)
counts = counts[counts > 0]
counts.index = counts.index.str.replace('c_charge_desc=', '')
counts_df = counts.reset_index()
counts_df.columns = ['charge_desc', 'count']
print(counts_df.to_string(index=False))

In [None]:
def preprocessing_compas(df):
    # 4) Charges simplification
    violent = {'assault','battery','murder','manslaughter'}
    property = {'theft','burglary','robbery','arson','trespass'}
    drug = {'possession','traff','deliver','cocaine', 'heroin','marijuana','meth','opioid'}
    alcohol_dui = {'dui','dwi','alcohol','intoxicated'}
    weapons = {'weapon','firearm','gun','deadly'}
    
    def charge_group(x):
        if not isinstance(x, str):
            return 'Other'
        txt = x.lower()
        if any(k in txt for k in violent):
            return 'Violent'
        if any(k in txt for k in property):
            return 'Property'
        if any(k in txt for k in drug):
            return 'Drug'
        if any(k in txt for k in alcohol_dui):
            return 'Alcohol_dui'
        if any(k in txt for k in weapons):
            return 'Weapons'
        return 'Other'
    df['c_charge_desc'] = df['c_charge_desc'].apply(charge_group)

    return df

def load_compas_race(custom_preprocessing=preprocessing_compas):
    ds = CompasDataset(
        protected_attribute_names=['race'],
        privileged_classes=[['Caucasian']],
        features_to_drop=['age_cat'],
        categorical_features=[
            'sex', 'c_charge_degree', 'c_charge_desc'
        ],
        custom_preprocessing=custom_preprocessing
    )
    df = pd.DataFrame(ds.features, columns=ds.feature_names)
    df['label'] = ds.labels.ravel()
    df['race']  = ds.protected_attributes[:, 0]
    return ds, df

In [None]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_race()
feature_cols = ds.feature_names

In [None]:
df.iloc[:,:6].describe()
# race is binary, ignore
# robust scaler!

In [None]:
print(df['race'].value_counts())

#### check metrics with binned

In [None]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_race()
feature_cols = ds.feature_names

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

results = []
for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = train_and_predict(
        df, feature_cols, train_idx, test_idx
    )
    m = compute_metrics(test_df, y_test, y_pred, protected, privileged_value, unprivileged_value)
    results.append(m)

# 3) Aggregate results
compas_race_metrics = pd.DataFrame(results)
compas_race_metrics_agg = compas_race_metrics.agg(['mean', 'std'])

In [None]:
compas_race_metrics_agg

### drop one-hot-encoded extras, final check
- aifs30 doesn't allow to drop first/last during encoding

In [None]:
def preprocessing_compas(df):
    # 4) Charges simplification
    violent = {'assault','battery','murder','manslaughter'}
    property = {'theft','burglary','robbery','arson','trespass'}
    drug = {'possession','traff','deliver','cocaine', 'heroin','marijuana','meth','opioid'}
    alcohol_dui = {'dui','dwi','alcohol','intoxicated'}
    weapons = {'weapon','firearm','gun','deadly'}
    
    def charge_group(x):
        if not isinstance(x, str):
            return 'Other'
        txt = x.lower()
        if any(k in txt for k in violent):
            return 'Violent'
        if any(k in txt for k in property):
            return 'Property'
        if any(k in txt for k in drug):
            return 'Drug'
        if any(k in txt for k in alcohol_dui):
            return 'Alcohol_dui'
        if any(k in txt for k in weapons):
            return 'Weapons'
        return 'Other'
    df['c_charge_desc'] = df['c_charge_desc'].apply(charge_group)

    return df

def load_compas_race(custom_preprocessing=preprocessing_compas):
    ds = CompasDataset(
        protected_attribute_names=['race'],
        privileged_classes=[['Caucasian']],
        features_to_drop=['age_cat'],
        categorical_features=[
            'sex', 'c_charge_degree', 'c_charge_desc'
        ],
        custom_preprocessing=custom_preprocessing
    )
    df = pd.DataFrame(ds.features, columns=ds.feature_names)
    df['label'] = ds.labels.ravel()
    df['race']  = ds.protected_attributes[:, 0]

    df.drop(    
        ['c_charge_desc=Other', 'sex=Female', 'c_charge_degree=F'],
        axis=1,
        inplace=True
    )

    return ds, df

In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, project_root)

In [2]:

import pandas as pd
from src.data_loading import load_compas_sex, load_compas_race
from src.modeling import train_and_predict
from src.metrics import compute_metrics

from sklearn.model_selection import StratifiedShuffleSplit

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [3]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_race()
feature_cols = [c for c in df.columns if c not in ('label','race')]

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

results = []
for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = train_and_predict(
        df, feature_cols, train_idx, test_idx
    )
    m = compute_metrics(test_df, y_test, y_pred, protected, privileged_value, unprivileged_value)
    results.append(m)

# 3) Aggregate results
compas_race_metrics = pd.DataFrame(results)
compas_race_metrics_agg = compas_race_metrics.agg(['mean', 'std'])

In [4]:
compas_race_metrics_agg

Unnamed: 0,accuracy,f1_score,SPD,DI,EOD,AOD
mean,0.67307,0.597067,0.179721,1.749589,0.208976,0.159877
std,0.010989,0.012907,0.024342,0.143759,0.038348,0.023623


In [5]:
df

Unnamed: 0,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,sex=Male,c_charge_degree=M,c_charge_desc=Alcohol_dui,c_charge_desc=Drug,c_charge_desc=Property,c_charge_desc=Violent,c_charge_desc=Weapons,label
0,69.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,34.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,24.0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,23.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,43.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,23.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7210,23.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7211,57.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7212,33.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# 1) Retrieve data
protected = 'sex'
privileged_value   = 1.0
unprivileged_value = 0.0

ds, df = load_compas_sex()
feature_cols = [c for c in df.columns if c not in ('label','sex')]

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

results = []
for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = train_and_predict(
        df, feature_cols, train_idx, test_idx
    )
    m = compute_metrics(test_df, y_test, y_pred, protected, privileged_value, unprivileged_value)
    results.append(m)

# 3) Aggregate results
compas_sex_metrics = pd.DataFrame(results)
compas_sex_metrics_agg = compas_sex_metrics.agg(['mean', 'std'])

In [7]:
compas_sex_metrics_agg

Unnamed: 0,accuracy,f1_score,SPD,DI,EOD,AOD
mean,0.674401,0.601044,-0.284224,0.32498,-0.331084,-0.263587
std,0.010802,0.013216,0.030753,0.059045,0.05555,0.034169


In [8]:
df

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,race=African-American,race=Asian,race=Caucasian,race=Hispanic,race=Native American,c_charge_degree=M,c_charge_desc=Alcohol_dui,c_charge_desc=Drug,c_charge_desc=Property,c_charge_desc=Violent,c_charge_desc=Weapons,label,race
0,1.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,34.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,1.0,24.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,1.0,23.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,43.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1.0,23.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7210,1.0,23.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7211,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7212,0.0,33.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
