## Preprocessing - Reweighing -  Compas Model
- for 'sex' and 'race'

In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, project_root)

import pandas as pd
from src.data_loading import load_compas_race, load_compas_sex
from src.modeling import reweighing_train_and_predict
from src.metrics import compute_metrics, compare_viz_metrics_2x3, save_agg_metrics, save_raw_metrics
from sklearn.model_selection import StratifiedShuffleSplit

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


## Sex

In [2]:

protected         = 'sex'
privileged_value   = 1.0
unprivileged_value = 0.0

ad, df = load_adult_sex()
feature_cols = [c for c in df.columns if c not in (protected, 'label', 'sex', 'fnlwgt')]

# 1) Set up 25-fold stratified CV
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)
results = []

for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = reweighing_train_and_predict(
        ad, df,
        train_idx, test_idx,
        protected, privileged_value, unprivileged_value
    )
    m = compute_metrics(
        test_df, y_test, y_pred,
        protected, privileged_value, unprivileged_value
    )
    results.append(m)

adult_sex_metrics = pd.DataFrame(results)
adult_sex_metrics_agg = adult_sex_metrics.agg(['mean', 'std'])

NameError: name 'load_adult_sex' is not defined

In [None]:
# 1) Retrieve data
protected = 'sex'
privileged_value   = 0.0
unprivileged_value = 1.0

cd, df = load_compas_sex()
feature_cols = [c for c in df.columns if c not in ('label','sex')]

# 2) Set up 25-fold stratified CV
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)
results = []

for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = reweighing_train_and_predict(
        cd, df,
        train_idx, test_idx,
        protected, privileged_value, unprivileged_value
    )
    m = compute_metrics(
        test_df, y_test, y_pred,
        protected, privileged_value, unprivileged_value
    )
    results.append(m)

# 3) Aggregate results
compas_sex_metrics = pd.DataFrame(results)
compas_sex_metrics_agg = compas_sex_metrics.agg(['mean', 'std'])

In [None]:
compas_sex_metrics_agg

## Race

In [None]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 0.0
unprivileged_value = 1.0

cd, df = load_compas_race()
feature_cols = [c for c in df.columns if c not in ('label','race')]

# 2) Set up 25-fold stratified CV
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)
results = []

for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = reweighing_train_and_predict(
        cd, df,
        train_idx, test_idx,
        protected, privileged_value, unprivileged_value
    )
    m = compute_metrics(
        test_df, y_test, y_pred,
        protected, privileged_value, unprivileged_value
    )
    results.append(m)

# 3) Aggregate results
compas_race_metrics = pd.DataFrame(results)
compas_race_metrics_agg = compas_race_metrics.agg(['mean', 'std'])

In [None]:
compas_race_metrics_agg

---------------------------------
## compare with baseline, create plots

In [None]:
baseline_race_agg = pd.read_csv('../../reports/baseline_agg/compas_race_metrics_agg.csv', index_col=0)
baseline_sex_agg = pd.read_csv('../../reports/baseline_agg/compas_sex_metrics_agg.csv', index_col=0)

In [None]:
compare_viz_metrics_2x3(baseline_race_agg, compas_race_metrics_agg, 'Baseline', 'Race', 'Compas Baseline - Race')

In [None]:
compare_viz_metrics_2x3(baseline_sex_agg, compas_sex_metrics_agg, 'Baseline', 'Sex', 'Compas Baseline - Sex')

In [None]:
save_agg_metrics('compas', 'reweighing', compas_race_metrics_agg, compas_sex_metrics_agg)
save_raw_metrics('compas', 'reweighing', compas_race_metrics, compas_sex_metrics)