## Preprocessing - Reweighing -  Adult Model
- for 'sex' and 'race'

In [13]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, project_root)

import pandas as pd
from src.data_loading import load_adult_race, load_adult_sex
from src.modeling import train_and_predict
from src.metrics import compute_metrics, viz_metrics_2x3, compare_viz_metrics_2x3

from sklearn.model_selection import StratifiedShuffleSplit

## Sex

In [None]:
import pandas as pd
from aif360.algorithms.preprocessing import Reweighing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from src.modeling import get_default_model_pipeline

# 0) Load the preprocessed AIF360 Adult dataset + numeric DataFrame
ds, df = load_adult_sex()
protected         = 'sex'
privileged_value   = 1.0
unprivileged_value = 0.0

# drop label, protected, and fnlwgt from your features
feature_cols = [c for c in df.columns if c not in (protected, 'label', 'sex', 'fnlwgt')]

# 1) Set up 25-fold stratified CV
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)
results = []

for train_idx, test_idx in sss.split(df, df['label']):
    train_bld = ds.subset(train_idx)
    test_bld  = ds.subset(test_idx)

    # 3) Fit & apply REWEIGHING on the training split only
    rw = Reweighing(
        unprivileged_groups=[{protected: unprivileged_value}],
        privileged_groups=[{protected:   privileged_value}]
    )
    rw.fit(train_bld)
    train_transf = rw.transform(train_bld)

    X_tr = train_transf.features
    y_tr = train_transf.labels.ravel()
    w_tr = train_transf.instance_weights.ravel()

    X_te = test_bld.features
    y_te = test_bld.labels.ravel()

    # 5) Train with sample_weight = fnlwgt * reweigh_factor (for adult dataset)
    pipe = get_default_model_pipeline()
    pipe.fit(X_tr, y_tr, clf__sample_weight=w_tr)

    y_pred = pipe.predict(X_te)
    test_df = df.iloc[test_idx]
    
    m = compute_metrics(
        test_df, y_te, y_pred,
        protected, privileged_value, unprivileged_value
    )
    results.append(m)

adult_sex_metrics = pd.DataFrame(results)
adult_sex_metrics_agg = adult_sex_metrics.agg(['mean', 'std'])

      accuracy  f1_score       SPD        DI       EOD       AOD
mean  0.845065  0.638245 -0.089908  0.589545  0.137647  0.060430
std   0.002639  0.005547  0.008698  0.034020  0.025841  0.014497


## Race

In [2]:
# 1) Retrieve data
protected = 'race'
privileged_value   = 1.0
unprivileged_value = 0.0

ad, df = load_adult_race()
feature_cols = [c for c in df.columns if c not in ('label','race')]

# 2) Run experiment, Evaluate
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.2, random_state=42)

results = []
for train_idx, test_idx in sss.split(df, df['label']):
    test_df, y_test, y_pred = train_and_predict(
        df, feature_cols, train_idx, test_idx
    )
    m = compute_metrics(test_df, y_test, y_pred, protected, privileged_value, unprivileged_value)
    results.append(m)

# 3) Aggregate results
adult_race_metrics = pd.DataFrame(results)
adult_race_metrics_agg = adult_race_metrics.agg(['mean', 'std'])

In [3]:
adult_race_metrics_agg

Unnamed: 0,accuracy,f1_score,SPD,DI,EOD,AOD
mean,0.850605,0.656032,-0.084891,0.590458,-0.047565,-0.038272
std,0.002979,0.005996,0.005642,0.025659,0.022491,0.010729
