In [21]:
run_no = 0
universe_id = "test"
universe = {
    #"scale": "scale", # "scale", "do-not-scale",
    #"encode_categorical": "one-hot", # "ordinal", "one-hot"
    "model": "logreg", # "logreg", "penalized_logreg", "rf",
    "cutoff": ["quantile_0.15", "quantile_0.30"],
    "exclude_features": "nationality", # "none", "nationality", "sex", "nationality-sex"
    "exclude_subgroups": "drop-non-german", # "keep-all", "drop-non-german"
}

output_dir="./output"
seed=0

In [22]:
import json
# Parse universe into dict if it is passed as a string
if isinstance(universe, str):
    universe = json.loads(universe)

In [23]:
# Auto-reload the custom package
%load_ext autoreload
%autoreload 1
%aimport fairness_multiverse

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from fairness_multiverse.universe import UniverseAnalysis

universe_analysis = UniverseAnalysis(
    run_no = run_no,
    universe_id = universe_id,
    universe = universe,
    output_dir=output_dir,
)

In [25]:
import numpy as np
parsed_seed = int(seed)
np.random.seed(parsed_seed)
print(f"Using Seed: {parsed_seed}")

Using Seed: 0


# Loading Data

Load siab_train, siab_test, siab_calib and/or 
load siab_train_features, siab_train_labels

In [6]:
from pathlib import Path
import pandas as pd

# File paths
raw_file = Path("data/raw/siab.csv")
cache_file = Path("data/siab_cached.csv.gz")

# Ensure cache directory exists
cache_file.parent.mkdir(parents=True, exist_ok=True)

# Load with simple caching
if cache_file.exists():
    print(f"Loading SIAB data from cache: {cache_file}")
    siab = pd.read_csv(cache_file, compression='gzip')
else:
    print(f"Cache not found. Reading raw SIAB data: {raw_file}")
    siab = pd.read_csv(raw_file)
    siab.to_csv(cache_file, index=False, compression='gzip')
    print(f"Cached SIAB data to: {cache_file}")

# Now use `siab` DataFrame as needed
print(siab.shape)


Loading SIAB data from cache: data/siab_cached.csv.gz
(643690, 164)


In [26]:
X_train = pd.read_csv("./data/X_train.csv")
y_train = pd.read_csv("./data/y_train.csv")

In [None]:
siab

Pre-Processing for Selected Task -> skipped. I think I don't need this

# Preprocessing Data

In [27]:
# Exclude protected features
# "exclude_features": "none", # "nationality", "sex", "nationality-sex"

# ToDo: incorporate maxdeutsch.Missing into maxdeutsch1

excluded_features = universe["exclude_features"].split("-") # split, e.g.: "nationality-sex" -> ["nationality", "sex"]
excluded_features_dictionary = {
    "nationality": ["maxdeutsch1", "maxdeutsch.Missing."],
    "sex": ["frau1"],
}


In [28]:
# Code nice names to column names
# ???? empty

excluded_features_columns = [
    excluded_features_dictionary[f] for f in excluded_features if len(f) > 0 and f != "none"
]

In [29]:
from utils import flatten_once

excluded_features_columns = flatten_once(excluded_features_columns)

In [30]:
if len(excluded_features_columns) > 0:
    print(f"Dropping features: {excluded_features_columns}")
    X_train.drop(excluded_features_columns, axis=1, inplace=True)

Dropping features: ['maxdeutsch1', 'maxdeutsch.Missing.']


In [None]:
# Split data ???

In [12]:
# Exclude Certain Subgroups
# ??? just training or also calibration? 

# Extract configuration
exclude_subgroups_config = universe["exclude_subgroups"].split("_")

In [13]:
if len(exclude_subgroups_config) == 1:
    exclude_subgroups_config = (exclude_subgroups_config[0], None, None)

In [14]:
excl_subgroups_method, excl_subgroup_colname, excl_subgroups_value = exclude_subgroups_config

In [15]:
if excl_subgroup_colname == "nationality":
    excl_subgroup_column = ["maxdeutsch1", "maxdeutsch.Missing."]
    excl_subgroup_counts = org_train[excl_subgroup_column].value_counts()
elif excl_subgroups_method != "keep-all":
    raise Exception("Unsupported configuration for exclude_subgroups:" + universe["exclude_subgroups"])



In [None]:
if excl_subgroups_method == "keep-all":
    # Don't need to do anything
    excl_subgroup_column = None
    excl_subgroup_values = []
else:
    if excl_subgroups_method == "drop-smallest":
        drop_smallest_n = int(excl_subgroups_value)
        excl_subgroup_values = list(excl_subgroup_counts.tail(drop_smallest_n).index)
    elif excl_subgroups_method == "keep-largest":
        keep_largest_n = int(excl_subgroups_value)
        excl_subgroup_values = list(excl_subgroup_counts.tail(
            len(excl_subgroup_counts) - keep_largest_n
        ).index)
    elif excl_subgroups_method == "drop-name":
        excl_subgroup_values = [excl_subgroups_value]
    else:
        raise Exception("Unsupported configuration for exclude_subgroups:" + universe["exclude_subgroups"])

    if excl_subgroup_column is not None:
        print(f"Dropping values: {excl_subgroup_values}")
        keep_rows_mask = ~org_train[excl_subgroup_column].isin(excl_subgroup_values)

    n_rows_to_drop = (~keep_rows_mask).sum()
    if n_rows_to_drop > 0:
        print(f"Dropping N = {n_rows_to_drop} ({n_rows_to_drop / len(keep_rows_mask):.2%}) rows from {excl_subgroup_colname}")
        X_train = X_train[keep_rows_mask]
        y_train = y_train[keep_rows_mask]
        group_train = group_train[keep_rows_mask]

In [None]:
# gpt

subgrp_method = universe["exclude_subgroups"]

# If we're not keeping all, define a mask to drop any non-German nationals
if subgrp_method == "keep-all":
    # No changes
    X_train_filtered = X_train.copy()
    y_train_filtered = y_train.copy()
    group_train_filtered = group_train.copy()

elif subgrp_method == "drop-non-german":
    # Assume 'nationality' is encoded in column "maxdeutsch1" with value 1 == German
    mask_is_german = (org_train["maxdeutsch1"] == 1)
    n_drop = (~mask_is_german).sum()
    pct_drop = n_drop / len(mask_is_german)
    print(f"Dropping N = {n_drop} ({pct_drop:.2%}) non-German rows")

    X_train_filtered = X_train[mask_is_german]
    y_train_filtered = y_train[mask_is_german]
    group_train_filtered = group_train[mask_is_german]

else:
    raise ValueError(f"Unsupported configuration for exclude_subgroups: {subgrp_method}")

# Now reassign (or continue working with) the filtered sets:
X_train, y_train, group_train = X_train_filtered, y_train_filtered, group_train_filtered

In [31]:
# gpt2
# Would that still work if protected features are excluded and non-germans should be dropped?

if universe["exclude_subgroups"] == "keep-all":
    keep_mask = pd.Series(True, index=X_train.index)

elif universe["exclude_subgroups"] == "drop-non-german":
    # keep only rows where maxdeutsch1 == 1 (German nationals)
    # this will automatically drop both non-German (0) and Missing (NaN or 0+Missing flag)
    keep_mask = X_train["maxdeutsch1"] == 1

    # optional: if maxdeutsch1==0 but maxdeutsch.Missing.==1,
    # you might want to treat those separately (e.g. impute or tag).
    # But for “drop-non-german” we just drop everything not ==1.
    
    n_drop = (~keep_mask).sum()
    print(f"Dropping N = {n_drop} ({n_drop/len(keep_mask):.2%}) non-German or missing rows")

else:
    raise ValueError(f"Unsupported exclude_subgroups: {universe['exclude_subgroups']}")

# --- 3) Apply the mask ---
X_train   = X_train[keep_mask]
y_train   = y_train[keep_mask]
#group_train = group_train[keep_mask]

KeyError: 'maxdeutsch1'