## Setup

In [1]:
import pandas as pd
import numpy as np
from joblib import load
import yaml

from utils import (
    compute_nc_scores,
    find_threshold,
    predict_conformal_sets,
    evaluate_sets
)

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
# Load universe definitions from YAML
with open("universes.yaml") as f:
    universes = yaml.safe_load(f)

In [3]:
# Determine unique model & feature_set combinations (each requires one model)
unique_combos = {(cfg["model"], cfg["feature_set"]) for cfg in universes}

## Data and Model Loading

In [4]:
X_calib_f = pd.read_csv("./output/X_calib_f.csv") # 2015, w. protected attributes
X_calib_s = pd.read_csv("./output/X_calib_s.csv") # 2015, w/o protected attributes
y_calib = pd.read_csv("./output/y_calib.csv").iloc[:,0]

X_test_f = pd.read_csv("./output/X_test_f.csv")
X_test_s = pd.read_csv("./output/X_test_s.csv")
y_test = pd.read_csv("./output/y_test.csv").iloc[:,0]

In [5]:
# Mapping from feature_set name to actual DataFrame, for convenience
feature_sets_calib = {
    "with_protected": X_calib_f,
    "without_protected": X_calib_s
}
feature_sets_test = {
    "with_protected": X_test_f,
    "without_protected": X_test_s
}

## Conformal

In [6]:
# Miscoverage level for conformal prediction (10% allowed error rate => 90% target coverage)
alpha = 0.10

In [7]:
# Helper function: given a trained model file and data, produce conformal prediction sets and metrics
def conformal_calibrate_and_evaluate(model_path, X_cal, y_cal, X_te, y_te, alpha):
    """Load model, compute conformal prediction sets on X_te using calibration set (X_cal, y_cal)."""
    model = load(model_path)
    # Compute nonconformity scores on calibration set (1 - probability of true class)
    probs_cal = model.predict_proba(X_cal)
    nc_scores = compute_nc_scores(probs_cal, y_cal)
    # Find conformal threshold q_hat for the given alpha (split conformal method)
    q_hat = find_threshold(nc_scores, alpha)
    # Generate prediction sets for each test example
    pred_sets = predict_conformal_sets(model, X_te, q_hat)
    # Evaluate coverage and average set size on test data
    metrics = evaluate_sets(pred_sets, y_te)
    return q_hat, metrics, pred_sets

In [8]:
# Run conformal prediction for each model type & feature set combination
conformal_results = {}  # to collect results for each universe combo
for model_type, feature_flag in sorted(unique_combos):
    # Identify the saved model file for this universe (matching training stage naming convention)
    model_filename = f"{model_type}_{feature_flag}.joblib"
    model_path = f"./models/{model_filename}"
    # Select the corresponding calibration and test feature sets
    X_cal = feature_sets_calib[feature_flag]
    X_te  = feature_sets_test[feature_flag]
    # Perform conformal calibration and evaluation
    q_hat, metrics, pred_sets = conformal_calibrate_and_evaluate(model_path, X_cal, y_calib, X_te, y_test, alpha)
    # Store results (coverage and average set size)
    conformal_results[(model_type, feature_flag)] = {
        "q_hat": q_hat,
        "coverage": metrics["coverage"],
        "avg_set_size": metrics["avg_size"]
    }
    # Print a summary of results for this model universe
    cov = metrics["coverage"]; avg = metrics["avg_size"]
    print(f"Model: {model_type} ({feature_flag} - Coverage: {cov:.3f}, Avg. Set Size: {avg:.2f}")

Model: logreg (with_protected - Coverage: 0.911, Avg. Set Size: 1.13
Model: logreg (without_protected - Coverage: 0.910, Avg. Set Size: 1.13
Model: penalized_logreg (with_protected - Coverage: 0.911, Avg. Set Size: 1.13
Model: penalized_logreg (without_protected - Coverage: 0.910, Avg. Set Size: 1.12
Model: rf (with_protected - Coverage: 0.914, Avg. Set Size: 1.14
Model: rf (without_protected - Coverage: 0.913, Avg. Set Size: 1.14


## Analyzing CP per group 

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [10]:
def analyze_cp_groups(cp_groups):
    """Compute accuracy, precision, recall, f1 per subgroup in cp_groups DataFrame."""

    # Evaluate pred_set if needed
    if isinstance(cp_groups['pred_set'].iloc[0], str):
        cp_groups['pred_set'] = cp_groups['pred_set'].apply(eval)

    # Define subgroups
    group_definitions = {
        'female': lambda df: df['frau1'] == 1,
        'male': lambda df: df['frau1'] == 0,
        'nongerman': lambda df: df['nongerman'] == 1,
        'german': lambda df: df['nongerman'] == 0,
        'nongerman_male': lambda df: df['nongerman_male'] == 1,
        'nongerman_female': lambda df: df['nongerman_female'] == 1
    }

    # Compute metrics per group
    results = {}
    for group, condition in group_definitions.items():
        mask = condition(cp_groups) & cp_groups['pred_set'].apply(lambda s: len(s) == 1)
        if not np.any(mask):
            results[group] = None
            continue
        y_true = cp_groups.loc[mask, 'true_label']
        y_pred = cp_groups.loc[mask, 'pred_set'].apply(lambda s: list(s)[0])
        results[group] = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred)
        }

    return pd.DataFrame(results).T

## Version 2 ???

In [12]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [13]:
def conformal_prediction_sets(model, X_calib, y_calib, X_test, alpha=0.1):
    """
    Given a trained model and data, produce conformal prediction sets for X_test.
    Uses X_calib and y_calib to compute nonconformity scores and threshold at level alpha.
    Returns a list of prediction sets (as Python sets) for each row in X_test.
    """
    # Compute nonconformity scores on calibration set: 1 - P_true_label:contentReference[oaicite:11]{index=11}
    probs_calib = model.predict_proba(X_calib)  # shape (n_calib, n_classes)
    nc_scores = 1.0 - probs_calib[np.arange(len(y_calib)), y_calib]  # compute 1 - P(correct class)
    # Determine quantile threshold for given alpha:contentReference[oaicite:12]{index=12}
    q_hat = np.quantile(nc_scores, 1 - alpha, method="higher")
    # Generate conformal prediction sets for each test sample:contentReference[oaicite:13]{index=13}
    probs_test = model.predict_proba(X_test)
    nonconf_test = 1.0 - probs_test  # nonconformity for each class on test
    pred_sets = [set(np.where(nc_row <= q_hat)[0]) for nc_row in nonconf_test]
    return pred_sets


In [14]:
def build_cp_groups(pred_sets, y_test, test_idx, X_test_full):
    """
    Construct a cp_groups DataFrame given conformal prediction sets, true labels, 
    and test indices. Merges in protected attributes from the full test set.
    """
    # Initialize DataFrame with test indices to align with X_test_full:contentReference[oaicite:14]{index=14}
    cp_df = pd.DataFrame(index=test_idx.copy())
    cp_df['pred_set']   = pd.Series(pred_sets, index=test_idx).apply(lambda s: {int(x) for x in s})
    cp_df['true_label'] = y_test.reindex(test_idx)
    # Merge protected attributes from the full test set:contentReference[oaicite:15]{index=15}
    cp_df['frau1']   = X_test_full.loc[test_idx, 'frau1']
    cp_df['nongerman'] = np.where(
        X_test_full.loc[test_idx, 'maxdeutsch1'] == 0, 
        1, 
        0
    )
    # Set 'nongerman' to NaN where nationality is missing:contentReference[oaicite:16]{index=16}
    if 'maxdeutsch.Missing.' in X_test_full.columns:
        missing_mask = (X_test_full.loc[test_idx, 'maxdeutsch.Missing.'] == 1)
        cp_df.loc[missing_mask, 'nongerman'] = np.nan
    # Derived subgroup flags:contentReference[oaicite:17]{index=17}
    cp_df['nongerman_male']   = np.where((cp_df['nongerman'] == 1) & (cp_df['frau1'] == 0), 1, 0)
    cp_df['nongerman_female'] = np.where((cp_df['nongerman'] == 1) & (cp_df['frau1'] == 1), 1, 0)
    # Drop any rows with NaN in subgroup columns (e.g., missing protected info):contentReference[oaicite:18]{index=18}
    cp_df = cp_df.dropna(subset=['nongerman'])
    return cp_df

In [15]:
def generate_all_cp_groups(universes, X_calib_f, X_calib_s, y_calib, 
                            X_test_f, X_test_s, y_test, alpha=0.1, models_dir="./models"):
    """
    Iterate over all universes and generate cp_groups DataFrame for each.
    Returns a dictionary mapping universe ID to cp_groups DataFrame.
    Also returns a combined DataFrame with an added 'universe_id' column for comparison (if needed).
    """
    cp_groups_dict = {}
    # Group universes by (model_type, feature_set) to reuse models/calibration
    from collections import defaultdict
    universe_groups = defaultdict(list)
    for cfg in universes:
        key = (cfg["model"], cfg["feature_set"])
        universe_groups[key].append(cfg)
    # Loop over each unique model configuration
    for (model_type, feature_flag), cfg_list in universe_groups.items():
        # Load the trained model for this config
        model_path = f"{models_dir}/{model_type}_{feature_flag}.joblib"
        model = load(model_path)
        # Select the appropriate calibration and test sets based on feature_flag
        X_calib = X_calib_f if feature_flag == "with_protected" else X_calib_s
        X_test  = X_test_f  if feature_flag == "with_protected" else X_test_s
        # Compute conformal prediction sets for this model
        pred_sets = conformal_prediction_sets(model, X_calib, y_calib.to_numpy(), X_test, alpha)
        # Build the base cp_groups DataFrame for this model (without universe ID yet)
        cp_df_base = build_cp_groups(pred_sets, y_test, X_test.index, X_test_f)
        # Assign this cp_groups DataFrame to each universe variant (threshold policy) in cfg_list
        for cfg in cfg_list:
            uid = cfg["id"]
            # We make a copy so each universe ID has an independent DataFrame (to avoid aliasing)
            cp_groups_dict[uid] = cp_df_base.copy()
            cp_groups_dict[uid]["universe_id"] = uid  # tag the universe ID (useful if combining)
    # Combine all cp_groups into one DataFrame (optional)
    combined_cp = pd.concat(cp_groups_dict.values(), ignore_index=True)
    # Also add universe_id in combined (ensure present even if empty DataFrames were concatenated)
    if "universe_id" not in combined_cp.columns:
        combined_cp.insert(0, "universe_id", combined_cp.index.map(lambda i: list(cp_groups_dict.keys())[i] ))
    return cp_groups_dict, combined_cp


In [16]:
def compute_subgroup_metrics(cp_df):
    """
    Compute accuracy, precision, recall, and F1 for overall and each subgroup in cp_df.
    Only single-label prediction sets are counted as definite predictions; 
    ambiguous sets (both classes) are ignored in metric calculations.
    Returns a DataFrame with metrics for 'overall' and each subgroup.
    """
    results = {}
    # Convert Series to list/array for processing
    pred_sets_list = cp_df['pred_set'].tolist()
    y_true = np.array(cp_df['true_label'])
    # Boolean array for non-ambiguous predictions (prediction set of size 1):contentReference[oaicite:19]{index=19}
    is_single = np.array([len(s) == 1 for s in pred_sets_list])
    # Overall metrics (on non-ambiguous predictions only)
    if np.any(is_single):
        y_true_single = y_true[is_single]
        # Extract the single predicted label from each singleton set
        y_pred_single = [next(iter(s)) for s, flag in zip(pred_sets_list, is_single) if flag]
        tn, fp, fn, tp = confusion_matrix(y_true_single, y_pred_single, labels=[0,1]).ravel()
        precision, recall, f1, _ = precision_recall_fscore_support(y_true_single, y_pred_single, average="binary", zero_division=0)
        results["overall"] = {
            "TP": tp, "TN": tn, "FP": fp, "FN": fn,
            "Accuracy": (tp + tn) / (tp + tn + fp + fn) if (tp+tn+fp+fn) > 0 else 0.0,
            "Precision": precision, "Recall": recall, "F1": f1,
            "Num_Samples": len(y_true_single), 
            "Frac_NonAmbiguous": np.mean(is_single)  # fraction of test samples with a single-label prediction
        }
    else:
        results["overall"] = {"TP":0,"TN":0,"FP":0,"FN":0,"Accuracy":0,"Precision":0,"Recall":0,"F1":0,
                               "Num_Samples": 0, "Frac_NonAmbiguous": 0.0}
    # Compute metrics for each subgroup (using only cases where subgroup flag == 1)
    subgroup_cols = ["frau1", "nongerman", "nongerman_male", "nongerman_female"]
    for col in subgroup_cols:
        mask = (cp_df[col] == 1).to_numpy() & is_single  # only consider subgroup members with definite predictions
        if np.any(mask):
            y_true_sub = y_true[mask]
            y_pred_sub = [next(iter(s)) for s, flag in zip(pred_sets_list, mask) if flag]
            tn, fp, fn, tp = confusion_matrix(y_true_sub, y_pred_sub, labels=[0,1]).ravel()
            precision, recall, f1, _ = precision_recall_fscore_support(y_true_sub, y_pred_sub, average="binary", zero_division=0)
            results[col] = {
                "TP": tp, "TN": tn, "FP": fp, "FN": fn,
                "Accuracy": (tp + tn) / (tp + tn + fp + fn) if (tp+tn+fp+fn) > 0 else 0.0,
                "Precision": precision, "Recall": recall, "F1": f1,
                "Num_Samples": len(y_true_sub),
                "Frac_NonAmbiguous": np.mean(is_single[cp_df[col] == 1])  # fraction of subgroup's samples with singleton pred
            }
        else:
            # No definite predictions for this subgroup (or subgroup empty after dropna)
            results[col] = None
    # Convert results to DataFrame for a tidy display
    metrics_df = pd.DataFrame(results).T
    metrics_df.index.name = 'Subgroup'
    return metrics_df

In [17]:
cp_groups_dict, cp_all_df = generate_all_cp_groups(universes, X_calib_f, X_calib_s, y_calib, 
                                                   X_test_f, X_test_s, y_test, alpha=0.1)

In [19]:
# Get metrics for a specific universe (e.g., universe 3)
metrics_u3 = compute_subgroup_metrics(cp_groups_dict[12])
print(metrics_u3)


                    TP       TN     FP      FN  Accuracy  Precision    Recall  \
Subgroup                                                                        
overall           32.0  67440.0  169.0  7433.0  0.898740   0.159204  0.004287   
frau1             16.0  28079.0   77.0  3213.0  0.895173   0.172043  0.004955   
nongerman         20.0  14412.0  156.0  1550.0  0.894287   0.113636  0.012739   
nongerman_male    10.0   9417.0   84.0   798.0  0.914444   0.106383  0.012376   
nongerman_female  10.0   4995.0   72.0   752.0  0.858638   0.121951  0.013123   

                        F1  Num_Samples  Frac_NonAmbiguous  
Subgroup                                                    
overall           0.008349      75074.0           0.863088  
frau1             0.009633      31385.0           0.846071  
nongerman         0.022910      16138.0           0.905968  
nongerman_male    0.022173      10309.0           0.928488  
nongerman_female  0.023697       5829.0           0.868703  


In [20]:
# Or get metrics for all universes in one DataFrame by grouping cp_all_df
metrics_all = cp_all_df.groupby('universe_id').apply(lambda df: compute_subgroup_metrics(df))

  metrics_all = cp_all_df.groupby('universe_id').apply(lambda df: compute_subgroup_metrics(df))
