## Setup

In [1]:
import pandas as pd
import numpy as np
from joblib import load
import yaml

from utils import (
    compute_nc_scores,
    find_threshold,
    predict_conformal_sets,
    evaluate_sets
)

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
# Load the universe definitions from YAML. Later cells read each entry's
# "model", "feature_set" and "id" keys.
# The encoding is pinned explicitly: without it open() uses the platform's locale
# encoding, which can mis-decode non-ASCII YAML content (e.g. on Windows).
with open("universes.yaml", encoding="utf-8") as f:
    universes = yaml.safe_load(f)

In [3]:
# Distinct (model, feature_set) pairs across all universes. Each pair maps to one
# saved model, so collecting them in a set avoids handling a model more than once.
unique_combos = set(
    map(lambda cfg: (cfg["model"], cfg["feature_set"]), universes)
)

## Data and Model Loading

In [4]:
# Feature matrices. Suffix "_f" = with protected attributes, "_s" = without them
# (the calibration split is annotated as 2015 data in the original notebook).
X_calib_f, X_calib_s = (
    pd.read_csv("./output/X_calib_f.csv"),
    pd.read_csv("./output/X_calib_s.csv"),
)
X_test_f, X_test_s = (
    pd.read_csv("./output/X_test_f.csv"),
    pd.read_csv("./output/X_test_s.csv"),
)

# Labels: only the first column of each label file is used, as a Series.
y_calib, y_test = (
    pd.read_csv(label_path).iloc[:, 0]
    for label_path in ("./output/y_calib.csv", "./output/y_test.csv")
)

In [5]:
# Lookup tables from a universe's feature_set name to the matching DataFrame:
# one table for the calibration split, one for the test split.
feature_sets_calib = dict(with_protected=X_calib_f, without_protected=X_calib_s)
feature_sets_test = dict(with_protected=X_test_f, without_protected=X_test_s)

## Conformal Prediction

In [6]:
# Target miscoverage rate for conformal prediction: prediction sets may miss the
# true label in at most 10% of cases, i.e. the nominal coverage is 90%.
alpha = 0.1

In [7]:
# Conformal pipeline for one saved model: calibrate a threshold, build prediction
# sets for the test data, and score those sets.
def conformal_calibrate_and_evaluate(model_path, X_cal, y_cal, X_te, y_te, alpha):
    """Calibrate a conformal threshold for a saved model and evaluate it on test data.

    All conformal steps are delegated to the project's ``utils`` helpers; this
    function only wires them together.

    Parameters
    ----------
    model_path : path of a joblib-serialised model exposing ``predict_proba``.
        joblib.load unpickles the file, so only trusted model files should be used.
    X_cal, y_cal : calibration features and labels.
    X_te, y_te : test features and labels.
    alpha : miscoverage level handed to ``find_threshold``.

    Returns
    -------
    tuple
        ``(q_hat, metrics, pred_sets)``: the threshold from ``find_threshold``,
        the result of ``evaluate_sets`` (callers read its "coverage" and
        "avg_size" entries), and the output of ``predict_conformal_sets``.
    """
    clf = load(model_path)
    # Calibration: class probabilities -> nonconformity scores -> threshold
    calib_scores = compute_nc_scores(clf.predict_proba(X_cal), y_cal)
    threshold = find_threshold(calib_scores, alpha)
    # Prediction sets for the test samples, then their evaluation
    test_sets = predict_conformal_sets(clf, X_te, threshold)
    return threshold, evaluate_sets(test_sets, y_te), test_sets

In [8]:
# Run conformal prediction once per distinct (model, feature set) combination
conformal_results = {}  # (model_type, feature_flag) -> threshold, coverage, avg. set size
for model_type, feature_flag in sorted(unique_combos):
    # Saved model file for this combination (matches the training stage naming convention)
    model_filename = f"{model_type}_{feature_flag}.joblib"
    model_path = f"./models/{model_filename}"
    # Calibration and test features that belong to this feature set
    X_cal = feature_sets_calib[feature_flag]
    X_te  = feature_sets_test[feature_flag]
    # Calibrate the threshold and evaluate the resulting prediction sets
    q_hat, metrics, pred_sets = conformal_calibrate_and_evaluate(model_path, X_cal, y_calib, X_te, y_test, alpha)
    cov = metrics["coverage"]; avg = metrics["avg_size"]
    # Keep the headline numbers for later comparison across combinations
    conformal_results[(model_type, feature_flag)] = {
        "q_hat": q_hat,
        "coverage": cov,
        "avg_set_size": avg
    }
    # One summary line per combination (the parenthesis around the feature set is
    # now closed; the previous format string left it open)
    print(f"Model: {model_type} ({feature_flag}) - Coverage: {cov:.3f}, Avg. Set Size: {avg:.2f}")

Model: logreg (with_protected - Coverage: 0.911, Avg. Set Size: 1.13
Model: logreg (without_protected - Coverage: 0.910, Avg. Set Size: 1.13
Model: penalized_logreg (with_protected - Coverage: 0.911, Avg. Set Size: 1.13
Model: penalized_logreg (without_protected - Coverage: 0.910, Avg. Set Size: 1.12
Model: rf (with_protected - Coverage: 0.914, Avg. Set Size: 1.14
Model: rf (without_protected - Coverage: 0.913, Avg. Set Size: 1.14


## Analyzing Conformal Prediction (CP) per Subgroup

In [11]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [12]:
def conformal_prediction_sets(model, X_calib, y_calib, X_test, alpha=0.1):
    """
    Generate split-conformal prediction sets for X_test using calibration data.

    The nonconformity score of a (sample, class) pair is 1 - predicted probability
    of that class. The threshold q_hat is the k-th smallest calibration score with
    k = ceil((n + 1) * (1 - alpha)); this is the finite-sample corrected rank that
    gives the marginal coverage guarantee of at least 1 - alpha. If k exceeds the
    number of calibration samples n, q_hat is infinite and every class is included.

    Parameters
    ----------
    model : object with a ``predict_proba(X)`` method returning an
        (n_samples, n_classes) array of class probabilities.
    X_calib : calibration features.
    y_calib : integer array-like of true calibration labels. Labels are used
        directly as column indices into ``predict_proba`` output, so they must be
        0..n_classes-1 in the model's column order.
    X_test : test features.
    alpha : miscoverage level, strictly between 0 and 1.

    Returns
    -------
    list of set
        One set per test sample holding the column indices (class labels under the
        assumption above) whose nonconformity score is <= q_hat.

    Raises
    ------
    ValueError
        If alpha is not in (0, 1) or the calibration set is empty.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha!r}")

    probs_calib = np.asarray(model.predict_proba(X_calib))  # (n_calib, n_classes)
    true_cols = np.asarray(y_calib)
    n_calib = len(true_cols)
    if n_calib == 0:
        raise ValueError("calibration set is empty; cannot compute a conformal threshold")

    # Nonconformity of each calibration sample: 1 - P(true_label)
    nc_scores = 1.0 - probs_calib[np.arange(n_calib), true_cols]

    # Finite-sample corrected rank. The small tolerance keeps floating-point noise
    # in (n + 1) * (1 - alpha) from pushing an exact integer up to the next rank.
    rank = max(1, int(np.ceil((n_calib + 1) * (1.0 - alpha) - 1e-9)))
    if rank > n_calib:
        # Too few calibration samples for the requested level: only the trivial
        # (all-classes) set is guaranteed to cover.
        q_hat = np.inf
    else:
        q_hat = np.sort(nc_scores)[rank - 1]

    # A class enters the prediction set when its nonconformity does not exceed q_hat
    probs_test = np.asarray(model.predict_proba(X_test))    # (n_test, n_classes)
    nonconf_test = 1.0 - probs_test
    pred_sets = [set(np.where(nc_row <= q_hat)[0]) for nc_row in nonconf_test]
    return pred_sets

In [13]:
def build_cp_groups(pred_sets, y_test, test_idx, X_test_full):
    """
    Build a DataFrame of conformal prediction results with subgroup info.

    Parameters
    ----------
    pred_sets : list of prediction sets, one per test sample, in the order of test_idx.
    y_test : true labels. A Series is aligned to test_idx by index label; any other
        array-like is taken positionally and must be in the order of test_idx.
    test_idx : index of the test samples corresponding to pred_sets.
    X_test_full : DataFrame of test features **including protected attributes**
        ('frau1', 'maxdeutsch1' and optionally 'maxdeutsch.Missing.').

    Returns
    -------
    DataFrame indexed by test_idx with columns 'pred_set', 'true_label', 'frau1',
    'nongerman' (float 0.0/1.0), 'nongerman_male' and 'nongerman_female'. Rows whose
    nationality is flagged as missing are dropped.
    """
    cp_df = pd.DataFrame(index=test_idx.copy())
    # Store prediction sets (convert each to a set of ints for consistency)
    cp_df['pred_set']   = pd.Series(pred_sets, index=test_idx).apply(lambda s: {int(x) for x in s})
    # The docstring allows plain arrays, which have no .reindex; those are assigned
    # positionally instead of being aligned by label.
    if isinstance(y_test, pd.Series):
        cp_df['true_label'] = y_test.reindex(test_idx)
    else:
        cp_df['true_label'] = np.asarray(y_test)
    # Bring in protected attributes from the full test set
    cp_df['frau1']    = X_test_full.loc[test_idx, 'frau1']          # 1 = female, 0 = male
    # Derive 'nongerman' flag ('maxdeutsch1' indicates German (1) vs non-German (0)).
    # It is built as float from the start so that missing nationality can be stored
    # as NaN without writing NaN into an integer column (an upcasting assignment
    # that pandas has deprecated).
    nongerman = (X_test_full.loc[test_idx, 'maxdeutsch1'] == 0).astype(float)
    if 'maxdeutsch.Missing.' in X_test_full.columns:
        missing_mask = (X_test_full.loc[test_idx, 'maxdeutsch.Missing.'] == 1)
        nongerman = nongerman.mask(missing_mask)
    cp_df['nongerman'] = nongerman
    # Derive intersectional subgroup flags (NaN nationality compares unequal to 1,
    # so such rows get 0 here and are removed below)
    cp_df['nongerman_male']   = np.where((cp_df['nongerman'] == 1) & (cp_df['frau1'] == 0), 1, 0)
    cp_df['nongerman_female'] = np.where((cp_df['nongerman'] == 1) & (cp_df['frau1'] == 1), 1, 0)
    # Drop samples with missing nationality information
    cp_df = cp_df.dropna(subset=['nongerman'])
    return cp_df

In [None]:
def _single_label_metrics(pred_sets_list, y_true, mask):
    """Confusion counts and binary metrics over the samples selected by ``mask``.

    Every selected sample must have a singleton prediction set; its only element
    is taken as the predicted label. Labels are evaluated as binary 0/1.
    """
    y_true_sel = y_true[mask]
    # Extract the single predicted label from each selected singleton set
    y_pred_sel = [next(iter(s)) for s, keep in zip(pred_sets_list, mask) if keep]
    tn, fp, fn, tp = confusion_matrix(y_true_sel, y_pred_sel, labels=[0, 1]).ravel()
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true_sel, y_pred_sel, average="binary", zero_division=0
    )
    total = tp + tn + fp + fn
    return {
        "TP": tp, "TN": tn, "FP": fp, "FN": fn,
        "Accuracy": (tp + tn) / total if total > 0 else 0.0,
        "Precision": precision, "Recall": recall, "F1": f1,
        "Num_Samples": len(y_true_sel),
    }


def compute_subgroup_metrics(cp_df):
    """
    Compute accuracy, precision, recall, and F1 for overall and each subgroup in cp_df.

    Only single-label prediction sets count as definite predictions; empty and
    multi-label sets are excluded from the confusion counts and are reflected only
    in "Frac_NonAmbiguous".

    Parameters
    ----------
    cp_df : DataFrame with columns 'pred_set', 'true_label' and the 0/1 membership
        columns 'frau1', 'nongerman', 'nongerman_male', 'nongerman_female'.

    Returns
    -------
    DataFrame indexed by 'Subgroup' ("overall" plus one row per membership column)
    with columns TP, TN, FP, FN, Accuracy, Precision, Recall, F1, Num_Samples and
    Frac_NonAmbiguous. A subgroup without any definite prediction gets an all-NaN
    row, which keeps the table numeric.
    """
    results = {}
    pred_sets_list = cp_df['pred_set'].tolist()
    y_true = np.array(cp_df['true_label'])
    # Identify non-ambiguous (single-label) predictions. The explicit bool dtype
    # keeps the mask usable with `&` even when cp_df is empty.
    is_single = np.array([len(s) == 1 for s in pred_sets_list], dtype=bool)

    # Overall metrics over all single-label predictions
    if np.any(is_single):
        overall = _single_label_metrics(pred_sets_list, y_true, is_single)
        # Fraction of test samples with a single-label prediction
        overall["Frac_NonAmbiguous"] = np.mean(is_single)
    else:
        # NOTE(review): with no single-label predictions the overall row is reported
        # as zeros rather than as missing; confirm that this is the desired behavior.
        overall = {
            "TP": 0, "TN": 0, "FP": 0, "FN": 0,
            "Accuracy": 0.0, "Precision": 0.0, "Recall": 0.0, "F1": 0.0,
            "Num_Samples": 0, "Frac_NonAmbiguous": 0.0
        }
    results["overall"] = overall

    # Subgroup columns to evaluate (1 indicates membership in the subgroup)
    subgroup_cols = ["frau1", "nongerman", "nongerman_male", "nongerman_female"]
    for col in subgroup_cols:
        in_group = (cp_df[col] == 1).to_numpy()
        mask = in_group & is_single  # members of the subgroup with definite predictions
        if np.any(mask):
            row = _single_label_metrics(pred_sets_list, y_true, mask)
            # Fraction of the subgroup's samples with a single-label prediction
            row["Frac_NonAmbiguous"] = np.mean(is_single[in_group])
        else:
            # No definite predictions (or no samples) in this subgroup
            row = dict.fromkeys(overall, np.nan)
        results[col] = row

    metrics_df = pd.DataFrame(results).T
    metrics_df.index.name = 'Subgroup'
    return metrics_df

In [15]:
# Worked example: subgroup analysis for a single model configuration
model_path = "./models/logreg_with_protected.joblib"
model = load(model_path)
# Use appropriate feature sets depending on model (with or without protected)
X_calib = X_calib_f   # (assuming this model uses protected attributes)
X_test  = X_test_f
# Generate conformal prediction sets for this model. The notebook-wide `alpha` is
# passed instead of a second hard-coded 0.1, so changing the miscoverage level in
# the configuration cell also changes this analysis.
pred_sets = conformal_prediction_sets(model, X_calib, y_calib.to_numpy(), X_test, alpha=alpha)
# Build the cp_groups DataFrame with true labels and group info; subgroup
# membership is always read from the feature set that includes protected attributes
cp_groups_df = build_cp_groups(pred_sets, y_test, X_test.index, X_test_f)
# Compute subgroup metrics
metrics = compute_subgroup_metrics(cp_groups_df)
print(metrics)


                     TP       TN     FP      FN  Accuracy  Precision  \
Subgroup                                                               
overall           345.0  68042.0  451.0  7305.0  0.898139   0.433417   
frau1             118.0  28438.0  177.0  3187.0  0.894612   0.400000   
nongerman          37.0  14876.0   36.0  1515.0  0.905794   0.506849   
nongerman_male     22.0   9687.0   17.0   773.0  0.924755   0.564103   
nongerman_female   15.0   5189.0   19.0   742.0  0.872422   0.441176   

                    Recall        F1  Num_Samples  Frac_NonAmbiguous  
Subgroup                                                              
overall           0.045098  0.081695      76143.0           0.875378  
frau1             0.035703  0.065556      31920.0           0.860493  
nongerman         0.023840  0.045538      16464.0           0.924269  
nongerman_male    0.027673  0.052758      10499.0           0.945600  
nongerman_female  0.019815  0.037927       5965.0           0.888972 

In [None]:
# Compute subgroup metrics for every universe and collect them in long format
metrics_by_universe = {}
records = []
# Universes that share a (model, feature_set) pair load the same model file and use
# the same data, so their metrics are identical; each pair is computed only once.
metrics_by_combo = {}

for universe in universes:
    model_type = universe['model']
    feature_flag = universe['feature_set']
    universe_id = f"{model_type}_{feature_flag}"
    combo = (model_type, feature_flag)

    if combo not in metrics_by_combo:
        model_path = f"./models/{universe_id}.joblib"
        model = load(model_path)

        # Look the data up by name; an unknown feature_set raises KeyError instead
        # of silently falling back to the data without protected attributes.
        X_calib = feature_sets_calib[feature_flag]
        X_test = feature_sets_test[feature_flag]

        # Use the notebook-wide miscoverage level rather than a hard-coded 0.1
        pred_sets = conformal_prediction_sets(model, X_calib, y_calib.to_numpy(), X_test, alpha=alpha)
        # Subgroup membership always comes from the features with protected attributes
        cp_groups_df = build_cp_groups(pred_sets, y_test, X_test.index, X_test_f)
        metrics_by_combo[combo] = compute_subgroup_metrics(cp_groups_df)

    metrics = metrics_by_combo[combo]
    # Copy so that universes sharing a combination do not share one mutable frame
    metrics_by_universe[universe['id']] = metrics.copy()

    for subgroup, row in metrics.iterrows():
        # iterrows() always yields a Series, never None; a subgroup without any
        # definite prediction shows up as a row of missing values and is skipped.
        if row.isna().all():
            continue
        row_dict = row.to_dict()
        row_dict.update({
            "UniverseID": universe['id'],
            "Model": model_type,
            "FeatureSet": feature_flag,
            "Subgroup": subgroup
        })
        records.append(row_dict)

# Combine into a single DataFrame
all_metrics_df = pd.DataFrame(records)

In [20]:
# Persist the combined per-universe subgroup metrics; index=False leaves the
# DataFrame's row index out of the CSV file.
all_metrics_df.to_csv(path_or_buf="./output/conformal_subgroup_metrics.csv", index=False)
