## Setup

In [1]:
import pandas as pd
import numpy as np
from joblib import load

from utils import (
    compute_nc_scores,
    find_threshold,
    predict_conformal_sets,
    evaluate_sets,
    summarize_by_indicator,
    summarize_for_predicate
)

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


## Data and Model Loading

In [32]:
X_calib_f = pd.read_csv("./output/X_calib_f.csv") # 2015, w. protected attributes
#X_calib_s = pd.read_csv("./output/X_calib_s.csv") # 2015, w/o protected attributes
y_calib = pd.read_csv("./output/y_calib.csv").iloc[:,0]

X_test_f = pd.read_csv("./output/X_test_f.csv")
#X_test_s = pd.read_csv("./output/X_test_s.csv")
y_test = pd.read_csv("./output/y_test.csv").iloc[:,0]

In [33]:
#preds_test = pd.read_csv("./output/preds_test.csv")

glm1 = load("./models/glm1.joblib")

## Conformal

In [34]:
# Miscoverage level
alpha = 0.1

In [35]:
probs_calib = glm1.predict_proba(X_calib_f)

nc_scores = compute_nc_scores(probs_calib, y_calib)

q_hat = find_threshold(nc_scores, alpha) # q_hat is data-driven threshold for classification
print(f"q_hat: {q_hat:.4f}")

q_hat: 0.6604


In [36]:
# With test data
pred_sets = predict_conformal_sets(glm1, X_test_f, q_hat)

In [7]:
# With test data
evaluation = evaluate_sets(pred_sets, y_test)
print(f"Coverage: {evaluation['coverage']:.2f}")
print(f"Avg. set size: {evaluation['avg_size']:.2f}")

Coverage: 0.91
Avg. set size: 1.13


## Analyzing CP per group 

In [11]:
# Create cp_groups with the same index as X_test_f (and y_test)
cp_groups = pd.DataFrame(index=X_test_f.index)
cp_groups['pred_set'] = pd.Series(pred_sets, index=X_test_f.index).apply(lambda s: {int(x) for x in s})
cp_groups['true_label'] = y_test.reindex(X_test_f.index)
cp_groups['frau1'] = X_test_f['frau1']

cp_groups['nongerman'] = np.where(
    X_test_f['maxdeutsch1'] == 0, 
    1, 
    0
)
cp_groups.loc[
    X_test_f['maxdeutsch.Missing.'] == 1, 
    'nongerman'
] = np.nan

cp_groups['nongerman_male'] = np.where(
    (cp_groups['nongerman'] == 1) & (cp_groups['frau1'] == 0),
    1,
    0
)
cp_groups['nongerman_female'] = np.where(
    (cp_groups['nongerman'] == 1) & (cp_groups['frau1'] == 1),
    1,
    0
)

cp_groups = cp_groups.dropna()

In [24]:
# Conditional coverage and set size

# List of subgroup indicators to evaluate
groups = ['frau1', 'nongerman', 'nongerman_male', 'nongerman_female']

# Align pred_sets with y_test indices for easy filtering
pred_sets_series = pd.Series(pred_sets, index=y_test.index)

# Prepare a list to collect results
results = []

for group in groups:
    # Create a boolean mask for the current subgroup (True for indices in the subgroup)
    mask = (cp_groups[group] == 1)
    # Align the mask to y_test index (in case cp_groups has a subset of test indices)
    mask_aligned = mask.reindex(y_test.index, fill_value=False)
    
    # Filter true labels and prediction sets for this subgroup
    group_y = y_test[mask_aligned]             # true labels for this subgroup
    group_pred_sets = pred_sets_series[mask_aligned]  # prediction sets for this subgroup
    
    # Compute coverage: fraction of cases where true label is in the prediction set
    coverage = np.mean([
        1 if true_label in pred_set else 0 
        for true_label, pred_set in zip(group_y, group_pred_sets)
    ])
    # Compute average prediction set size for this subgroup
    avg_set_size = np.mean([len(pred_set) for pred_set in group_pred_sets])
    
    # Store the results (optionally multiply coverage by 100 if you want percentage)
    results.append({
        'Group': group,
        'Coverage': coverage,
        'Avg Set Size': avg_set_size,
        'Num Samples': mask_aligned.sum()  # number of test samples in this subgroup
    })

# Create a DataFrame for clear tabular display of the results
coverage_results = pd.DataFrame(results).set_index('Group')
print(coverage_results)

                  Coverage  Avg Set Size  Num Samples
Group                                                
frau1             0.909314      1.139507        37095
nongerman         0.912929      1.075731        17813
nongerman_male    0.928848      1.054400        11103
nongerman_female  0.886587      1.111028         6710


In [13]:
# Subgroups true class label distributions

# Overall distribution of true_label
print("Overall true_label distribution:")
print(cp_groups['true_label'].value_counts().sort_index())
print("As proportions:")
print(cp_groups['true_label'].value_counts(normalize=True).sort_index())
print(f"P(true_label=1): {cp_groups['true_label'].mean():.4f}")
print()

Overall true_label distribution:
true_label
0    75876
1    11107
Name: count, dtype: int64
As proportions:
true_label
0    0.872308
1    0.127692
Name: proportion, dtype: float64
P(true_label=1): 0.1277



In [20]:
# Distribution conditional on frau1
print("Distribution conditional on frau1:")
for frau_val in [0, 1]:
    subset = cp_groups[cp_groups['frau1'] == frau_val] # Get all females
    prop_positive = subset['true_label'].mean() # What % of females have true_label=1?
    print(f"P(true_label=1 | frau1={frau_val}): {prop_positive:.4f} (n={len(subset)})")
print()

# Add total counts
n_female = (cp_groups['frau1'] == 1).sum()
n_male = (cp_groups['frau1'] == 0).sum()
print(f"Total observations: {len(cp_groups)} (female: n={n_female}, male: n={n_male})")
print()

Distribution conditional on frau1:
P(true_label=1 | frau1=0): 0.1235 (n=49888)
P(true_label=1 | frau1=1): 0.1334 (n=37095)

Total observations: 86983 (female: n=37095, male: n=49888)



In [21]:
# Distribution conditional on nongerman
print("Distribution conditional on nongerman:")
for ng_val in [0, 1]:
    subset = cp_groups[cp_groups['nongerman'] == ng_val]
    prop_positive = subset['true_label'].mean()
    print(f"P(true_label=1 | nongerman={ng_val}): {prop_positive:.4f} (n={len(subset)})")
print()

# Add total counts
n_german = (cp_groups['nongerman'] == 0).sum()
n_nongerman = (cp_groups['nongerman'] == 1).sum()
print(f"Total observations: {len(cp_groups)} (german: n={n_german}, nongerman: n={n_nongerman})")
print()

Distribution conditional on nongerman:
P(true_label=1 | nongerman=0): 0.1321 (n=69170)
P(true_label=1 | nongerman=1): 0.1104 (n=17813)

Total observations: 86983 (german: n=69170, nongerman: n=17813)



In [23]:
# Distribution conditional on nongerman_male and nongerman_female
print("Distribution conditional on nongerman subgroups:")
if 'nongerman_male' in cp_groups.columns:
    for nm_val in [0, 1]:
        subset = cp_groups[cp_groups['nongerman_male'] == nm_val]
        prop_positive = subset['true_label'].mean()
        print(f"P(true_label=1 | nongerman_male={nm_val}): {prop_positive:.4f} (n={len(subset)})")

if 'nongerman_female' in cp_groups.columns:
    for nf_val in [0, 1]:
        subset = cp_groups[cp_groups['nongerman_female'] == nf_val]
        prop_positive = subset['true_label'].mean()
        print(f"P(true_label=1 | nongerman_female={nf_val}): {prop_positive:.4f} (n={len(subset)})")
print()

# Add total counts
n_german_male = (cp_groups['nongerman_male'] == 0).sum()
n_nongerman_male = (cp_groups['nongerman_male'] == 1).sum()
print(f"Total observations: {len(cp_groups)} (other: n={n_german_male}, nongerman male: n={n_nongerman_male})")
print()

# Add total counts
n_german_female = (cp_groups['nongerman_female'] == 0).sum()
n_nongerman_female = (cp_groups['nongerman_female'] == 1).sum()
print(f"Total observations: {len(cp_groups)} (other: n={n_german_female}, nongerman female: n={n_nongerman_female})")
print()

Distribution conditional on nongerman subgroups:
P(true_label=1 | nongerman_male=0): 0.1337 (n=75880)
P(true_label=1 | nongerman_male=1): 0.0867 (n=11103)
P(true_label=1 | nongerman_female=0): 0.1259 (n=80273)
P(true_label=1 | nongerman_female=1): 0.1496 (n=6710)

Total observations: 86983 (other: n=75880, nongerman male: n=11103)

Total observations: 86983 (other: n=80273, nongerman female: n=6710)



In [None]:
# Baselines CP

print("Value counts:")
print(cp_groups['pred_set'].value_counts())
print("\nProportions:")
print(cp_groups['pred_set'].value_counts(normalize=True))

Value counts:
pred_set
{0}       75347
{0, 1}    10840
{1}         796
Name: count, dtype: int64

Proportions:
pred_set
{0}       0.866227
{0, 1}    0.124622
{1}       0.009151
Name: proportion, dtype: float64


In [40]:
summarize_for_predicate(
    cp_groups,
    predicate=lambda s: set(s) == {0},
    description="== {0}"
)

Among cases where pred_set == {0}:
  Proportion true_label == 1:        0.097
  Proportion frau1 == 1:             0.420
  Proportion nongerman == 1:         0.218
  Proportion nongerman_male == 1:    0.139
  Proportion nongerman_female == 1:  0.079



In [41]:
summarize_for_predicate(
    cp_groups,
    predicate=lambda s: set(s) == {1},
    description="== {1}"
)

Among cases where pred_set == {1}:
  Proportion true_label == 1:        0.433
  Proportion frau1 == 1:             0.371
  Proportion nongerman == 1:         0.092
  Proportion nongerman_male == 1:    0.049
  Proportion nongerman_female == 1:  0.043



In [42]:
summarize_for_predicate(
    cp_groups,
    predicate=lambda s: set(s) == {0,1},
    description="== {0,1}"
)

Among cases where pred_set == {0,1}:
  Proportion true_label == 1:        0.319
  Proportion frau1 == 1:             0.477
  Proportion nongerman == 1:         0.124
  Proportion nongerman_male == 1:    0.056
  Proportion nongerman_female == 1:  0.069



In [43]:
# Summarize for frau1 == 1 (vs 0)
counts_female, pct_female = summarize_by_indicator(
    cp_groups,
    indicator_col='frau1',
    positive_label='female',
    negative_label='male'
)

print("\nCounts by gender:\n")
print(counts_female)
print("\nPercentages by gender:\n")
print(pct_female)


Counts by gender:

        is_ambiguous  is_zero_only  is_one_only
frau1                                          
male            5665         43722          501
female          5175         31625          295

Percentages by gender:

        is_ambiguous  is_zero_only  is_one_only
frau1                                          
male       11.355436     87.640314     1.004250
female     13.950667     85.254077     0.795255


In [44]:
# Summarize for nongerman == 1 (vs 0)
counts_ng, pct_ng = summarize_by_indicator(
    cp_groups,
    indicator_col='nongerman',
    positive_label='non‐German',
    negative_label='German'
)

print("Counts by nationality (German vs non‐German):\n")
print(counts_ng)
print("\nPercentages by nationality:\n")
print(pct_ng)

Counts by nationality (German vs non‐German):

            is_ambiguous  is_zero_only  is_one_only
nongerman                                          
German              9491         58956          723
non‐German          1349         16391           73

Percentages by nationality:

            is_ambiguous  is_zero_only  is_one_only
nongerman                                          
German         13.721266     85.233483     1.045251
non‐German      7.573121     92.017066     0.409813


In [45]:
# Summarize for nongerman_male == 1 (vs 0)
counts_ng_male, pct_ng_male = summarize_by_indicator(
    cp_groups,
    indicator_col='nongerman_male',
    positive_label='non‐German Male',
    negative_label='Others'
)

print("\nCounts for non‐German Male vs Others:\n")
print(counts_ng_male)
print("\nPercentages for non‐German Male vs Others:\n")
print(pct_ng_male)


Counts for non‐German Male vs Others:

                 is_ambiguous  is_zero_only  is_one_only
nongerman_male                                          
Others                  10236         64887          757
non‐German Male           604         10460           39

Percentages for non‐German Male vs Others:

                 is_ambiguous  is_zero_only  is_one_only
nongerman_male                                          
Others              13.489721     85.512652     0.997628
non‐German Male      5.439971     94.208772     0.351256


In [46]:
# Summarize for nongerman_female == 1 (vs 0)
counts_ng_female, pct_ng_female = summarize_by_indicator(
    cp_groups,
    indicator_col='nongerman_female',
    positive_label='non‐German Female',
    negative_label='Others'
)

print("\nCounts for non‐German Female vs Others:\n")
print(counts_ng_female)
print("\nPercentages for non‐German Female vs Others:\n")
print(pct_ng_female)


Counts for non‐German Female vs Others:

                   is_ambiguous  is_zero_only  is_one_only
nongerman_female                                          
Others                    10095         69416          762
non‐German Female           745          5931           34

Percentages for non‐German Female vs Others:

                   is_ambiguous  is_zero_only  is_one_only
nongerman_female                                          
Others                12.575835     86.474904     0.949261
non‐German Female     11.102832     88.390462     0.506706


# Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix

# 1. Filter out ambiguous prediction sets (where pred_set == {0,1})
confident_indices = [idx for idx, pset in enumerate(pred_sets) if pset != {0, 1}]

# If there are no confident predictions, handle that case
if len(confident_indices) == 0:
    print("No confident predictions (all predictions were ambiguous). Confusion matrix cannot be computed.")
else:
    # 2. Extract predicted labels from the remaining sets
    y_pred_filtered = []
    for idx in confident_indices:
        pset = pred_sets[idx]
        # pset can only be {0} or {1} here
        predicted_label = 0 if pset == {0} else 1
        y_pred_filtered.append(predicted_label)

    # 3. Align predicted labels with the corresponding true labels
    # Use the same indices to filter y_test
    y_true_filtered = [y_test.iloc[idx] for idx in confident_indices]

    # 4. Compute the confusion matrix
    cm = confusion_matrix(y_true_filtered, y_pred_filtered)
    print("Confusion matrix (excluding ambiguous cases):")
    print(cm)

# Extract individual components
TN, FP, FN, TP = cm.ravel()  # Unpacks the 2x2 matrix into values

# Compute metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP) if (TP + FP) > 0 else float('nan')
recall = TP / (TP + FN) if (TP + FN) > 0 else float('nan')
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else float('nan')

# Print results
print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")

Confusion matrix (excluding ambiguous cases):
[[70030   471]
 [ 7549   360]]
Accuracy:  0.898
Precision: 0.433
Recall:    0.046
F1 Score:  0.082


In [37]:
valid_idx = cp_groups.index
# Align arrays
pred_sets_filtered = [pred_sets[i] for i in valid_idx]
y_test_filtered = np.array(y_test)[valid_idx]

In [39]:
from sklearn.metrics import precision_recall_fscore_support

def compute_confusion_metrics(pred_sets, y_true, subgroup_mask):
    # Filter to non-ambiguous predictions and apply subgroup mask
    mask = np.array([len(s) == 1 for s in pred_sets]) & subgroup_mask
    if not np.any(mask):
        return None  # no data to evaluate
    
    y_true_filtered = np.array(y_true)[mask]
    y_pred_filtered = [list(s)[0] for i, s in enumerate(pred_sets) if len(s) == 1 and subgroup_mask[i]]

    # Confusion matrix
    cm = confusion_matrix(y_true_filtered, y_pred_filtered, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true_filtered, y_pred_filtered, average="binary", zero_division=0
    )

    return {
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "Coverage (non-ambiguous)": np.mean(mask)
    }

#frau1_mask = cp_groups['frau1'] == 1
#nongerman_mask = cp_groups['nongerman'] == 1
#nongerman_male_mask = cp_groups['nongerman_male'] == 1
#nongerman_female_mask = cp_groups['nongerman_female'] == 1
#
## Create a dictionary of subgroups
#subgroups = {
#    "frau1": frau1_mask,
#    "nongerman": nongerman_mask,
#    "nongerman_male": nongerman_male_mask,
#    "nongerman_female": nongerman_female_mask
#}

subgroups = {
    "frau1": (cp_groups["frau1"] == 1).values,
    "nongerman": (cp_groups["nongerman"] == 1).values,
    "nongerman_male": (cp_groups["nongerman_male"] == 1).values,
    "nongerman_female": (cp_groups["nongerman_female"] == 1).values
}

# Example usage:
results = {}
for name, mask in subgroups.items():
    metrics = compute_confusion_metrics(pred_sets_filtered, y_test_filtered, mask)
    if metrics:
        results[name] = metrics

# Print nicely
df_results = pd.DataFrame(results).T
df_results.index.name = "Subgroup"
display(df_results)


Unnamed: 0_level_0,TP,TN,FP,FN,Precision,Recall,F1,Coverage (non-ambiguous)
Subgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
frau1,118.0,28438.0,177.0,3187.0,0.4,0.035703,0.065556,0.366968
nongerman,37.0,14876.0,36.0,1515.0,0.506849,0.02384,0.045538,0.189278
nongerman_male,22.0,9687.0,17.0,773.0,0.564103,0.027673,0.052758,0.120702
nongerman_female,15.0,5189.0,19.0,742.0,0.441176,0.019815,0.037927,0.068577
