## Setup

In [14]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit, cross_validate # GroupKFold, GridSearchCV,
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from joblib import dump

from utils import (
    precision_at_k,
    recall_at_k,
)

In [2]:
# Load universe definitions from YAML
import yaml

with open("universes.yaml") as f:
    universes = yaml.safe_load(f)

# Access by ID
#universe_id = 12
#config = next(u for u in universes if u["id"] == universe_id)

## Data Loading

In [3]:
X_train_f = pd.read_csv("./output/X_train_f.csv") # 2010 - 2014, w. protected attributes
X_train_s = pd.read_csv("./output/X_train_s.csv") # 2010 - 2014, w/o protected attributes
y_train = pd.read_csv("./output/y_train.csv").iloc[:,0]

X_test_f = pd.read_csv("./output/X_test_f.csv")
X_test_s = pd.read_csv("./output/X_test_s.csv")
y_test = pd.read_csv("./output/y_test.csv").iloc[:,0]

## Multiverse

In [4]:
def train_model(model_type, X, y):
    if model_type == "logreg":
        model = LogisticRegression(penalty=None, solver="newton-cg", max_iter=1000)
    elif model_type == "penalized_logreg":
        model = LogisticRegression(penalty="l2", C=1.0, solver="newton-cg", max_iter=1000)
    elif model_type == "rf":
        model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    model.fit(X, y)
    return model

In [5]:
# Define test feature sets
feature_sets_train = {
    "with_protected": X_train_f,
    "without_protected": X_train_s
}
feature_sets_test = {
    "with_protected": X_test_f,
    "without_protected": X_test_s
}

In [6]:
# Define thresholds
threshold_policies = {
    "top15": 0.15,
    "top30": 0.30
}

#    "top45": 0.45

In [7]:
# Group universes by (model_type, feature_flag)
from collections import defaultdict
universe_groups = defaultdict(list)
for cfg in universes:
    key = (cfg["model"], cfg["feature_set"])
    universe_groups[key].append(cfg)

In [8]:
# Train one model per (model_type, feature_flag), then apply all thresholds
predictions_by_universe = {}
os.makedirs("./models", exist_ok=True)

for (model_type, feature_flag), cfgs in universe_groups.items():
    print(f"Training model: {model_type} with {feature_flag} features")
    X_train_used = feature_sets_train[feature_flag]
    model = train_model(model_type, X_train_used, y_train)

    # Save model
    universe_id = f"{model_type}_{feature_flag}"
    dump(model, f"./models/{universe_id}.joblib")

    # Predict probabilities on test set
    X_test_used = feature_sets_test[feature_flag]
    probs = model.predict_proba(X_test_used)[:, 1]

    for cfg in cfgs:
        uid = cfg["id"]
        threshold_key = cfg["threshold_policy"]
        k = threshold_policies[threshold_key]
        threshold_value = np.sort(probs)[::-1][int(k * len(probs))]
        binary_preds = (probs >= threshold_value).astype(int)
        predictions_by_universe[uid] = binary_preds
        print(f"Predicted universe {uid}: {model_type}, {feature_flag}, {threshold_key}")


Training model: logreg with with_protected features
Predicted universe 1: logreg, with_protected, top15
Predicted universe 2: logreg, with_protected, top30
Training model: logreg with without_protected features
Predicted universe 3: logreg, without_protected, top15
Predicted universe 4: logreg, without_protected, top30
Training model: penalized_logreg with with_protected features
Predicted universe 5: penalized_logreg, with_protected, top15
Predicted universe 6: penalized_logreg, with_protected, top30
Training model: penalized_logreg with without_protected features
Predicted universe 7: penalized_logreg, without_protected, top15
Predicted universe 8: penalized_logreg, without_protected, top30
Training model: rf with with_protected features
Predicted universe 9: rf, with_protected, top15
Predicted universe 10: rf, with_protected, top30
Training model: rf with without_protected features
Predicted universe 11: rf, without_protected, top15
Predicted universe 12: rf, without_protected, top3

In [9]:
# Combine results into a DataFrame for inspection/saving
y_test_array = np.array(y_test).reshape(-1, 1)
y_df = pd.DataFrame(y_test_array, columns=["y_test"])

all_preds = []
for uid in sorted(predictions_by_universe):
    col_name = f"preds_u{uid}"
    col_data = pd.DataFrame(predictions_by_universe[uid], columns=[col_name])
    all_preds.append(col_data)

preds_test = pd.concat([y_df] + all_preds, axis=1)
os.makedirs("./output", exist_ok=True)
preds_test.to_csv("./output/preds_test.csv", index=False)

print("Saved combined predictions to ./output/preds_test.csv")

Saved combined predictions to ./output/preds_test.csv


In [12]:
# Create LaTeX summary table for universes
def escape_latex_str(val):
    return str(val).replace('_', '\\_')

universe_df = pd.DataFrame(universes)

# Rename and reorder columns
universe_df = universe_df.rename(columns={
    "feature_set": "feature set",
    "threshold_policy": "threshold"
})
universe_df = universe_df[["id", "model", "feature set", "threshold"]]

universe_df_escaped = universe_df.applymap(escape_latex_str)

latex_table = universe_df_escaped.to_latex(
    index=False,
    caption="Universe configuration overview",
    label="tab:universe_summary",
    escape=False
)

with open("./output/universe_summary.tex", "w") as f:
    f.write(latex_table)

  universe_df_escaped = universe_df.applymap(escape_latex_str)


## Confusion Matrix

In [20]:
# Collect confusion matrices for all universes
confusion_matrices = []

for uid, preds in predictions_by_universe.items():
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    confusion_matrices.append({
        "id": uid,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

In [21]:
confusion_df = pd.DataFrame(confusion_matrices)
confusion_df = confusion_df.sort_values("id")
print(confusion_df)
#confusion_df.to_csv("./output/confusion_matrices.csv", index=False)

    id     tn     fp    fn    tp  accuracy  precision    recall        f1
0    1  69037   9165  7216  4292  0.817401   0.318942  0.372958  0.343841
1    2  58476  19726  4320  7188  0.731959   0.267073  0.624609  0.374161
2    3  69056   9146  7197  4311  0.817824   0.320354  0.374609  0.345364
3    4  58510  19692  4286  7222  0.732717   0.268336  0.627563  0.375930
4    5  69034   9168  7219  4289  0.817334   0.318719  0.372697  0.343601
5    6  58477  19725  4319  7189  0.731981   0.267110  0.624696  0.374213
6    7  69057   9145  7196  4312  0.817846   0.320428  0.374696  0.345444
7    8  58509  19693  4287  7221  0.732694   0.268299  0.627477  0.375878
8    9  68406   9796  7245  4263  0.810043   0.303222  0.370438  0.333477
9   10  57134  21068  4392  7116  0.716197   0.252484  0.618352  0.358561
10  11  68512   9690  7376  4132  0.809765   0.298944  0.359055  0.326253
11  12  57179  21023  4465  7043  0.715885   0.250944  0.612009  0.355941
