## Setup

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit, cross_validate # GroupKFold, GridSearchCV,
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from joblib import dump

from utils import (
    precision_at_k,
    recall_at_k,
)

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
# Read the multiverse configuration (an iterable of per-universe settings) from YAML
import yaml

with open("universes.yaml") as universes_file:
    universes = yaml.safe_load(universes_file)

# Example: pick a single universe by its ID
#universe_id = 12
#config = next(u for u in universes if u["id"] == universe_id)

## Data Loading

In [3]:
# Training data (2010 - 2014): "_f" is with protected attributes, "_s" is without
X_train_f = pd.read_csv("./output/X_train_f.csv")
X_train_s = pd.read_csv("./output/X_train_s.csv")
# Only the first column of the label file is kept (as a Series)
y_train = pd.read_csv("./output/y_train.csv").iloc[:, 0]

# Test data, loaded the same way
X_test_f = pd.read_csv("./output/X_test_f.csv")
X_test_s = pd.read_csv("./output/X_test_s.csv")
y_test = pd.read_csv("./output/y_test.csv").iloc[:, 0]

## Multiverse

In [4]:
def train_model(model_type, X, y, random_state=None):
    """Fit and return a new classifier of the requested type.

    Parameters
    ----------
    model_type : str
        "logreg" (logistic regression without penalty), "penalized_logreg"
        (logistic regression with L2 penalty, C=1.0) or "rf" (random forest
        with 100 trees).
    X : array-like
        Features handed to ``model.fit``.
    y : array-like
        Targets handed to ``model.fit``.
    random_state : int or None, optional
        Seed forwarded to the random forest so that its fit can be reproduced.
        The default ``None`` keeps the previous, unseeded behaviour. It is not
        used for the two logistic-regression variants.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is none of the names listed above.
    """
    if model_type == "logreg":
        model = LogisticRegression(penalty=None, solver="newton-cg", max_iter=1000)
    elif model_type == "penalized_logreg":
        model = LogisticRegression(penalty="l2", C=1.0, solver="newton-cg", max_iter=1000)
    elif model_type == "rf":
        # Bootstrapping and feature sub-sampling are random; a seed makes runs repeatable.
        model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    model.fit(X, y)
    return model

In [5]:
# Feature matrices per feature-set flag, for the training and the test data
feature_sets_train = dict(
    with_protected=X_train_f,
    without_protected=X_train_s,
)
feature_sets_test = dict(
    with_protected=X_test_f,
    without_protected=X_test_s,
)

In [6]:
# Threshold policies: policy name -> share of the highest-scored test cases
threshold_policies = dict(top15=0.15, top30=0.30)

# Currently disabled:
#    "top45": 0.45

In [7]:
# Bucket the universe configs that share one (model, feature_set) pair,
# so each such pair maps to the list of its configs
from collections import defaultdict

universe_groups = defaultdict(list)
for cfg in universes:
    universe_groups[cfg["model"], cfg["feature_set"]].append(cfg)

In [9]:
# Train one model per (model_type, feature_flag), then apply all thresholds
predictions_by_universe = {}
os.makedirs("./models", exist_ok=True)

for (model_type, feature_flag), cfgs in universe_groups.items():
    print(f"Training model: {model_type} with {feature_flag} features")
    X_train_used = feature_sets_train[feature_flag]
    model = train_model(model_type, X_train_used, y_train)

    # Save model (one file per model/feature-set pair, shared by its universes)
    universe_id = f"{model_type}_{feature_flag}"
    dump(model, f"./models/{universe_id}.joblib")

    # Predict probabilities on test set (column 1 of predict_proba)
    X_test_used = feature_sets_test[feature_flag]
    probs = model.predict_proba(X_test_used)[:, 1]

    # The ranking does not depend on the threshold policy: sort once per model
    probs_desc = np.sort(probs)[::-1]

    for cfg in cfgs:
        uid = cfg["id"]
        threshold_key = cfg["threshold_policy"]
        k = threshold_policies[threshold_key]
        # Clamp so that a share of 1.0 selects everything instead of raising IndexError
        cutoff_idx = min(int(k * len(probs)), len(probs) - 1)
        threshold_value = probs_desc[cutoff_idx]
        # NOTE(review): ">=" also counts the sample sitting at the cutoff position,
        # so int(k * n) + 1 samples (more if scores are tied) are flagged -
        # confirm this is the intended definition of "top k".
        binary_preds = (probs >= threshold_value).astype(int)
        predictions_by_universe[uid] = binary_preds
        print(f"Predicted universe {uid}: {model_type}, {feature_flag}, {threshold_key}")


Training model: logreg with with_protected features
Predicted universe 1: logreg, with_protected, top15
Predicted universe 2: logreg, with_protected, top30
Training model: logreg with without_protected features
Predicted universe 3: logreg, without_protected, top15
Predicted universe 4: logreg, without_protected, top30
Training model: penalized_logreg with with_protected features
Predicted universe 5: penalized_logreg, with_protected, top15
Predicted universe 6: penalized_logreg, with_protected, top30
Training model: penalized_logreg with without_protected features
Predicted universe 7: penalized_logreg, without_protected, top15
Predicted universe 8: penalized_logreg, without_protected, top30
Training model: rf with with_protected features
Predicted universe 9: rf, with_protected, top15
Predicted universe 10: rf, with_protected, top30
Training model: rf with without_protected features
Predicted universe 11: rf, without_protected, top15
Predicted universe 12: rf, without_protected, top3

In [None]:
# Put the true labels and every universe's predictions side by side, then save
y_df = pd.DataFrame({"y_test": np.array(y_test)})

# One single-column frame per universe, in ascending order of universe id
all_preds = [
    pd.DataFrame({f"preds_u{uid}": predictions_by_universe[uid]})
    for uid in sorted(predictions_by_universe)
]

preds_test = pd.concat([y_df, *all_preds], axis=1)
os.makedirs("./output", exist_ok=True)
preds_test.to_csv("./output/preds_test.csv", index=False)

print("Saved combined predictions to ./output/preds_test.csv")

Saved combined predictions to ./output/preds_test.csv


In [14]:
# Create LaTeX summary table for universes
def escape_latex_str(val):
    """Return ``str(val)`` with a backslash inserted before every underscore."""
    pieces = str(val).split("_")
    return "\\_".join(pieces)

universe_df = pd.DataFrame(universes)

# Rename and reorder columns
universe_df = universe_df.rename(columns={
    "feature_set": "feature set",
    "threshold_policy": "threshold"
})
universe_df = universe_df[["id", "model", "feature set", "threshold"]]

# DataFrame.applymap is deprecated since pandas 2.1 and emits a FutureWarning;
# use its replacement DataFrame.map when it exists, applymap on older pandas.
if hasattr(universe_df, "map"):
    universe_df_escaped = universe_df.map(escape_latex_str)
else:
    universe_df_escaped = universe_df.applymap(escape_latex_str)

# escape=False: the cells were already escaped by escape_latex_str above
latex_table = universe_df_escaped.to_latex(
    index=False,
    caption="Universe configuration overview",
    label="tab:universe_summary",
    escape=False
)

with open("./output/universe_summary.tex", "w") as f:
    f.write(latex_table)

  universe_df_escaped = universe_df.applymap(escape_latex_str)


In [13]:
# Escape LaTeX special characters in universe config
def escape_latex_str(val):
    """Return ``str(val)`` with a backslash inserted before every underscore."""
    pieces = str(val).split("_")
    return "\\_".join(pieces)

# NOTE(review): this cell repeats the LaTeX export of the cell above (without its
# column renaming/selection) and writes to the same file, so on a top-to-bottom
# run it replaces that table - confirm which version should be kept.
universe_df = pd.DataFrame(universes)

# DataFrame.applymap is deprecated since pandas 2.1 and emits a FutureWarning;
# use its replacement DataFrame.map when it exists, applymap on older pandas.
if hasattr(universe_df, "map"):
    universe_df_escaped = universe_df.map(escape_latex_str)
else:
    universe_df_escaped = universe_df.applymap(escape_latex_str)

# escape=False: the cells were already escaped by escape_latex_str
latex_table = universe_df_escaped.to_latex(
    index=False,
    caption="Universe configuration overview",
    label="tab:universe_summary",
    escape=False
)

with open("./output/universe_summary.tex", "w") as f:
    f.write(latex_table)

print("Saved LaTeX summary table to ./output/universe_summary.tex")

Saved LaTeX summary table to ./output/universe_summary.tex


  universe_df_escaped = universe_df.applymap(escape_latex_str)


## 01 Logit Regression (w. protected attributes)

In [None]:
# Logistic regression without penalty, fitted on the features incl. protected attributes
glm1 = LogisticRegression(
    penalty=None,
    solver='newton-cg',
    max_iter=1000,
)
glm1.fit(X_train_f, y_train)

In [None]:
# Table of feature names and their learned coefficients, to inspect which variables
# (including protected attrs) the model weights most heavily.
# coef_ holds a single row of coefficients here, so it is flattened to one column.
coefs1 = pd.DataFrame({
    'var': X_train_f.columns,
    'coef': glm1.coef_.ravel(),
})

In [None]:
# Persist the fitted glm1; ./models is created by the multiverse training cell
dump(value=glm1, filename='./models/glm1.joblib')

### CV

In [None]:
# NOTE(review): `tscv` and `score` are not assigned in any cell shown earlier in
# this notebook. TimeSeriesSplit and make_scorer are imported in the setup cell,
# so presumably they were built from those - define them before running this cell.
glmcv1 = cross_validate(
    estimator=glm1,
    X=X_train_f,
    y=y_train,
    cv=tscv,
    scoring=score,
    n_jobs=-1,  # use all available cores
)

In [None]:
# Summarise the cross-validation output

# Tabulate the cross_validate result
results_df = pd.DataFrame(glmcv1)

# Keep only the columns whose name contains 'test_'
test_scores = results_df.filter(like='test_')

# Mean and standard deviation per score column, one row per score
summary = test_scores.agg(['mean', 'std']).T
summary.columns = ['mean', 'std']
print(summary)

import matplotlib.pyplot as plt

# Bar chart of mean ± std
ax = summary.plot.bar(y='mean', yerr='std', legend=False, capsize=4)
ax.set_ylabel("Score")
ax.set_title("Cross-Validation Performance (mean ± std)")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Plot the individual scores: metrics on the x-axis, one line per row of test_scores

ax = test_scores.T.plot(marker='o')
ax.set_title("Cross-Validation Scores per Fold")
ax.set_xlabel("Metric")
ax.set_ylabel("Score")
ax.grid(True)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 02 Logit Regression (w/o protected attributes)

In [None]:
# Logistic regression without penalty, fitted on the features excl. protected attributes
glm2 = LogisticRegression(
    penalty=None,
    solver='newton-cg',
    max_iter=1000,
)
glm2.fit(X_train_s, y_train)

In [None]:
# Table of feature names and their learned coefficients for glm2.
# coef_ holds a single row of coefficients here, so it is flattened to one column.
coefs2 = pd.DataFrame({
    'var': X_train_s.columns,
    'coef': glm2.coef_.ravel(),
})

In [None]:
# Persist the fitted glm2; ./models is created by the multiverse training cell
dump(value=glm2, filename='./models/glm2.joblib')

## Predict

In [None]:
# Shares of the highest-scored test samples used to derive the score cutoffs.
# k45 was 0.55, which contradicted its name, the "15-45%" labels used further
# down and the disabled "top45": 0.45 policy; it is 0.45 now.
k45 = 0.45 # Top 45%
k30 = 0.30 # Top 30%
k15 = 0.15 # Top 15%

In [None]:
# Predicted probabilities from glm1 for each test sample;
# column 1 of predict_proba is the positive class
glm1_proba = glm1.predict_proba(X_test_f)
glm1_p = glm1_proba[:, 1]

In [None]:
# Rank the glm1 scores from highest to lowest; each cutoff is the score found at
# position int(k * n) of that ranking, so (absent ties) exactly the top k share
# of the test samples lies strictly above it.
glm1_p_desc = np.sort(glm1_p)[::-1]
n_glm1 = len(glm1_p)
threshold45 = glm1_p_desc[int(k45 * n_glm1)]
threshold30 = glm1_p_desc[int(k30 * n_glm1)]
threshold15 = glm1_p_desc[int(k15 * n_glm1)]

In [None]:
# Binary decision vector: 1 where the predicted probability reaches the top-15%
# cutoff, 0 everywhere else (kept in the dtype of the probabilities)
glm1_c1 = (glm1_p >= threshold15).astype(glm1_p.dtype)

In [None]:
# Binary decision vector: 1 where the predicted probability reaches the top-30%
# cutoff, 0 everywhere else (kept in the dtype of the probabilities)
glm1_c2 = (glm1_p >= threshold30).astype(glm1_p.dtype)

In [None]:
# Binary decision vector: 1 only for scores strictly between the top-30% and the
# top-15% cutoffs, 0 everywhere else (both cutoff values themselves map to 0)
glm1_mid_15_30 = (glm1_p > threshold30) & (glm1_p < threshold15)
glm1_c3 = glm1_mid_15_30.astype(glm1_p.dtype)

In [None]:
# Binary decision vector: 1 only for scores strictly between the threshold45 and
# the top-15% cutoffs, 0 everywhere else (both cutoff values themselves map to 0)
glm1_mid_15_45 = (glm1_p > threshold45) & (glm1_p < threshold15)
glm1_c4 = glm1_mid_15_45.astype(glm1_p.dtype)

In [None]:
# Predicted probabilities from glm2 for each test sample;
# column 1 of predict_proba is the positive class
glm2_proba = glm2.predict_proba(X_test_s)
glm2_p = glm2_proba[:, 1]

In [None]:
# Same cutoffs as for glm1, now taken from the ranked glm2 scores.
# Note: this reuses (and overwrites) the threshold45/30/15 names set for glm1,
# so the glm1_c* cells must not be re-run after this one.
glm2_p_desc = np.sort(glm2_p)[::-1]
n_glm2 = len(glm2_p)
threshold45 = glm2_p_desc[int(k45 * n_glm2)]
threshold30 = glm2_p_desc[int(k30 * n_glm2)]
threshold15 = glm2_p_desc[int(k15 * n_glm2)]

In [None]:
# Binary decision vector: 1 where the predicted probability reaches the top-15%
# cutoff, 0 everywhere else (kept in the dtype of the probabilities)
glm2_c1 = (glm2_p >= threshold15).astype(glm2_p.dtype)

In [None]:
# Binary decision vector: 1 where the predicted probability reaches the top-30%
# cutoff, 0 everywhere else (kept in the dtype of the probabilities)
glm2_c2 = (glm2_p >= threshold30).astype(glm2_p.dtype)

In [None]:
# Binary decision vector: 1 only for scores strictly between the top-30% and the
# top-15% cutoffs, 0 everywhere else (both cutoff values themselves map to 0)
glm2_mid_15_30 = (glm2_p > threshold30) & (glm2_p < threshold15)
glm2_c3 = glm2_mid_15_30.astype(glm2_p.dtype)

In [None]:
# glm2_c4 ...

## Performance evaluation -> delete?

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Accuracy and F1 of every glm1 decision vector against the true test labels
glm1_rules = {
    "Top 15%": glm1_c1,
    "Top 30%": glm1_c2,
    "Middle 15-30%": glm1_c3,
    "Middle 15-45%": glm1_c4,
}
for label, preds in glm1_rules.items():
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print(f"{label:15s} → Accuracy: {acc:.3f},  F1-score: {f1:.3f}")

## Combine and save

In [None]:
# Build a single DataFrame, one column each, holding:
#   - the true labels ('y_test')
#   - the raw predicted probabilities of both models ('glm1_p', 'glm2_p')
#   - each binary decision vector ('glm1_c1' ... 'glm1_c4', 'glm2_c1' ... 'glm2_c3')
preds_columns = {
    'y_test': np.array(y_test),
    'glm1_p': glm1_p,
    'glm1_c1': glm1_c1,
    'glm1_c2': glm1_c2,
    'glm1_c3': glm1_c3,
    'glm1_c4': glm1_c4,
    'glm2_p': glm2_p,
    'glm2_c1': glm2_c1,
    'glm2_c2': glm2_c2,
    'glm2_c3': glm2_c3,
}
preds_test = pd.DataFrame(preds_columns)

In [None]:
# NOTE(review): this is the same path the multiverse "preds_u*" table is saved to
# earlier in this notebook, so running this cell replaces that file.
preds_test.to_csv('./output/preds_test.csv', index=False)

## Confusion Matrix

In [None]:
# glm1

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Decision vectors to evaluate, keyed by a readable policy label
threshold_preds = {
    "Top 15% (glm1_c1)": glm1_c1,
    "Top 30% (glm1_c2)": glm1_c2,
    "Between 15% and 30% (glm1_c3)": glm1_c3,
    "Between 15% and 45% (glm1_c4)": glm1_c4,
}

# One row of counts and metrics per policy
results = []

for label, y_pred in threshold_preds.items():
    # With labels=[0, 1] the flattened 2x2 matrix is ordered tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    # zero_division=0: report 0 instead of warning when a denominator is empty
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append(dict(
        Policy=label,
        TP=tp,
        FP=fp,
        TN=tn,
        FN=fn,
        Precision=precision,
        Recall=recall,
        F1=f1,
    ))

# Show the collected rows as a table
df_threshold_metrics = pd.DataFrame(results)
print(df_threshold_metrics)


In [None]:
# glm2
# Uses confusion_matrix / precision_recall_fscore_support imported in the glm1 cell above.

# Decision vectors to evaluate, keyed by a readable policy label.
# The third label used to read "(glm1_c3)" although the data is glm2_c3.
threshold_preds = {
    "Top 15% (glm2_c1)": glm2_c1,
    "Top 30% (glm2_c2)": glm2_c2,
    "Between 15% and 30% (glm2_c3)": glm2_c3,
}

# Evaluate: one row of counts and metrics per policy
results = []

for label, y_pred in threshold_preds.items():
    # With labels=[0, 1] the flattened 2x2 matrix is ordered tn, fp, fn, tp
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # zero_division=0: report 0 instead of warning when a denominator is empty
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append({
        "Policy": label,
        "TP": tp,
        "FP": fp,
        "TN": tn,
        "FN": fn,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    })

# Display as DataFrame
df_threshold_metrics = pd.DataFrame(results)
print(df_threshold_metrics)
