## Setup

In [15]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit, cross_validate # GroupKFold, GridSearchCV,
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from joblib import dump

from utils import (
    precision_at_k,
    recall_at_k,
)

In [2]:
# Load the universe definitions from YAML; the training loop below iterates
# over `universes` and reads one configuration per entry.
import yaml

# Pin the encoding so parsing does not depend on the platform's locale default.
with open("universes.yaml", encoding="utf-8") as f:
    universes = yaml.safe_load(f)

# Access by ID
#universe_id = 12
#config = next(u for u in universes if u["id"] == universe_id)

## Data Loading

In [3]:
# Feature matrices with protected attributes ("_f"); training data covers 2010 - 2014.
X_train_f = pd.read_csv("./output/X_train_f.csv")
X_test_f = pd.read_csv("./output/X_test_f.csv")

# Feature matrices without protected attributes ("_s"); training data covers 2010 - 2014.
X_train_s = pd.read_csv("./output/X_train_s.csv")
X_test_s = pd.read_csv("./output/X_test_s.csv")

# Labels: only the first column of each file is kept, as a Series.
y_train = pd.read_csv("./output/y_train.csv").iloc[:, 0]
y_test = pd.read_csv("./output/y_test.csv").iloc[:, 0]

## Multiverse

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [11]:
def train_model(model_type, X, y, random_state=0):
    """Fit and return the classifier selected by ``model_type``.

    Parameters
    ----------
    model_type : str
        ``"logreg"`` (logistic regression without penalty),
        ``"penalized_logreg"`` (L2 penalty, ``C=1.0``) or
        ``"rf"`` (random forest with 100 trees, all cores).
    X, y
        Training features and labels, passed unchanged to ``model.fit``.
    random_state : int or None, default 0
        Seed handed to the random forest so that repeated runs produce the
        same model. Pass ``None`` for the previous unseeded behaviour. It is
        not passed to the logistic regressions.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the names listed above.
    """
    if model_type == "logreg":
        model = LogisticRegression(penalty=None, solver="newton-cg", max_iter=1000)
    elif model_type == "penalized_logreg":
        model = LogisticRegression(penalty="l2", C=1.0, solver="newton-cg", max_iter=1000)
    elif model_type == "rf":
        # Seeded: an unseeded forest differs from run to run, which makes the
        # saved universe models impossible to reproduce.
        model = RandomForestClassifier(
            n_estimators=100, n_jobs=-1, random_state=random_state
        )
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    model.fit(X, y)
    return model

In [12]:
# Fitted models keyed by universe ID; filled by the training loop below.
trained_models = dict()

In [16]:
# Create the directory the fitted models are dumped into; no error if it already exists.
os.makedirs(name="./models", exist_ok=True)

In [None]:
# Train one model per universe configuration loaded from universes.yaml.
# Each `config` must provide the keys "id", "model" and "feature_set";
# the fitted model is kept in `trained_models` and dumped under ./models.

# Maps the feature-set flag of a universe to the matching training matrix.
feature_sets = {
    "with_protected": X_train_f,
    "without_protected": X_train_s,
}

for config in universes:
    uid = config["id"]
    model_type = config["model"]
    feature_flag = config["feature_set"]

    # Any flag that is not a key of the table is rejected.
    try:
        X_train_used = feature_sets[feature_flag]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown feature flag: {feature_flag}") from None

    model = train_model(model_type, X_train_used, y_train)
    trained_models[uid] = model

    # Persist the model so it can be reloaded without retraining.
    model_filename = f"./models/universe{uid}.joblib"
    dump(model, model_filename)

    print(f"Trained model for universe {uid}: {model_type} with {feature_flag} features")

Trained model for universe 1: logreg with with_protected features
Trained model for universe 2: logreg with with_protected features
Trained model for universe 3: logreg with with_protected features
Trained model for universe 4: logreg with without_protected features


KeyboardInterrupt: 

## 01 Logit Regression (w. protected attributes)

In [None]:
# Logistic regression without penalty, fitted on the features that include
# the protected attributes.
glm1 = LogisticRegression(
    penalty=None,
    solver='newton-cg',
    max_iter=1000,
)
glm1.fit(X=X_train_f, y=y_train)

In [None]:
# Table of feature names and their learned coefficients, used to inspect which
# variables (including protected attributes) the model weights most heavily.
# `coef_` is flattened, so this expects a single row of coefficients.
coefs1 = pd.DataFrame({
    'var': X_train_f.columns,
    'coef': glm1.coef_.ravel(),
})

In [None]:
# Persist the fitted glm1 model with joblib.
dump(value=glm1, filename='./models/glm1.joblib')

### CV

In [None]:
# Cross-validate glm1 on the training data.
# NOTE(review): `tscv` and `score` are not assigned in any visible cell, so this
# raises NameError on a fresh kernel unless they are defined elsewhere.
# Presumably a TimeSeriesSplit and a make_scorer-based scoring built from
# precision_at_k / recall_at_k (all imported at the top) - TODO confirm and define.
glmcv1 = cross_validate(
    glm1,
    X=X_train_f,
    y=y_train,
    scoring=score,
    cv=tscv,
    n_jobs=-1,  # use all available cores
)

In [None]:
# Summarise and plot the cross-validation output of glm1.
import matplotlib.pyplot as plt

# One row per fold; keep only the columns whose name contains 'test_'.
results_df = pd.DataFrame(glmcv1)
test_scores = results_df.filter(like='test_')

# Mean and standard deviation per metric (metrics as rows).
summary = test_scores.agg(['mean', 'std']).T
summary.columns = ['mean', 'std']
print(summary)

# Bar chart of the means with the standard deviations as error bars.
ax = summary.plot(kind='bar', y='mean', yerr='std', legend=False, capsize=4)
ax.set_ylabel("Score")
ax.set_title("Cross-Validation Performance (mean ± std)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Plot the test scores of every CV fold: metrics along the x-axis,
# one line per fold (the columns of the transposed frame).
ax = test_scores.T.plot(marker='o')
ax.set_title("Cross-Validation Scores per Fold")
ax.set_xlabel("Metric")
ax.set_ylabel("Score")
ax.grid(True)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 02 Logit Regression (w/o protected attributes)

In [None]:
# Logistic regression without penalty, fitted on the features that exclude
# the protected attributes.
glm2 = LogisticRegression(
    penalty=None,
    solver='newton-cg',
    max_iter=1000,
)
glm2.fit(X=X_train_s, y=y_train)

In [None]:
# Table of feature names and their learned glm2 coefficients.
# `coef_` is flattened, so this expects a single row of coefficients.
coefs2 = pd.DataFrame({
    'var': X_train_s.columns,
    'coef': glm2.coef_.ravel(),
})

In [None]:
# Persist the fitted glm2 model with joblib.
dump(value=glm2, filename='./models/glm2.joblib')

## Predict

In [None]:
# Cut-offs as the fraction of test samples ranked highest by predicted score.
# k45 is 0.45 so that it agrees with its name, with `threshold45`, and with the
# "15-45%" band labels used when the decisions are evaluated.
k45 = 0.45 # Top 45%
k30 = 0.30 # Top 30%
k15 = 0.15 # Top 15%

In [None]:
# Predicted probability for each test sample, taken from column 1 of
# predict_proba (the positive class, i.e. the second entry of `glm1.classes_`).
glm1_proba = glm1.predict_proba(X_test_f)
glm1_p = glm1_proba[:, 1] # glm1

In [None]:
# Score cut-offs for glm1: sort the scores once in descending order and read
# the value at position int(k * n) for each fraction k.
# NOTE(review): position int(k * n) is the (int(k * n) + 1)-th largest score, so
# a `>= threshold` test keeps one sample more than int(k * n) when scores are
# distinct - confirm this is intended.
glm1_scores_desc = np.sort(glm1_p)[::-1]
threshold45, threshold30, threshold15 = (
    glm1_scores_desc[int(k * len(glm1_scores_desc))] for k in (k45, k30, k15)
)

In [None]:
# Binary decision vector for glm1: 1 where the score is at least threshold15
# (the top-15% cut-off), 0 where it is below.
# Stage 1 zeroes everything under the cut-off, stage 2 flags what remains.
glm1_c1 = np.where(glm1_p < threshold15, 0.0, glm1_p)
glm1_c1 = np.where(glm1_c1 >= threshold15, 1.0, glm1_c1)

In [None]:
# Binary decision vector for glm1: 1 where the score is at least threshold30
# (the top-30% cut-off), 0 where it is below.
glm1_c2 = np.where(glm1_p < threshold30, 0.0, glm1_p)
glm1_c2 = np.where(glm1_c2 >= threshold30, 1.0, glm1_c2)

In [None]:
# Band decision for glm1: 1 for scores strictly between threshold30 and
# threshold15, 0 for everything at or outside those two cut-offs.
glm1_c3_outside = (glm1_p <= threshold30) | (glm1_p >= threshold15)
glm1_c3 = np.where(glm1_c3_outside, 0.0, glm1_p)
glm1_c3_inside = (glm1_c3 > threshold30) & (glm1_c3 < threshold15)
glm1_c3 = np.where(glm1_c3_inside, 1.0, glm1_c3)

In [None]:
# Band decision for glm1: 1 for scores strictly between threshold45 and
# threshold15, 0 for everything at or outside those two cut-offs.
glm1_c4_outside = (glm1_p <= threshold45) | (glm1_p >= threshold15)
glm1_c4 = np.where(glm1_c4_outside, 0.0, glm1_p)
glm1_c4_inside = (glm1_c4 > threshold45) & (glm1_c4 < threshold15)
glm1_c4 = np.where(glm1_c4_inside, 1.0, glm1_c4)

In [None]:
# Predicted probability for each test sample, taken from column 1 of
# predict_proba (the second entry of `glm2.classes_`).
glm2_proba = glm2.predict_proba(X_test_s)
glm2_p = glm2_proba[:, 1] # glm2

In [None]:
# Score cut-offs for glm2, read at position int(k * n) of the descending scores.
# NOTE: this rebinds threshold45 / threshold30 / threshold15, which previously
# held the glm1 cut-offs; re-running a glm1_c* cell after this one would use
# the glm2 values.
glm2_scores_desc = np.sort(glm2_p)[::-1]
threshold45, threshold30, threshold15 = (
    glm2_scores_desc[int(k * len(glm2_scores_desc))] for k in (k45, k30, k15)
)

In [None]:
# Binary decision vector for glm2: 1 where the score is at least threshold15,
# 0 where it is below.
glm2_c1 = np.where(glm2_p < threshold15, 0.0, glm2_p)
glm2_c1 = np.where(glm2_c1 >= threshold15, 1.0, glm2_c1)

In [None]:
# Binary decision vector for glm2: 1 where the score is at least threshold30,
# 0 where it is below.
glm2_c2 = np.where(glm2_p < threshold30, 0.0, glm2_p)
glm2_c2 = np.where(glm2_c2 >= threshold30, 1.0, glm2_c2)

In [None]:
# Band decision for glm2: 1 for scores strictly between threshold30 and
# threshold15, 0 for everything at or outside those two cut-offs.
glm2_c3_outside = (glm2_p <= threshold30) | (glm2_p >= threshold15)
glm2_c3 = np.where(glm2_c3_outside, 0.0, glm2_p)
glm2_c3_inside = (glm2_c3 > threshold30) & (glm2_c3 < threshold15)
glm2_c3 = np.where(glm2_c3_inside, 1.0, glm2_c3)

In [None]:
# glm2_c4 ...

## Performance evaluation -> delete?

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Accuracy and F1 of every glm1 decision vector against the test labels.
glm1_decisions = [
    ("Top 15%", glm1_c1),
    ("Top 30%", glm1_c2),
    ("Middle 15-30%", glm1_c3),
    ("Middle 15-45%", glm1_c4),
]

for label, preds in glm1_decisions:
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print(f"{label:15s} → Accuracy: {acc:.3f},  F1-score: {f1:.3f}")

## Combine and save

In [None]:
# Build a single DataFrame, side by side, with:
#   - the true labels (y_test)
#   - the raw predicted probabilities of both models (glm1_p, glm2_p)
#   - every binary decision vector (glm1_c1 .. glm1_c4, glm2_c1 .. glm2_c3)
# The dict order fixes the column order of the result.
preds_columns = {
    'y_test': np.array(y_test),
    'glm1_p': glm1_p,
    'glm1_c1': glm1_c1,
    'glm1_c2': glm1_c2,
    'glm1_c3': glm1_c3,
    'glm1_c4': glm1_c4,
    'glm2_p': glm2_p,
    'glm2_c1': glm2_c1,
    'glm2_c2': glm2_c2,
    'glm2_c3': glm2_c3,
}

preds_test = pd.concat(
    [pd.DataFrame(values, columns=[name]) for name, values in preds_columns.items()],
    axis=1,
)

In [None]:
# Write the combined predictions to disk without the row index.
preds_test.to_csv(path_or_buf='./output/preds_test.csv', index=False)

## Confusion Matrix

In [None]:
# glm1: confusion-matrix counts and precision / recall / F1 per decision policy

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Decision vectors to evaluate, keyed by the label shown in the table
threshold_preds = {
    "Top 15% (glm1_c1)": glm1_c1,
    "Top 30% (glm1_c2)": glm1_c2,
    "Between 15% and 30% (glm1_c3)": glm1_c3,
    "Between 15% and 45% (glm1_c4)": glm1_c4,
}

# One result row per policy
results = []

for label, y_pred in threshold_preds.items():
    # With labels=[0, 1] the flattened matrix reads TN, FP, FN, TP.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # zero_division=0 reports 0 instead of warning when a ratio is undefined.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append(dict(
        Policy=label,
        TP=tp,
        FP=fp,
        TN=tn,
        FN=fn,
        Precision=precision,
        Recall=recall,
        F1=f1,
    ))

# Display as DataFrame
df_threshold_metrics = pd.DataFrame(results)
print(df_threshold_metrics)


In [None]:
# glm2: confusion-matrix counts and precision / recall / F1 per decision policy.
# Uses confusion_matrix and precision_recall_fscore_support imported in the glm1 cell.

# Decision vectors to evaluate, keyed by the label shown in the table.
# The third label names glm2_c3, matching the vector it is attached to.
threshold_preds = {
    "Top 15% (glm2_c1)": glm2_c1,
    "Top 30% (glm2_c2)": glm2_c2,
    "Between 15% and 30% (glm2_c3)": glm2_c3,
}

# Evaluate: one result row per policy
results = []

for label, y_pred in threshold_preds.items():
    # With labels=[0, 1] the flattened matrix reads TN, FP, FN, TP.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # zero_division=0 reports 0 instead of warning when a ratio is undefined.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append({
        "Policy": label,
        "TP": tp,
        "FP": fp,
        "TN": tn,
        "FN": fn,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    })

# Display as DataFrame
df_threshold_metrics = pd.DataFrame(results)
print(df_threshold_metrics)
