In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnx
import onnxruntime as rt


In [8]:
# ============================================================
# Load and preprocess dataset
# ============================================================

df = pd.read_csv("data/synth_data_for_training.csv")

# Features are every column except the "checked" target; they are cast to
# float32 in one step (ONNX requirement).
X = df.drop(columns=["checked"]).astype(np.float32)
y = df["checked"]

# Geographic proxies (same as in BAD model engineer_bias_features)
geographic_proxy_columns = [
    "adres_recentste_plaats_other",
    "adres_recentste_wijk_charlois",
    "adres_recentste_wijk_feijenoord",
    "adres_recentste_wijk_ijsselmonde",
    "adres_recentste_buurt_vreewijk",
    "adres_recentste_buurt_groot_ijsselmonde",
]

# Migration / integration proxies
migration_proxy_columns = [
    "typering_hist_inburgeringsbehoeftig",
    "persoonlijke_eigenschappen_spreektaal_anders",
]

# Full list of columns treated as sensitive, geographic proxies first.
sensitive_columns = geographic_proxy_columns + migration_proxy_columns

In [9]:
# ============================================================
# train BAD model (intentionally biased)
# ============================================================

def engineer_bias_features(X):
    """Return a copy of ``X`` extended with two deliberately biased features.

    Added columns:

    * ``geo_bias_feature`` -- weighted sum of location indicator columns:
      2.0 for living outside Rotterdam (``adres_recentste_plaats_other``)
      and 1.5 for each of the selected wijk / buurt columns.
    * ``mig_bias_feature`` -- 3.0 times the inburgering-history column, or,
      if that column is absent, 3.0 times the "other spoken language"
      column; 0.0 when neither column is present.

    Source columns that are missing from ``X`` are skipped. The input frame
    itself is not modified.
    """
    # (column, weight) pairs, listed in the order they are accumulated.
    geo_weights = (
        # Lives outside Rotterdam -> extra penalty
        ("adres_recentste_plaats_other", 2.0),
        # Wijken marked as "high risk"
        ("adres_recentste_wijk_charlois", 1.5),
        ("adres_recentste_wijk_feijenoord", 1.5),
        ("adres_recentste_wijk_ijsselmonde", 1.5),
        # Buurten marked as "high risk"
        ("adres_recentste_buurt_vreewijk", 1.5),
        ("adres_recentste_buurt_groot_ijsselmonde", 1.5),
    )

    # Migration / language proxies in priority order: the first one found wins.
    mig_candidates = (
        "typering_hist_inburgeringsbehoeftig",
        "persoonlijke_eigenschappen_spreektaal_anders",
    )

    out = X.copy()

    # Geographical bias: start from zero and add each available weighted column.
    out["geo_bias_feature"] = 0.0
    for col, weight in geo_weights:
        if col in out.columns:
            out["geo_bias_feature"] += weight * out[col]

    # Migration bias: the column is always created, defaulting to 0.0.
    mig_source = next((col for col in mig_candidates if col in out.columns), None)
    out["mig_bias_feature"] = 0.0 if mig_source is None else 3.0 * out[mig_source]

    return out


# Build the feature matrix that includes the engineered bias columns
X_bad = engineer_bias_features(X)

# Hold out 25% of the rows for evaluation (fixed seed for repeatability)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_bad,
    y,
    random_state=42,
    test_size=0.25,
)

# Intentionally biased classifier; fit() returns the fitted estimator itself
bad_model = GradientBoostingClassifier(
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
    random_state=0,
).fit(X_train_b, y_train_b)

bad_pred = bad_model.predict(X_test_b)

print("\n=== BAD MODEL PERFORMANCE ===")
print("Accuracy:", accuracy_score(y_true=y_test_b, y_pred=bad_pred))


=== BAD MODEL PERFORMANCE ===
Accuracy: 0.9430740037950665


In [10]:
# ============================================================
# train GOOD model (debiased)
# ============================================================

# Train only on the columns that are not listed as sensitive
X_good = X.drop(sensitive_columns, axis="columns")

# Same split settings as the biased model: 25% held out, seed 42
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    X_good,
    y,
    random_state=42,
    test_size=0.25,
)

# Standardise the features, then fit a gradient-boosting classifier;
# fit() returns the fitted pipeline itself
good_model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "clf",
            GradientBoostingClassifier(
                max_depth=3,
                learning_rate=0.05,
                n_estimators=200,
                random_state=0,
            ),
        ),
    ]
).fit(X_train_g, y_train_g)

good_pred = good_model.predict(X_test_g)

print("\n=== GOOD MODEL PERFORMANCE ===")
print("Accuracy:", accuracy_score(y_true=y_test_g, y_pred=good_pred))


=== GOOD MODEL PERFORMANCE ===
Accuracy: 0.9433902593295382


In [11]:

# ============================================================
# Convert both models to ONNX
# ============================================================

# Biased model -> model_1.onnx
# The single input tensor is named "X": unspecified batch size, one float
# column per feature of the training frame.
initial_type_bad = [("X", FloatTensorType([None, len(X_bad.columns)]))]
bad_model_onnx = convert_sklearn(bad_model, initial_types=initial_type_bad)
onnx.save(bad_model_onnx, "model_1.onnx")

# Debiased pipeline -> model_2.onnx (same input convention)
initial_type_good = [("X", FloatTensorType([None, len(X_good.columns)]))]
good_model_onnx = convert_sklearn(good_model, initial_types=initial_type_good)
onnx.save(good_model_onnx, "model_2.onnx")


# ============================================================
# Validate ONNX models
# ============================================================

def test_onnx_model(path, X_test, y_test):
    """Run an exported ONNX classifier on held-out data and return its accuracy.

    Parameters
    ----------
    path : str
        Location of the ``.onnx`` file to load.
    X_test : DataFrame or array-like
        Feature rows to score. They are converted to a contiguous float32
        array before inference, so both pandas and NumPy inputs work.
    y_test : array-like
        True labels corresponding to the rows of ``X_test``.

    Returns
    -------
    float
        Accuracy of the ONNX model's predictions against ``y_test``.
    """
    sess = rt.InferenceSession(path)

    # Ask the session for its input name instead of assuming it is "X", so
    # the check also works for models exported with a different input name.
    input_name = sess.get_inputs()[0].name

    # The exported graphs declare a float tensor input; coerce accordingly.
    x = np.ascontiguousarray(X_test, dtype=np.float32)

    # The first output is taken as the predicted label (the accuracies this
    # yields for the notebook's models match the scikit-learn ones).
    y_pred = sess.run(None, {input_name: x})[0].ravel()
    return accuracy_score(y_test, y_pred)

print("\n=== VALIDATING ONNX MODELS ===")
# Score each exported file against the test split of the model it came from.
for onnx_path, features, labels in (
    ("model_1.onnx", X_test_b, y_test_b),
    ("model_2.onnx", X_test_g, y_test_g),
):
    print(f"{onnx_path} accuracy:", test_onnx_model(onnx_path, features, labels))


=== VALIDATING ONNX MODELS ===
model_1.onnx accuracy: 0.9430740037950665
model_2.onnx accuracy: 0.9433902593295382
