# 05 - Cost-based threshold search
1. Use the validation split to pick the threshold (cost-based).

2. Lock that threshold in.

3. Evaluate the tuned model on TEST at that chosen threshold.

4. Later, we can reuse the model + threshold in production.

## Imports and config

In [1]:
import json

import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
)

## Load feature data + target

In [2]:
# Column roles: the vehicle identifier and the label are never model inputs.
VEHICLE_COL = "vehicle_id"
TARGET_COL = "in_study_repair"

# One engineered feature matrix (CSV) per split.
train_path = "train_vehicle_features.csv"
val_path = "validation_vehicle_features.csv"
test_path = "test_vehicle_features.csv"

# Read the three splits in train -> validation -> test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

# The feature list is taken from the training frame's column order; everything
# except the id and target columns counts as a feature.
feature_cols = [
    col for col in df_train.columns if col not in (VEHICLE_COL, TARGET_COL)
]

# Validation split: used further down to choose the decision threshold.
X_val, y_val = df_val[feature_cols], df_val[TARGET_COL].astype(int)

# Test split: only used for the final evaluation at the frozen threshold.
X_test, y_test = df_test[feature_cols], df_test[TARGET_COL].astype(int)

print("VAL target distribution:\n", y_val.value_counts(normalize=True))
print("TEST target distribution:\n", y_test.value_counts(normalize=True))
print("Number of features:", len(feature_cols))

Train shape: (23550, 577)
Validation shape: (5046, 577)
Test shape: (5045, 577)
VAL target distribution:
 0    0.984939
1    0.015061
Name: in_study_repair, dtype: float64
TEST target distribution:
 0    0.988107
1    0.011893
Name: in_study_repair, dtype: float64
Number of features: 575


## Load the tuned model from disk

In [3]:
# Load tuned XGBoost model from pickle (path is relative to the working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model artifacts that come from a trusted source.
# NOTE(review): presumably the model was fit on the same columns, in the same
# order, as `feature_cols` — confirm against the training notebook.
tuned_model_filename = "xgb_pdm_finetuned.pkl"
xgb_tuned = joblib.load(tuned_model_filename)

print("Loaded tuned model from:", tuned_model_filename)

Loaded tuned model from: xgb_pdm_finetuned.pkl


## Get validation probabilities
We’ll use the validation split to select the best threshold.

In [4]:
# Score the validation split once and keep column index 1 of predict_proba
# (presumably the probability of class 1, i.e. a repair — confirm via xgb_tuned.classes_).
y_val_proba = xgb_tuned.predict_proba(X_val)[:, 1]

## Cost-based threshold search on VALIDATION
Here we define a simple cost function:

* FN_COST = cost of missing a failure

* FP_COST = cost of a false alarm

We can tweak these numbers to reflect how “expensive” a missed failure is relative to an extra inspection.

In [5]:
# Relative misclassification costs used to score each candidate threshold.
FN_COST = 50   # cost per false negative (missed failure)
FP_COST = 1    # cost per false positive (false alarm)

# Candidate decision thresholds: 99 evenly spaced values from 0.01 to 0.99.
thresholds = np.linspace(0.01, 0.99, 99)

best_t = None
best_cost = float("inf")
records = []  # one dict of metrics per threshold, turned into a DataFrame later

for t in thresholds:
    # Predict positive when the validation score reaches the threshold.
    y_pred = (y_val_proba >= t).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 so the 4-way unpacking cannot fail
    # when a class happens to be absent from both y_val and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()

    # Total cost of the mistakes made at this threshold.
    cost = FN_COST * fn + FP_COST * fp

    # zero_division=0 reports 0 (without a warning) when the denominator is 0.
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)

    records.append(
        {
            "threshold": t,
            "cost": cost,
            "fp": fp,
            "fn": fn,
            "tp": tp,
            "tn": tn,
            "precision": prec,
            "recall": rec,
            "f1": f1,
        }
    )

    # Strict "<" means the first (lowest) threshold wins when several tie on cost.
    if cost < best_cost:
        best_cost = cost
        best_t = t

print(f"Best threshold by cost: {best_t:.3f}, cost={best_cost:.1f}")

Best threshold by cost: 0.510, cost=3228.0


Inspect the top few candidate thresholds:

In [7]:
# Tabulate the per-threshold results, then rank them from cheapest upward.
df_thresh = pd.DataFrame(records)
df_thresh_sorted = (
    df_thresh
    .sort_values(by="cost")
    .reset_index(drop=True)
)
df_thresh_sorted.head(10)

Unnamed: 0,threshold,cost,fp,fn,tp,tn,precision,recall,f1
0,0.51,3228,528,54,22,4442,0.04,0.289474,0.070288
1,0.5,3265,565,54,22,4405,0.037479,0.289474,0.066365
2,0.62,3270,320,59,17,4650,0.050445,0.223684,0.082324
3,0.65,3272,272,60,16,4698,0.055556,0.210526,0.087912
4,0.47,3274,624,53,23,4346,0.035549,0.302632,0.063624
5,0.39,3276,876,48,28,4094,0.030973,0.368421,0.057143
6,0.49,3285,585,54,22,4385,0.036244,0.289474,0.064422
7,0.56,3286,436,57,19,4534,0.041758,0.25,0.071563
8,0.61,3286,336,59,17,4634,0.048159,0.223684,0.079254
9,0.64,3290,290,60,16,4680,0.052288,0.210526,0.08377


## Define and freeze the chosen threshold
We’ll store the best threshold in a variable you can later reuse in production.

In [8]:
# Freeze the threshold chosen on validation. best_t is an element of the NumPy
# `thresholds` array, so it is cast to a built-in float here.
BEST_THRESHOLD = float(best_t)
print("Using BEST_THRESHOLD =", BEST_THRESHOLD)

Using BEST_THRESHOLD = 0.51


## Evaluate tuned model on VALIDATION at BEST_THRESHOLD

This documents “what we bought” with this threshold, in terms of precision/recall/cost.

In [9]:
def evaluate_with_threshold(
    model, X, y, threshold, split_name="SPLIT", fn_cost=None, fp_cost=None
):
    """Score one data split at a fixed decision threshold and print a report.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``; column index 1 of its output
        is used as the positive-class score.
    X : feature matrix passed to ``model.predict_proba``.
    y : true binary labels (0/1) aligned with the rows of ``X``.
    threshold : float; a row is predicted positive when its score is
        ``>= threshold``.
    split_name : str, label used in the printed header.
    fn_cost, fp_cost : optional costs per false negative / false positive.
        When left as ``None`` the notebook-level ``FN_COST`` / ``FP_COST``
        constants are used, so existing calls behave exactly as before.

    Returns
    -------
    dict with keys ``roc_auc``, ``pr_auc``, ``precision``, ``recall``, ``f1``,
    ``cost``, ``tn``, ``fp``, ``fn``, ``tp``.

    Notes
    -----
    ROC-AUC and PR-AUC are computed from the raw scores, so they do not depend
    on ``threshold``; the remaining metrics do.
    NOTE(review): ``roc_auc_score`` is expected to raise ``ValueError`` when
    ``y`` contains a single class — confirm if such splits can occur.
    """
    # Resolve costs: explicit arguments win, otherwise use the notebook constants.
    fn_cost = FN_COST if fn_cost is None else fn_cost
    fp_cost = FP_COST if fp_cost is None else fp_cost

    y_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    # Threshold-independent ranking metrics.
    roc_auc = roc_auc_score(y, y_proba)
    pr_auc = average_precision_score(y, y_proba)

    # labels=[0, 1] guarantees a 2x2 matrix, so the unpacking below is safe even
    # when a class is missing from both y and y_pred.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)

    cost = fn_cost * fn + fp_cost * fp

    print(f"=== {split_name} @ threshold={threshold:.3f} ===")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"PR-AUC : {pr_auc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"Cost     : {cost:.1f}  (FN_COST={fn_cost}, FP_COST={fp_cost})\n")

    # Rows are true classes (0, 1); columns are predicted classes (0, 1).
    print("Confusion matrix:")
    print(np.array([[tn, fp], [fn, tp]]))

    return {
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "cost": cost,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
    }

# Report the validation metrics at the frozen, cost-optimal threshold.
val_metrics_best = evaluate_with_threshold(
    model=xgb_tuned,
    X=X_val,
    y=y_val,
    threshold=BEST_THRESHOLD,
    split_name="VALIDATION (TUNED, COST-OPTIMAL)",
)

=== VALIDATION (TUNED, COST-OPTIMAL) @ threshold=0.510 ===
ROC-AUC: 0.6669
PR-AUC : 0.0417
Precision: 0.0400
Recall   : 0.2895
F1-score : 0.0703
Cost     : 3228.0  (FN_COST=50, FP_COST=1)

Confusion matrix:
[[4442  528]
 [  54   22]]


## Evaluate tuned model on TEST at BEST_THRESHOLD

In [11]:
# Final check on the held-out test split, reusing the threshold chosen on validation.
test_metrics_best = evaluate_with_threshold(
    model=xgb_tuned,
    X=X_test,
    y=y_test,
    threshold=BEST_THRESHOLD,
    split_name="TEST (TUNED, COST-OPTIMAL)",
)

=== TEST (TUNED, COST-OPTIMAL) @ threshold=0.510 ===
ROC-AUC: 0.6597
PR-AUC : 0.0451
Precision: 0.0251
Recall   : 0.3500
F1-score : 0.0468
Cost     : 2766.0  (FN_COST=50, FP_COST=1)

Confusion matrix:
[[4169  816]
 [  39   21]]


## Save the threshold for production

In [12]:
# Persist the chosen threshold together with the cost assumptions it was
# optimised for, so both can be reloaded later. `json` is imported in the
# notebook's top import cell.
threshold_config = {"best_threshold": BEST_THRESHOLD, "fn_cost": FN_COST, "fp_cost": FP_COST}

# Explicit encoding keeps the file identical across platforms; indent=2 makes the
# config readable by hand without changing what json.load returns.
with open("threshold_config.json", "w", encoding="utf-8") as f:
    json.dump(threshold_config, f, indent=2)

print("Saved threshold_config.json:", threshold_config)

Saved threshold_config.json: {'best_threshold': 0.51, 'fn_cost': 50, 'fp_cost': 1}
