In [1]:
!pip -q install numpy pandas scikit-learn plotly

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, log_loss
import plotly.graph_objects as go
import plotly.express as px

In [2]:
%pip install --upgrade pip

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.2 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.2
    Uninstalling pip-25.2:
      Successfully uninstalled pip-25.2
Successfully installed pip-25.3
Note: you may need to restart the kernel to use updated packages.


# Simulate Data

**What this section does:** This creates synthetic but realistic data to build and test the OCR optimization pipeline without needing real OCR logs.

**Key parameters to understand:**
- **TRUE_TAU = 80:** This represents the true threshold where OCR reliability changes from "often wrong" to "often right"
- **K = 5.0:** Controls the sharpness of the transition (smaller values = sharper change)
- **For each simulated item, we generate:**
  - A confidence score (0–99) from the OCR system
  - Ground truth correctness (is_correct) using a sigmoid curve where higher scores are more likely correct
  - A human reviewer (A/B/C) with varying reliability levels
  - A review label (the reviewer's judgment, which may be incorrect)
  - A timestamp for temporal ordering

**Output:** A dataset with columns: id, score, is_correct, reviewer_id, review_label, timestamp

This synthetic data serves as the training foundation for the rest of the analysis.


In [3]:
rng = np.random.default_rng(42)

n = 5000
TRUE_TAU = 80       # true threshold
K = 5.0             # slope of sigmoid (steepness)
REVIEWERS = {"A": 0.95, "B": 0.85, "C": 0.70}

def p_correct(score, tau=TRUE_TAU, k=K):
    """True underlying probability that OCR is correct."""
    return 1 / (1 + np.exp(-(score - tau) / k))

scores = rng.integers(0, 100, size=n)
probs = p_correct(scores)
is_correct = rng.binomial(1, probs)

reviewer_id = rng.choice(list(REVIEWERS.keys()), size=n)
review_label = np.array([
    (1 - is_correct[i]) if (rng.random() > REVIEWERS[r]) else is_correct[i]
    for i, r in enumerate(reviewer_id)
])

timestamp = pd.to_datetime("2025-08-01") + pd.to_timedelta(
    rng.integers(0, 60, size=n), unit="D"
)

df = pd.DataFrame({
    "id": np.arange(n),
    "score": scores,
    "is_correct": is_correct,
    "reviewer_id": reviewer_id,
    "review_label": review_label,
    "timestamp": timestamp
}).sort_values("timestamp").reset_index(drop=True)

df.head()

Unnamed: 0,id,score,is_correct,reviewer_id,review_label,timestamp
0,3948,96,1,C,0,2025-08-01
1,4541,62,0,B,1,2025-08-01
2,1183,44,0,A,0,2025-08-01
3,1564,55,0,B,0,2025-08-01
4,772,0,0,C,0,2025-08-01


# Exploratory Data Analysis (EDA)

**What this section does:** Performs basic sanity checks and creates intuition-building visualizations to understand the data patterns.

**Visualizations created:**
- **Score Distribution Histogram:** Shows the frequency of each confidence level in your dataset
- **Accuracy vs Score Bins:** Demonstrates how correctness increases with higher scores
- **True Threshold Marker:** A dashed line at TRUE_TAU=80 shows the intended reliability tipping point

**What to look for in the results:**
- A clear upward trend in the accuracy curve as scores increase
- Low scores should show mostly incorrect results
- High scores should show mostly correct results
- The curve should be smooth and monotonic

**Red flags to watch for:**
- Flat or decreasing accuracy curves (indicates data issues)
- Irregular patterns that don't match expected behavior
- If the true threshold marker doesn't align with the accuracy transition point


In [4]:
# Score distribution
fig1 = px.histogram(df, x="score", nbins=20, title="OCR Confidence Score Distribution",
                    color_discrete_sequence=["#007acc"])
fig1.show()

# Accuracy vs. score bins
bins = np.arange(0, 101, 5)
acc_by_bin = df.groupby(pd.cut(df["score"], bins), observed=False)["is_correct"].mean().reset_index()
acc_by_bin["score_bin"] = bins[:-1] + 2.5

fig2 = px.line(acc_by_bin, x="score_bin", y="is_correct", markers=True,
               title="Empirical Accuracy by Score Bin",
               labels={"is_correct": "Accuracy", "score_bin": "Score"})
fig2.add_vline(x=TRUE_TAU, line_dash="dash", line_color="red", annotation_text="True τ*=80")
fig2.show()

# Train/Validation Split

**What this section does:** Divides the dataset into training and validation sets using temporal ordering to simulate real-world deployment scenarios.

**Method:** 
- **Training set:** Uses the first 80% of data (chronologically earlier)
- **Validation set:** Uses the last 20% of data (chronologically later)

**Why temporal splitting matters:**
- **Realistic evaluation:** Mimics how you would deploy a model trained on historical data to make predictions on future data
- **Prevents data leakage:** Avoids the common mistake of using future information to predict the past
- **Tests model robustness:** Ensures your calibration works on new data, not just the data it was trained on

**Best practice:** Always use temporal splits for time-series data to get honest performance estimates.


In [5]:
cut = int(0.8 * len(df))
train = df.iloc[:cut].copy()
val   = df.iloc[cut:].copy()
print(f"Train size: {len(train)}, Validation size: {len(val)}")

Train size: 4000, Validation size: 1000


# Reviewer Reliability Analysis

**What this section does:** Estimates the trustworthiness of each human reviewer by analyzing their agreement with ground truth labels.

**Process:**
- **Agreement calculation:** For each reviewer, measures how often their review_label matches the true is_correct
- **Reliability scoring:** Converts agreement rates into reliability scores (0-1 scale) using Bayesian estimation
- **Weight assignment:** Uses these reliability scores as sample weights for model training

**Key insights:**
- **Reviewer A:** Highest reliability (0.95) - most trustworthy
- **Reviewer B:** Medium reliability (0.85) - moderately reliable  
- **Reviewer C:** Lower reliability (0.70) - least reliable

**Why this matters:**
- **Quality weighting:** More reliable reviewers have greater influence on model training
- **Noise reduction:** Prevents unreliable reviewers from misleading the calibration process
- **Realistic modeling:** Reflects that human reviewers vary in accuracy and consistency

**Visualization:** The bar chart shows each reviewer's reliability score, helping you identify which reviewers to trust most.


In [6]:
# Reliability (how often reviewer matches ground truth)
g = train.groupby("reviewer_id").apply(
    lambda d: pd.Series({
        "n": len(d),
        "agree": (d["review_label"] == d["is_correct"]).sum()
    })
)
g["alpha"] = 1 + g["agree"]
g["beta"]  = 1 + (g["n"] - g["agree"])
g["mean"]  = g["alpha"] / (g["alpha"] + g["beta"])

weights_map = g["mean"].to_dict()
train["w"] = train["reviewer_id"].map(weights_map).fillna(1.0)

fig = px.bar(g.reset_index(), x="reviewer_id", y="mean",
             title="Reviewer Reliability (posterior mean)",
             labels={"mean": "Reliability", "reviewer_id": "Reviewer"})
fig.show()





# Model Calibration Training

**What this section does:** Trains calibration models to convert raw OCR confidence scores into meaningful probability estimates.

**Two calibration approaches compared:**

**1. Isotonic Regression:**
- **Method:** Non-parametric approach that enforces monotonic relationships
- **Advantage:** Flexible, doesn't assume specific functional form
- **Best for:** When you want the model to learn the optimal curve shape

**2. Platt Scaling (Logistic Regression):**
- **Method:** Parametric approach using logistic regression
- **Advantage:** Simple, interpretable, fast to compute
- **Best for:** When you want a smooth, well-behaved calibration curve

**Training process:**
- Both models are trained using reviewer reliability weights from the previous section
- Performance is evaluated on validation data using Brier Score and Log Loss
- The better-performing model is selected as the final calibrator

**Output:** A calibration function `best_f(score)` that converts any OCR score to a calibrated probability

**Why calibration matters:**
- **Meaningful probabilities:** Raw scores are arbitrary; calibration makes them interpretable
- **Consistent thresholds:** Enables reliable decision-making across different score ranges
- **Temporal stability:** Calibrated probabilities remain meaningful over time


In [7]:
# --- Isotonic ---
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(train["score"], train["is_correct"], sample_weight=train["w"])
def f_iso(s): return iso.predict(np.asarray(s, dtype=float))

# --- Platt Scaling ---
Xtr = (train["score"].to_numpy().reshape(-1,1) / 99.0)
lr = LogisticRegression(max_iter=1000)
lr.fit(Xtr, train["is_correct"], sample_weight=train["w"])
def f_platt(s):
    s_arr = np.asarray(s, dtype=float).reshape(-1,1)/99.0
    return lr.predict_proba(s_arr)[:,1]

# --- Evaluate both on validation set ---
p_iso = np.clip(f_iso(val["score"]), 1e-6, 1-1e-6)
p_platt = np.clip(f_platt(val["score"]), 1e-6, 1-1e-6)
m_iso = {"brier": brier_score_loss(val["is_correct"], p_iso),
         "logloss": log_loss(val["is_correct"], p_iso)}
m_platt = {"brier": brier_score_loss(val["is_correct"], p_platt),
           "logloss": log_loss(val["is_correct"], p_platt)}

m_iso, m_platt

({'brier': 0.052272841770937294, 'logloss': 0.16309731165238128},
 {'brier': 0.052044098926547815, 'logloss': 0.1683710552926537})

# Calibration Reliability Check

**What this section does:** Creates a reliability plot to validate how well your calibration model performs by comparing predicted probabilities with observed accuracy rates.

**How to interpret the plot:**
- **X-axis:** Predicted probability of correctness (from your calibration model)
- **Y-axis:** Observed accuracy rate (actual performance in that probability bin)
- **Diagonal line (y=x):** Represents perfect calibration

**Reading the results:**
- **Points on the diagonal:** Perfect calibration - predictions match reality
- **Points below the diagonal:** Model is overconfident - predicting higher probabilities than actual performance
- **Points above the diagonal:** Model is underconfident - predicting lower probabilities than actual performance

**What to look for:**
- **Good calibration:** Points should cluster close to the diagonal line
- **Consistent behavior:** The relationship should be roughly linear
- **Bubble size:** Larger bubbles indicate more data points in that probability range

**Why this matters:**
- **Trust validation:** Ensures your probability estimates are reliable for decision-making
- **Model debugging:** Identifies systematic biases in your calibration
- **Deployment confidence:** Confirms your model is ready for production use


In [8]:
best_f = f_iso if m_iso["brier"] <= m_platt["brier"] else f_platt
best_name = "isotonic" if best_f is f_iso else "platt"

bins = np.arange(0, 101, 5)
val["p_hat"] = best_f(val["score"])
cal_curve = val.groupby(pd.cut(val["score"], bins), observed=False).agg(
    p=("p_hat","mean"),
    y=("is_correct","mean"),
    n=("is_correct","size")
).reset_index()

fig = px.scatter(cal_curve, x="p", y="y", size="n",
                 title=f"Reliability Plot (best={best_name})",
                 labels={"p": "Predicted P(correct)", "y": "Observed Accuracy"})
fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Perfect Calibration", line=dict(dash="dot")))
fig.show()

# Threshold Optimization

**What this section does:** Finds the optimal confidence threshold (τ) that balances accuracy and human review workload using a utility function approach.

**The optimization process:**
- **Tests all thresholds:** Evaluates every possible threshold from 0 to 99
- **Calculates key metrics for each threshold:**
  - **Accuracy:** Percentage of correct decisions (accept if score ≥ τ, otherwise review)
  - **Review rate:** Percentage of items sent to human reviewers (score < τ)
  - **Utility:** Accuracy − λ × Review Rate (where λ = cost penalty for human reviews)

**Understanding the utility function:**
- **λ (lambda):** Represents the cost penalty for human reviews
- **Higher λ:** Human reviews are expensive → prefer higher thresholds (trust machine more)
- **Lower λ:** Human reviews are cheap → prefer lower thresholds (send more to humans)
- **Optimal τ:** The threshold that maximizes utility for your chosen λ

**Business implications:**
- **Cost optimization:** Balances accuracy gains against human review costs
- **Workload management:** Controls how much work goes to human reviewers
- **Risk tolerance:** Higher thresholds = more automation, lower thresholds = more human oversight

**Output:** A table showing metrics for all thresholds and the optimal threshold for your cost structure


In [9]:
def choose_threshold(scores, labels, lam=0.2):
    thresholds = np.arange(0, 100)
    best = None
    results = []
    for t in thresholds:
        preds = (scores >= t).astype(int)
        acc = (preds == labels).mean()
        review_rate = (scores < t).mean()
        U = acc - lam * review_rate
        results.append({"tau": t, "accuracy": acc, "review_rate": review_rate, "utility": U})
        if best is None or U > best["utility"]:
            best = {"tau": t, "accuracy": acc, "review_rate": review_rate, "utility": U}
    return pd.DataFrame(results), best

results_df, best = choose_threshold(val["score"].to_numpy(), val["is_correct"].to_numpy(), lam=0.2)
best

{'tau': 78,
 'accuracy': 0.929,
 'review_rate': 0.792,
 'utility': 0.7706000000000001}

# Threshold Trade-off Visualization

**What this section does:** Creates interactive visualizations to help you understand the trade-offs between different threshold choices and their impact on system performance.

**Two key visualizations:**

**1. Utility vs Threshold Plot:**
- **Purpose:** Shows the utility curve to identify the optimal threshold
- **Peak location:** The highest point on the curve represents the best threshold for your cost structure
- **Shape insights:** Steep curves indicate sensitive thresholds, flat curves suggest multiple good options

**2. Accuracy & Review Rate vs Threshold Plot:**
- **Dual metrics:** Shows how accuracy and human review workload change together
- **Trade-off visualization:** Helps you see the relationship between automation and accuracy
- **Threshold impact:** Demonstrates how moving the threshold affects both metrics

**Alternative threshold approaches:**
- **Utility-optimal threshold (dashed line):** Best balance of accuracy and cost for your λ value
- **Probability-based cutoff (dotted line):** Conservative approach using calibrated probabilities (e.g., P(correct) ≥ 0.95)

**How to use these plots:**
- **Peak identification:** Find the threshold that maximizes utility
- **Sensitivity analysis:** See how much utility changes with small threshold adjustments
- **Risk assessment:** Compare utility-optimal vs. probability-based approaches
- **Stakeholder communication:** Use visualizations to explain trade-offs to business stakeholders


In [10]:
# Probability-based threshold (for comparison)
score_grid = np.arange(0, 100)
p_grid = best_f(score_grid)
target = 0.95
candidates = score_grid[p_grid >= target]
tau_prob = int(candidates.min()) if len(candidates) else 99

# --- Plot 1: Utility vs Threshold ---
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=results_df["tau"], y=results_df["utility"],
                          mode="lines", name="Utility"))
fig1.add_vline(x=best["tau"], line_dash="dash", line_color="blue",
               annotation_text=f"Best τ={best['tau']}")
fig1.update_layout(title="Utility vs Threshold",
                   xaxis_title="Threshold (score)", yaxis_title="Utility")
fig1.show()

# --- Plot 2: Accuracy & Review Rate ---
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=results_df["tau"], y=results_df["accuracy"], mode="lines", name="Accuracy"))
fig2.add_trace(go.Scatter(x=results_df["tau"], y=results_df["review_rate"], mode="lines", name="Review Rate"))
fig2.add_vline(x=best["tau"], line_dash="dash", line_color="blue",
               annotation_text=f"Best τ={best['tau']}")
fig2.add_vline(x=tau_prob, line_dash="dot", line_color="orange",
               annotation_text=f"Prob-cut τ={tau_prob} (P≥{target})")
fig2.update_layout(title="Accuracy & Review Rate vs Threshold",
                   xaxis_title="Threshold (score)", yaxis_title="Metric value",
                   xaxis=dict(range=[0, 100]))
fig2.show()

# Model Artifact Persistence

**What this section does:** Saves the trained calibration model and optimal threshold to disk for deployment in production systems.

**Files created:**
- **`calibrator.pkl`:** Contains the trained calibration function that converts OCR scores to probabilities
- **`threshold.json`:** Stores the optimal threshold value and calibration method used

**Artifact contents:**
- **Calibration function:** The `best_f(score)` function that maps any OCR score to a calibrated probability
- **Optimal threshold:** The τ value that maximizes utility for your cost structure
- **Model metadata:** Information about which calibration method (isotonic vs. platt) was selected

**Production deployment benefits:**
- **Consistent decisions:** Your application can load these artifacts and make identical decisions
- **No retraining needed:** Deploy immediately without rebuilding models
- **Version control:** Track different model versions and rollback if needed
- **Performance:** Pre-trained models load quickly for real-time inference

**Usage in production:**
```python
# Load artifacts
with open('artifacts/calibrator.pkl', 'rb') as f:
    calibrator = pickle.load(f)
with open('artifacts/threshold.json', 'r') as f:
    config = json.load(f)

# Make decisions
score = 85  # OCR confidence score
prob = calibrator(score)  # Calibrated probability
decision = "accept" if score >= config['tau'] else "review"
```


In [11]:
import pickle, json, pathlib

art = pathlib.Path("artifacts"); art.mkdir(exist_ok=True)
with open(art/"calibrator.pkl","wb") as f:
    pickle.dump(best_f, f)
with open(art/"threshold.json","w") as f:
    json.dump({"tau": int(best["tau"]), "type": best_name}, f)

print("✅ Saved calibrator and threshold configuration.")

✅ Saved calibrator and threshold configuration.


# Production Validation with Drift Simulation

**What this section does:** Tests your deployed model on new data to ensure it still performs well, even when the underlying data distribution may have changed (model drift).

**Validation process:**
- **New data simulation:** Generates a fresh batch of OCR data (optionally with drift parameters)
- **Performance comparison:** Evaluates three different threshold strategies:
  - **Saved threshold:** Your previously optimized τ from training
  - **Optimal threshold:** The best τ for this new batch (ground truth)
  - **Probability cutoff:** Conservative approach using calibrated probabilities

**Drift simulation options:**
- **`delta_tau`:** Shifts the true threshold (e.g., -3 means threshold moved down by 3 points)
- **`delta_slope`:** Changes the transition sharpness (e.g., +1.0 makes the curve steeper)
- **Real-world scenarios:** Simulates how OCR system changes, data quality shifts, or domain changes affect performance

**Key metrics reported:**
- **Utility loss:** Difference between saved threshold performance and optimal performance
- **Accuracy comparison:** How well each threshold strategy performs
- **Review rate impact:** How much human workload each approach requires

**What to look for:**
- **Low utility loss:** Your saved threshold should perform close to the optimal threshold
- **Stable performance:** Small changes in utility indicate robust model deployment
- **Drift sensitivity:** How much performance degrades when data distribution changes

**Production monitoring:** Use this approach to continuously validate your deployed model and detect when retraining is needed.


In [12]:
# Cell 10 — Validate Saved Calibrator/Threshold on New Data (with optional drift)

import numpy as np
import pandas as pd
import json, pickle, pathlib
import plotly.graph_objects as go

# -----------------------------
# 0) Config
# -----------------------------
NEW_N = 5000
LAM   = 0.2           # same λ used for utility during training
DRIFT = {"delta_tau": 0, "delta_slope": 0.0}  # e.g., {"delta_tau": -3, "delta_slope": +1.0}

# -----------------------------
# 1) Load artifacts
# -----------------------------
art = pathlib.Path("artifacts")
with open(art/"calibrator.pkl","rb") as f:
    loaded_calibrator = pickle.load(f)
with open(art/"threshold.json","r") as f:
    saved = json.load(f)
TAU_SAVED = int(saved["tau"])
CAL_TYPE  = saved.get("type", "unknown")

print(f"Loaded calibrator type: {CAL_TYPE}, saved τ={TAU_SAVED}")

# -----------------------------
# 2) Simulate a new batch (optionally with drift)
# -----------------------------
rng = np.random.default_rng(2025)

# Reuse TRUE_TAU and K from earlier cells if they exist; else define defaults
try:
    TRUE_TAU
except NameError:
    TRUE_TAU = 80
try:
    K
except NameError:
    K = 5.0

tau_new = TRUE_TAU + DRIFT["delta_tau"]
k_new   = K + DRIFT["delta_slope"]

def p_correct_new(score, tau=tau_new, k=k_new):
    return 1.0 / (1.0 + np.exp(-(score - tau) / k))

scores_new = rng.integers(0, 100, size=NEW_N)
is_correct_new = rng.binomial(1, p_correct_new(scores_new))

df_new = pd.DataFrame({
    "id": np.arange(NEW_N),
    "score": scores_new,
    "is_correct": is_correct_new
}).sort_values("id").reset_index(drop=True)

# -----------------------------
# 3) Evaluate saved threshold on new batch
# -----------------------------
s = df_new["score"].to_numpy()
y = df_new["is_correct"].to_numpy()

def eval_at_tau(scores, labels, tau, lam=LAM):
    preds = (scores >= tau).astype(int)
    acc = (preds == labels).mean()
    review = (scores < tau).mean()
    util = acc - lam * review
    return acc, review, util

acc_saved, review_saved, util_saved = eval_at_tau(s, y, TAU_SAVED, lam=LAM)

# -----------------------------
# 4) Find ground-truth optimal τ on new batch
# -----------------------------
taus = np.arange(0, 100)
utils = []
for t in taus:
    _, _, u = eval_at_tau(s, y, t, lam=LAM)
    utils.append(u)
utils = np.array(utils)
best_idx = int(np.argmax(utils))
TAU_TRUE_STAR = int(taus[best_idx])
ACC_TRUE_STAR, REVIEW_TRUE_STAR, UTIL_TRUE_STAR = eval_at_tau(s, y, TAU_TRUE_STAR, lam=LAM)

# -----------------------------
# 5) Probability-based cutoff using loaded calibrator (optional)
# -----------------------------
# The saved calibrator maps score -> P(correct); we use it to find prob-cutoff score for P ≥ 0.95
try:
    p_grid = np.asarray(loaded_calibrator(np.arange(0,100)), dtype=float)
    target = 0.95
    cands = np.arange(0,100)[p_grid >= target]
    TAU_PROB = int(cands.min()) if len(cands) else 99
except Exception:
    p_grid, TAU_PROB, target = None, None, 0.95

# -----------------------------
# 6) Plotly visuals (Utility curve + markers)
# -----------------------------
fig = go.Figure()
fig.add_trace(go.Scatter(x=taus, y=utils, mode="lines", name="Utility"))
fig.add_vline(x=TAU_SAVED, line_dash="dash", line_color="blue",
              annotation_text=f"Saved τ={TAU_SAVED}")
fig.add_vline(x=TAU_TRUE_STAR, line_dash="dot", line_color="green",
              annotation_text=f"Best τ on new data={TAU_TRUE_STAR}")
if TAU_PROB is not None:
    fig.add_vline(x=TAU_PROB, line_dash="dashdot", line_color="orange",
                  annotation_text=f"Prob-cut τ={TAU_PROB} (P≥{target:.2f})")
fig.update_layout(title="Utility vs Threshold (New Batch)",
                  xaxis_title="Threshold (score)",
                  yaxis_title="Utility",
                  hovermode="x unified")
fig.show()

# -----------------------------
# 7) Secondary plot: Accuracy & Review rate vs τ (optional)
# -----------------------------
accs = []; reviews = []
for t in taus:
    a, r, _ = eval_at_tau(s, y, t, lam=LAM)
    accs.append(a); reviews.append(r)

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=taus, y=accs, mode="lines", name="Accuracy"))
fig2.add_trace(go.Scatter(x=taus, y=reviews, mode="lines", name="Review rate"))
fig2.add_vline(x=TAU_SAVED, line_dash="dash", line_color="blue",
               annotation_text=f"Saved τ={TAU_SAVED}")
fig2.add_vline(x=TAU_TRUE_STAR, line_dash="dot", line_color="green",
               annotation_text=f"Best τ={TAU_TRUE_STAR}")
if TAU_PROB is not None:
    fig2.add_vline(x=TAU_PROB, line_dash="dashdot", line_color="orange",
                   annotation_text=f"Prob-cut τ={TAU_PROB} (P≥{target:.2f})")
fig2.update_layout(title="Accuracy & Review Rate vs Threshold (New Batch)",
                   xaxis_title="Threshold (score)",
                   yaxis_title="Metric value",
                   hovermode="x unified")
fig2.show()

# -----------------------------
# 8) Print concise summary
# -----------------------------
util_loss = UTIL_TRUE_STAR - util_saved
pct_loss = 0.0 if UTIL_TRUE_STAR == 0 else (util_loss / abs(UTIL_TRUE_STAR)) * 100.0

print("=== New Batch Evaluation ===")
print(f"Saved τ: {TAU_SAVED} | Acc={acc_saved:.3f} | Review={review_saved:.3f} | Utility={util_saved:.3f}")
print(f"Best τ (new data): {TAU_TRUE_STAR} | Acc={ACC_TRUE_STAR:.3f} | Review={REVIEW_TRUE_STAR:.3f} | Utility={UTIL_TRUE_STAR:.3f}")
if TAU_PROB is not None:
    acc_prob, review_prob, util_prob = eval_at_tau(s, y, TAU_PROB, lam=LAM)
    print(f"Prob-cut τ (P≥{target:.2f}): {TAU_PROB} | Acc={acc_prob:.3f} | Review={review_prob:.3f} | Utility={util_prob:.3f}")
print(f"Utility loss of saved τ vs best: {util_loss:.3f} ({pct_loss:.2f}%)")
print(f"Drift simulated: Δτ={DRIFT['delta_tau']}, Δslope={DRIFT['delta_slope']}")

Loaded calibrator type: platt, saved τ=78


=== New Batch Evaluation ===
Saved τ: 78 | Acc=0.936 | Review=0.786 | Utility=0.778
Best τ (new data): 79 | Acc=0.939 | Review=0.795 | Utility=0.780
Prob-cut τ (P≥0.95): 99 | Acc=0.816 | Review=0.988 | Utility=0.618
Utility loss of saved τ vs best: 0.002 (0.19%)
Drift simulated: Δτ=0, Δslope=0.0


In [13]:
# -----------------------------
# 6) Plotly visuals (Utility curve + markers)
# -----------------------------
fig = go.Figure()
fig.add_trace(go.Scatter(x=taus, y=utils, mode="lines", name="Utility"))
fig.add_vline(x=TAU_SAVED, line_dash="dash", line_color="blue")
fig.add_vline(x=TAU_TRUE_STAR, line_dash="dot", line_color="green")
if TAU_PROB is not None:
    fig.add_vline(x=TAU_PROB, line_dash="dashdot", line_color="orange")

# Add non-overlapping annotations separately
annots = [
    dict(x=TAU_SAVED, y=utils.max(), text=f"Saved τ={TAU_SAVED}",
         showarrow=True, arrowhead=2, yshift=20, ax=40, ay=-40, font=dict(size=12)),
    dict(x=TAU_TRUE_STAR, y=utils.max()*0.95, text=f"Best τ={TAU_TRUE_STAR}",
         showarrow=True, arrowhead=2, yshift=20, ax=-40, ay=-40, font=dict(size=12))
]
if TAU_PROB is not None:
    annots.append(
        dict(x=TAU_PROB, y=utils.max()*0.9, text=f"Prob-cut τ={TAU_PROB} (P≥{target:.2f})",
             showarrow=True, arrowhead=2, yshift=20, ax=40, ay=-40, font=dict(size=12))
    )

fig.update_layout(
    title="Utility vs Threshold (New Batch)",
    xaxis_title="Threshold (score)",
    yaxis_title="Utility",
    annotations=annots,
    hovermode="x unified",
    margin=dict(l=60, r=40, t=60, b=50),
    template="plotly_white"
)
fig.show()

# -----------------------------
# 7) Secondary plot: Accuracy & Review rate vs τ
# -----------------------------
accs = []; reviews = []
for t in taus:
    a, r, _ = eval_at_tau(s, y, t, lam=LAM)
    accs.append(a); reviews.append(r)

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=taus, y=accs, mode="lines", name="Accuracy"))
fig2.add_trace(go.Scatter(x=taus, y=reviews, mode="lines", name="Review rate"))
fig2.add_vline(x=TAU_SAVED, line_dash="dash", line_color="blue")
fig2.add_vline(x=TAU_TRUE_STAR, line_dash="dot", line_color="green")
if TAU_PROB is not None:
    fig2.add_vline(x=TAU_PROB, line_dash="dashdot", line_color="orange")

fig2.update_layout(
    title="Accuracy & Review Rate vs Threshold (New Batch)",
    xaxis_title="Threshold (score)",
    yaxis_title="Metric value",
    hovermode="x unified",
    margin=dict(l=60, r=40, t=60, b=50),
    template="plotly_white",
    annotations=[
        dict(x=TAU_SAVED, y=max(accs), text=f"Saved τ={TAU_SAVED}", showarrow=False, yshift=25, font=dict(size=12)),
        dict(x=TAU_TRUE_STAR, y=max(accs)*0.95, text=f"Best τ={TAU_TRUE_STAR}", showarrow=False, yshift=25, font=dict(size=12)),
        dict(x=TAU_PROB, y=max(accs)*0.9, text=f"Prob-cut τ={TAU_PROB} (P≥{target:.2f})",
             showarrow=False, yshift=25, font=dict(size=12)) if TAU_PROB is not None else {}
    ]
)
fig2.show()

# Baseline Configuration — OCR Threshold Optimization

### Experiment Metadata
| Setting | Value | Notes |
|----------|--------|-------|
| Date | 2025-10-21 | Baseline run |
| Version | v1.0 | Increment when re-running pipeline |
| Dataset size | n = 5000 | Simulated OCR batch |
| Drift config | Δτ = 0, Δslope = 0.0 | Used for new batch test |
| Train / Validation split | 80 / 20 | Time-based split |
| λ (review-cost weight) | 0.2 | Penalty term in utility function |


### Model Configuration
| Component | Setting | Description |
|------------|----------|-------------|
| Calibration algorithm | Isotonic Regression | Maps OCR score to calibrated P(correct) |
| Alternative model tested | Logistic Regression (Platt Scaling) | Simpler sigmoid calibration |
| Reviewer weighting | Beta-Bernoulli posterior mean | Weights reviewers by reliability |
| Threshold optimization | Utility = Accuracy – λ × ReviewRate | Searches thresholds 0–99 |
| Probability cutoff (optional) | P(correct) ≥ 0.95 | Defines stricter "auto-accept" mode |


### Results Summary
| Metric | Value | Meaning |
|---------|--------|---------|
| Best τ (utility-optimal) | 78 | Learned decision threshold |
| Accuracy at τ | ≈ 0.93 | Overall system accuracy |
| Review rate | ≈ 0.79 | Percentage routed to human review |
| Utility | ≈ 0.77 | Combined accuracy–cost score |
| Probability-cut τ | 99 | Risk-averse acceptance cutoff |
| Calibrator type used | Isotonic Regression | Chosen by lowest Brier/LogLoss |
| Reviewer reliabilities | A ≈ 0.95, B ≈ 0.85, C ≈ 0.70 | Estimated from audits |
| Validation utility loss | 0% | Perfect generalization on new batch |


### Artifacts
| File | Description |
|------|--------------|
| artifacts/calibrator.pkl | Saved calibration model |
| artifacts/threshold.json | Saved τ configuration |
| plots/utility_vs_threshold.html | Utility optimization visualization |
| plots/reliability_curve.html | Calibration reliability visualization |


### Next Steps
- Freeze this configuration as baseline v1.0
- Implement online updating (rolling 30-day recalibration)
- Add drift monitoring for τ, utility, and calibration loss
- Track reviewer reliability over time
- Run λ-sensitivity analysis (λ = 0.1 – 0.5)

need to have online udpates next


In [14]:
# Save Individual Figures as HTML Files
# ------------------------------------
# Run this cell after all your other cells

import os
figures_dir = "figures"
os.makedirs(figures_dir, exist_ok=True)

print("Saving individual figures as HTML files...")

# Save each figure with unique names
fig1.write_html(f"{figures_dir}/score_distribution.html")
print("✅ Saved: score_distribution.html")

fig2.write_html(f"{figures_dir}/accuracy_vs_score.html") 
print("✅ Saved: accuracy_vs_score.html")

# Note: You'll need to save 'fig' variables immediately after creating them
# since they get overwritten by later figures

print(f"\n🎉 Figures saved to the '{figures_dir}' directory!")

Saving individual figures as HTML files...
✅ Saved: score_distribution.html
✅ Saved: accuracy_vs_score.html

🎉 Figures saved to the 'figures' directory!


In [15]:
import numpy as np
import pandas as pd
import plotly.express as px

# Range of lambda values to test
lams = np.linspace(0, 1, 21)
records = []

for lam in lams:
    best = None
    best_u = -np.inf
    for tau in range(0, 100):
        preds = (df["score"] >= tau).astype(int)
        acc = (preds == df["is_correct"]).mean()
        review_rate = (df["score"] < tau).mean()
        u = acc - lam * review_rate
        if u > best_u:
            best_u, best = u, tau
    records.append({
        "lam": lam,
        "tau_opt": best,
        "utility": best_u,
        "review_rate": (df["score"] < best).mean(),
        "accuracy": (df["score"] >= best).astype(int).eq(df["is_correct"]).mean()
    })

df_lam = pd.DataFrame(records)

# Plot review rate vs λ
fig1 = px.line(
    df_lam, x="lam", y="review_rate", markers=True,
    title="Review Rate vs λ (Cost Penalty)",
    labels={"lam": "λ (Review Cost Weight)", "review_rate": "Review Rate"}
)
fig1.show()

# Optional: utility vs λ
fig2 = px.line(
    df_lam, x="lam", y="utility", markers=True,
    title="Utility vs λ (Trade-off Curve)",
    labels={"lam": "λ (Review Cost Weight)", "utility": "Utility"}
)
fig2.show()

In [16]:
# ROC Curve Analysis for OCR System
# ---------------------------------
# This creates a ROC curve to evaluate OCR performance across all thresholds

from sklearn.metrics import roc_curve, auc, roc_auc_score
import plotly.graph_objects as go
import numpy as np
import pandas as pd

def create_roc_curve(y_true, y_scores, title="ROC Curve for OCR System"):
    """
    Create a ROC curve for OCR system evaluation.
    
    Parameters:
    -----------
    y_true : array-like
        True labels (0 or 1)
    y_scores : array-like  
        OCR confidence scores
    title : str
        Title for the plot
        
    Returns:
    --------
    tuple: (fig, auc_score, optimal_threshold)
    """
    
    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    
    # Create ROC curve plot
    fig = go.Figure()
    
    # Add ROC curve
    fig.add_trace(go.Scatter(
        x=fpr, y=tpr,
        mode='lines',
        name=f'ROC Curve (AUC = {roc_auc:.3f})',
        line=dict(color='blue', width=3)
    ))
    
    # Add diagonal line (random classifier)
    fig.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='Random Classifier',
        line=dict(color='red', dash='dash', width=2)
    ))
    
    # Add optimal threshold point
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    fig.add_trace(go.Scatter(
        x=[fpr[optimal_idx]], y=[tpr[optimal_idx]],
        mode='markers',
        name=f'Optimal Threshold (τ={optimal_threshold:.0f})',
        marker=dict(color='green', size=12, symbol='star')
    ))
    
    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title='False Positive Rate (FPR)',
        yaxis_title='True Positive Rate (TPR)',
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        template='plotly_white',
        hovermode='x unified'
    )
    
    # Add annotations
    fig.add_annotation(
        x=0.6, y=0.2,
        text=f'AUC = {roc_auc:.3f}<br>Optimal τ = {optimal_threshold:.0f}',
        showarrow=True,
        arrowhead=2,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='black',
        borderwidth=1
    )
    
    return fig, roc_auc, optimal_threshold

# Example usage (run this in your notebook after creating val and best variables):

# ROC Curve Analysis
fig_roc, auc_score, optimal_threshold = create_roc_curve(
    val["is_correct"], 
    val["score"], 
    "ROC Curve for OCR System Performance"
)

fig_roc.show()

# Print performance metrics
print("=== ROC Curve Analysis ===")
print(f"AUC Score: {auc_score:.3f}")
print(f"Optimal Threshold: {optimal_threshold:.0f}")

# Compare with utility-optimized threshold
utility_threshold = best["tau"]
print(f"\nUtility-optimized threshold: {utility_threshold}")
print(f"ROC-optimized threshold: {optimal_threshold:.0f}")
print(f"Difference: {abs(utility_threshold - optimal_threshold):.0f} points")



=== ROC Curve Analysis ===
AUC Score: 0.975
Optimal Threshold: 72


TypeError: 'int' object is not subscriptable