From 37f8edeb31a53520e3b8691da1210616250017ea Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:23:03 -0700 Subject: [PATCH 1/6] Add GPU optimizations, profiling tooling, parallel tests, and benchmark restructuring - Wire fit_tree_gpu_native into training loop for ~7x GPU speedup - Add histogram subtraction in LevelWise growth (halves histogram work) - Cache GPU arrays in TreeStructure to avoid repeated to_device() calls - Skip loss computation when no callbacks registered - Add ProfilingCallback with structured JSON reports for improvement loops - Add pytest-xdist for parallel test execution (1.4x faster test suite) - Restructure benchmarks: replace scattered scripts with focused modules - Add shared test fixtures in conftest.py and new test files - Update CI workflow with fast/full/performance test stages Co-Authored-By: Claude Opus 4.6 --- .github/workflows/unit-tests.yml | 61 +- CLAUDE.md | 52 ++ benchmarks/check_performance.py | 177 ++++++ benchmarks/check_scaling.py | 119 ++++ benchmarks/{run.py => compare_cpu.py} | 10 +- benchmarks/compare_gpu.py | 551 ++++++++++++++++ benchmarks/ebm_benchmark.py | 502 --------------- benchmarks/ngboost_benchmark.py | 315 ---------- benchmarks/performance_report.py | 436 ------------- .../{modal_bench.py => profile_kernels.py} | 2 +- benchmarks/profile_loop.py | 100 +++ ...ml_integration.py => validate_datasets.py} | 14 +- benchmarks/xgboost_benchmark.py | 594 ------------------ pyproject.toml | 13 +- src/openboost/__init__.py | 2 + src/openboost/_core/_growth.py | 153 ++++- src/openboost/_models/_boosting.py | 113 +++- src/openboost/_profiler.py | 495 +++++++++++++++ tests/conftest.py | 188 ++++++ tests/test_binning_correctness.py | 170 +++++ tests/test_callbacks.py | 193 ++++++ tests/test_core.py | 225 +++---- tests/test_gam.py | 192 ++++++ tests/test_kernel_correctness.py | 475 ++++++++++++++ tests/test_linear_leaf.py | 180 ++++++ tests/test_loss_correctness.py | 298 +++++++++ tests/test_numerical_agreement.py 
| 268 ++++++++ uv.lock | 34 +- 28 files changed, 3875 insertions(+), 2057 deletions(-) create mode 100644 benchmarks/check_performance.py create mode 100644 benchmarks/check_scaling.py rename benchmarks/{run.py => compare_cpu.py} (98%) create mode 100644 benchmarks/compare_gpu.py delete mode 100644 benchmarks/ebm_benchmark.py delete mode 100644 benchmarks/ngboost_benchmark.py delete mode 100644 benchmarks/performance_report.py rename benchmarks/{modal_bench.py => profile_kernels.py} (99%) create mode 100644 benchmarks/profile_loop.py rename benchmarks/{openml_integration.py => validate_datasets.py} (98%) delete mode 100644 benchmarks/xgboost_benchmark.py create mode 100644 src/openboost/_profiler.py create mode 100644 tests/conftest.py create mode 100644 tests/test_binning_correctness.py create mode 100644 tests/test_callbacks.py create mode 100644 tests/test_gam.py create mode 100644 tests/test_kernel_correctness.py create mode 100644 tests/test_linear_leaf.py create mode 100644 tests/test_loss_correctness.py create mode 100644 tests/test_numerical_agreement.py diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index b7bb19c..62965f8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,4 +1,4 @@ -name: Unit Tests +name: Tests on: push: @@ -7,12 +7,13 @@ on: branches: [main] jobs: - test: + # Fast tests: <3 min, runs on every PR + fast-tests: runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.12"] steps: - uses: actions/checkout@v4 @@ -30,9 +31,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - # Install package with test and sklearn dependencies pip install -e ".[test,sklearn]" - # Also install xgboost for integration tests pip install "xgboost>=2.0" - name: Lint with ruff @@ -40,9 +39,59 @@ jobs: pip install "ruff>=0.4" ruff check src/openboost/ - - name: Run tests 
(CPU backend) + - name: Run fast tests (CPU backend) + env: + OPENBOOST_BACKEND: "cpu" + run: | + pytest tests/ -v --tb=short -m "not slow and not benchmark" + + # Full tests: includes slow tests, runs after fast tests pass + full-tests: + runs-on: ubuntu-latest + needs: fast-tests + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[test,sklearn]" + pip install "xgboost>=2.0" + + - name: Run all tests (CPU backend) env: OPENBOOST_BACKEND: "cpu" run: | pytest tests/ -v --tb=short + # Performance regression check: main branch only + performance-check: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + needs: full-tests + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[test,sklearn]" + pip install "xgboost>=2.0" + + - name: Performance regression check + env: + OPENBOOST_BACKEND: "cpu" + run: | + python benchmarks/check_performance.py diff --git a/CLAUDE.md b/CLAUDE.md index fba7f6e..afc6f10 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,3 +75,55 @@ DART, LinearLeaf growth strategies - **uv only** for package management — never `pip install` or `conda`. - All Numba-jitted functions use `@njit` or `@cuda.jit`. CPU kernels are in `_backends/_cpu.py`, CUDA in `_backends/_cuda.py`. - Test environment variable `OPENBOOST_BACKEND=cpu` forces CPU backend in CI. + +## Working Style + +### 1. Plan Mode Default +- Enter plan mode for ANY non-trivial task (3+ steps or architectural decisions) +- If something goes sideways, STOP and re-plan immediately -- don't keep pushing +- Use plan mode for verification steps, not just building +- Write detailed specs upfront to reduce ambiguity + +### 2. 
Subagent Strategy +- Use subagents liberally to keep main context window clean +- Offload research, exploration, and parallel analysis to subagents +- For complex problems, throw more compute at it via subagents +- One task per subagent for focused execution + +### 3. Self-Improvement Loop +- After ANY correction from the user: update `tasks/lessons.md` with the pattern +- Write rules for yourself that prevent the same mistake +- Ruthlessly iterate on these lessons until mistake rate drops +- Review lessons at session start for relevant project + +### 4. Verification Before Done +- Never mark a task complete without proving it works +- Diff behavior between main and your changes when relevant +- Ask yourself: "Would a staff engineer approve this?" +- Run tests, check logs, demonstrate correctness + +### 5. Demand Elegance (Balanced) +- For non-trivial changes: pause and ask "is there a more elegant way?" +- If a fix feels hacky: "Knowing everything I know now, implement the elegant solution" +- Skip this for simple, obvious fixes -- don't over-engineer +- Challenge your own work before presenting it + +### 6. Autonomous Bug Fixing +- When given a bug report: just fix it. Don't ask for hand-holding +- Point at logs, errors, failing tests -- then resolve them +- Zero context switching required from the user +- Go fix failing CI tests without being told how + +## Task Management + +1. **Plan First**: Write plan to `tasks/todo.md` with checkable items +2. **Verify Plan**: Check in before starting implementation +3. **Track Progress**: Mark items complete as you go +4. **Explain Changes**: High-level summary at each step +5. **Document Results**: Add review section to `tasks/todo.md` +6. **Capture Lessons**: Update `tasks/lessons.md` after corrections + +## Core Principles + +- **Simplicity First**: Make every change as simple as possible. Impact minimal code. +- **No Laziness**: Find root causes. No temporary fixes. Senior developer standards. 
diff --git a/benchmarks/check_performance.py b/benchmarks/check_performance.py new file mode 100644 index 0000000..bc8aa5c --- /dev/null +++ b/benchmarks/check_performance.py @@ -0,0 +1,177 @@ +"""Performance regression check for CI. + +Runs a fixed, small benchmark and compares against stored baselines. +Fails if any metric degrades by more than 20%. + +Usage: + uv run python benchmarks/check_performance.py + uv run python benchmarks/check_performance.py --update-baselines +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +import tracemalloc +from pathlib import Path + +import numpy as np + +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT / "src")) + +BASELINE_FILE = Path(__file__).parent / "results" / "performance_baselines.json" + +# Regression threshold: fail if metric exceeds baseline by this factor +REGRESSION_THRESHOLD = 1.20 # 20% + + +def _generate_data(n_samples=5000, n_features=10, seed=42): + """Generate fixed synthetic dataset.""" + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + + rng.randn(n_samples).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +def run_fixed_benchmark(): + """Run fixed benchmark and return results dict.""" + import openboost as ob + + X, y = _generate_data() + n_trees = 100 + max_depth = 6 + + # Measure fit time (median of 3 trials) + fit_times = [] + for _ in range(3): + model = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=0.1 + ) + t0 = time.perf_counter() + model.fit(X, y) + fit_times.append(time.perf_counter() - t0) + + # Measure predict time + predict_times = [] + for _ in range(3): + t0 = time.perf_counter() + model.predict(X) + predict_times.append(time.perf_counter() - t0) + + # Measure peak memory + tracemalloc.start() + model2 = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, 
learning_rate=0.1 + ) + model2.fit(X, y) + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + peak_mb = peak / 1024 / 1024 + + # Measure accuracy + pred = model.predict(X) + mse = float(np.mean((pred - y) ** 2)) + r2 = float(1 - np.sum((pred - y) ** 2) / np.sum((y - np.mean(y)) ** 2)) + + return { + "fit_time_median": float(sorted(fit_times)[1]), + "predict_time_median": float(sorted(predict_times)[1]), + "peak_memory_mb": float(peak_mb), + "mse": mse, + "r2": r2, + "n_samples": len(X), + "n_features": X.shape[1], + "n_trees": n_trees, + "max_depth": max_depth, + } + + +def save_baselines(results): + """Save results as new baselines.""" + BASELINE_FILE.parent.mkdir(parents=True, exist_ok=True) + with open(BASELINE_FILE, "w") as f: + json.dump(results, f, indent=2) + print(f"Baselines saved to {BASELINE_FILE}") + + +def load_baselines(): + """Load stored baselines.""" + with open(BASELINE_FILE) as f: + return json.load(f) + + +def check_regression(results, baselines): + """Compare results against baselines. 
Returns list of regressions.""" + regressions = [] + + # Time metrics: fail if current > baseline * threshold + for metric in ["fit_time_median", "predict_time_median"]: + if results[metric] > baselines[metric] * REGRESSION_THRESHOLD: + regressions.append( + f" {metric}: {results[metric]:.4f}s > " + f"{baselines[metric]:.4f}s * {REGRESSION_THRESHOLD} = " + f"{baselines[metric] * REGRESSION_THRESHOLD:.4f}s" + ) + + # Memory: fail if current > baseline * threshold + if results["peak_memory_mb"] > baselines["peak_memory_mb"] * REGRESSION_THRESHOLD: + regressions.append( + f" peak_memory_mb: {results['peak_memory_mb']:.2f}MB > " + f"{baselines['peak_memory_mb']:.2f}MB * {REGRESSION_THRESHOLD}" + ) + + # Accuracy: fail if MSE increases (model got worse) + if results["mse"] > baselines["mse"] * REGRESSION_THRESHOLD: + regressions.append( + f" mse: {results['mse']:.6f} > " + f"{baselines['mse']:.6f} * {REGRESSION_THRESHOLD}" + ) + + return regressions + + +def main(): + parser = argparse.ArgumentParser(description="Performance regression check") + parser.add_argument( + "--update-baselines", action="store_true", + help="Update baselines with current results" + ) + args = parser.parse_args() + + print("Running fixed benchmark...") + results = run_fixed_benchmark() + + print(f" fit_time: {results['fit_time_median']:.4f}s") + print(f" predict_time: {results['predict_time_median']:.4f}s") + print(f" peak_memory: {results['peak_memory_mb']:.2f}MB") + print(f" mse: {results['mse']:.6f}") + print(f" r2: {results['r2']:.4f}") + + if args.update_baselines: + save_baselines(results) + return + + if not BASELINE_FILE.exists(): + print(f"\nNo baselines found at {BASELINE_FILE}") + print("Run with --update-baselines to create them.") + save_baselines(results) + return + + baselines = load_baselines() + regressions = check_regression(results, baselines) + + if regressions: + print(f"\nPerformance regression detected ({REGRESSION_THRESHOLD:.0%} threshold):") + for r in regressions: + 
print(r) + sys.exit(1) + else: + print("\nNo performance regressions detected.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/check_scaling.py b/benchmarks/check_scaling.py new file mode 100644 index 0000000..47633c6 --- /dev/null +++ b/benchmarks/check_scaling.py @@ -0,0 +1,119 @@ +"""Scaling analysis for OpenBoost. + +Measures how training time scales with n_samples and n_features. +Computes scaling exponents to verify sub-quadratic behavior. + +Usage: + uv run python benchmarks/check_scaling.py + uv run python benchmarks/check_scaling.py --quick +""" + +from __future__ import annotations + +import argparse +import sys +import time +from pathlib import Path + +import numpy as np + +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT / "src")) + +import openboost as ob # noqa: E402 + + +def run_scaling_analysis(quick=False): + """Run scaling analysis across n_samples and n_features.""" + + if quick: + sample_grid = [1_000, 5_000, 10_000] + feature_grid = [10, 50] + n_trees = 20 + else: + sample_grid = [1_000, 5_000, 10_000, 50_000, 100_000] + feature_grid = [10, 50, 100] + n_trees = 50 + + max_depth = 6 + learning_rate = 0.1 + + results = [] + + print(f"{'n_samples':>10} {'n_features':>10} {'fit_time':>10} {'pred_time':>10}") + print("-" * 45) + + for n_features in feature_grid: + for n_samples in sample_grid: + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] + rng.randn(n_samples).astype(np.float32) * 0.1).astype(np.float32) + + # Fit timing (single run for large data, 3 runs for small) + trials = 3 if n_samples <= 10_000 else 1 + fit_times = [] + for _ in range(trials): + m = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=learning_rate + ) + t0 = time.perf_counter() + m.fit(X, y) + fit_times.append(time.perf_counter() - t0) + + # Predict timing + pred_times = [] + for _ in range(3): + t0 = time.perf_counter() + 
m.predict(X) + pred_times.append(time.perf_counter() - t0) + + fit_time = sorted(fit_times)[len(fit_times) // 2] + pred_time = sorted(pred_times)[1] + + results.append({ + "n_samples": n_samples, + "n_features": n_features, + "fit_time": fit_time, + "pred_time": pred_time, + }) + + print(f"{n_samples:>10} {n_features:>10} {fit_time:>10.4f}s {pred_time:>10.4f}s") + + # Compute scaling exponents + print("\n" + "=" * 50) + print("Scaling Exponents (log(time) = alpha * log(n_samples) + beta)") + print(" alpha ≈ 1.0: linear scaling (optimal)") + print(" alpha ≈ 1.5: O(n^1.5) (acceptable)") + print(" alpha ≈ 2.0: quadratic (bad)") + print("=" * 50) + + for n_features in feature_grid: + subset = [r for r in results if r["n_features"] == n_features] + if len(subset) < 3: + continue + + log_n = np.log([r["n_samples"] for r in subset]) + log_t = np.log([r["fit_time"] for r in subset]) + + # Linear regression: log_t = alpha * log_n + beta + alpha, beta = np.polyfit(log_n, log_t, 1) + + print(f"\n n_features={n_features}: alpha = {alpha:.2f}") + if alpha < 1.3: + print(" -> Near-linear scaling") + elif alpha < 1.7: + print(" -> Slightly super-linear") + else: + print(" -> WARNING: scaling appears quadratic or worse") + + +def main(): + parser = argparse.ArgumentParser(description="OpenBoost Scaling Analysis") + parser.add_argument("--quick", action="store_true", help="Quick mode with fewer data points") + args = parser.parse_args() + + run_scaling_analysis(quick=args.quick) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run.py b/benchmarks/compare_cpu.py similarity index 98% rename from benchmarks/run.py rename to benchmarks/compare_cpu.py index d7aa1ab..5da011f 100644 --- a/benchmarks/run.py +++ b/benchmarks/compare_cpu.py @@ -4,11 +4,11 @@ dependencies beyond the project's [bench] extras. 
Usage: - uv run python benchmarks/run.py - uv run python benchmarks/run.py --quick - uv run python benchmarks/run.py --task regression - uv run python benchmarks/run.py --trials 5 - uv run python benchmarks/run.py --n-samples 100000 + uv run python benchmarks/compare_cpu.py + uv run python benchmarks/compare_cpu.py --quick + uv run python benchmarks/compare_cpu.py --task regression + uv run python benchmarks/compare_cpu.py --trials 5 + uv run python benchmarks/compare_cpu.py --n-samples 100000 Options: --task Run a specific task: regression, binary, multiclass, all (default: all) diff --git a/benchmarks/compare_gpu.py b/benchmarks/compare_gpu.py new file mode 100644 index 0000000..2433df1 --- /dev/null +++ b/benchmarks/compare_gpu.py @@ -0,0 +1,551 @@ +"""GPU benchmarks: OpenBoost vs competitors on Modal A100. + +Three comparisons in one file: + 1. GradientBoosting vs XGBoost — regression, binary, multiclass, poisson + 2. NaturalBoost vs NGBoost — distributional GBDT (uncertainty) + 3. OpenBoostGAM vs InterpretML — interpretable models (GAM) + +Usage: + # Run everything on Modal A100 + uv run modal run benchmarks/compare_gpu.py + + # Run a single comparison + uv run modal run benchmarks/compare_gpu.py --bench xgboost + uv run modal run benchmarks/compare_gpu.py --bench ngboost + uv run modal run benchmarks/compare_gpu.py --bench ebm + + # Run locally (CPU, smaller data) + uv run python benchmarks/compare_gpu.py --local + uv run python benchmarks/compare_gpu.py --local --bench ngboost +""" + +from __future__ import annotations + +import json +import time +from pathlib import Path + +PROJECT_ROOT = Path(__file__).parent.parent + +try: + import modal + + app = modal.App("openboost-gpu-bench") + + image = ( + modal.Image.from_registry( + "nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12" + ) + .pip_install( + "numpy>=1.24", + "numba>=0.60", + "scikit-learn>=1.0", + "xgboost>=2.0", + "ngboost>=0.5", + "interpret>=0.6", + ) + .add_local_dir( + str(PROJECT_ROOT / 
"src" / "openboost"), + remote_path="/root/openboost", + ) + ) +except ImportError: + modal = None + app = None + image = None + + +# ============================================================================= +# Data generators +# ============================================================================= + + +def _generate_regression(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = ( + np.sin(X[:, 0] * 2) + + 0.5 * X[:, 1] ** 2 + + 0.3 * X[:, 2] * X[:, 3] + + 0.1 * rng.randn(n_samples) + ).astype(np.float32) + return X, y + + +def _generate_binary(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (rng.rand(n_samples) < 1 / (1 + np.exp(-logits))).astype(np.float32) + return X, y + + +def _generate_multiclass(n_samples, n_features=20, n_classes=5, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + scores = np.zeros((n_samples, n_classes)) + for k in range(n_classes): + scores[:, k] = X[:, k % n_features] + 0.5 * X[:, (k + 1) % n_features] + y = np.argmax(scores + 0.5 * rng.randn(n_samples, n_classes), axis=1) + return X, y.astype(np.int32) + + +def _generate_poisson(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + log_mu = 1.0 + 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] + mu = np.exp(np.clip(log_mu, -5, 5)) + y = rng.poisson(mu).astype(np.float32) + return X, y + + +def _train_test_split(X, y, test_size=0.2, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + n = len(y) + idx = rng.permutation(n) + split = int(n * (1 - test_size)) + return X[idx[:split]], X[idx[split:]], y[idx[:split]], 
y[idx[split:]] + + +# ============================================================================= +# Timing helper +# ============================================================================= + + +def _time_fit(create_model_fn, X, y, n_trials=3, sync_gpu=False): + """Time model.fit() over n_trials, return (median_time, last_fitted_model).""" + times = [] + model = None + for _ in range(n_trials): + model = create_model_fn() + t0 = time.perf_counter() + model.fit(X, y) + if sync_gpu: + from numba import cuda + cuda.synchronize() + times.append(time.perf_counter() - t0) + times.sort() + return times[len(times) // 2], model + + +# ============================================================================= +# Benchmark 1: GradientBoosting vs XGBoost +# ============================================================================= + + +def bench_xgboost(n_samples=50_000, n_trees=100, max_depth=6, use_gpu=False): + """Compare OpenBoost vs XGBoost across tasks.""" + import numpy as np + from sklearn.metrics import accuracy_score, r2_score + + import openboost as ob + + sync = use_gpu + + # Warmup JIT — two iterations to ensure all CUDA kernels are compiled + X_w, y_w = _generate_regression(500) + for _ in range(2): + ob.GradientBoosting(n_trees=3, max_depth=max_depth).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + xgb_device = "cuda" if use_gpu else "cpu" + + results = {} + + tasks = [ + ("regression", _generate_regression, "mse", "R²"), + ("binary", _generate_binary, "logloss", "AUC"), + ("multiclass", _generate_multiclass, "softmax", "Accuracy"), + ("poisson", _generate_poisson, "poisson", "Deviance"), + ] + + for task_name, gen_fn, ob_loss, metric_label in tasks: + X, y = gen_fn(n_samples) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # --- OpenBoost --- + if task_name == "multiclass": + n_classes = len(np.unique(y)) + ob_time, ob_model = _time_fit( + lambda nc=n_classes: ob.MultiClassGradientBoosting( + 
n_classes=nc, n_trees=n_trees, max_depth=max_depth, + learning_rate=0.1, + ), + X_train, y_train, sync_gpu=sync, + ) + ob_pred = np.argmax(ob_model.predict_proba(X_test), axis=1) + else: + ob_time, ob_model = _time_fit( + lambda loss=ob_loss: ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=0.1, + loss=loss, + ), + X_train, y_train, sync_gpu=sync, + ) + ob_pred = ob_model.predict(X_test) + + # --- XGBoost --- + import xgboost as xgb + + if task_name == "regression": + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBRegressor( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + elif task_name == "poisson": + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBRegressor( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, objective="count:poisson", + verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + else: + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBClassifier( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + + # --- Metrics --- + if task_name == "regression": + ob_metric = r2_score(y_test, ob_pred) + xgb_metric = r2_score(y_test, xgb_pred) + elif task_name == "poisson": + ob_exp = np.exp(ob_pred) + ob_metric = float(np.mean(ob_exp - y_test * np.log(np.maximum(ob_exp, 1e-8)))) + xgb_metric = float(np.mean(xgb_pred - y_test * np.log(np.maximum(xgb_pred, 1e-8)))) + elif task_name == "binary": + ob_labels = (ob_pred > 0).astype(float) if np.any(ob_pred < 0) else ob_pred + ob_metric = accuracy_score(y_test, ob_labels) + xgb_metric = accuracy_score(y_test, xgb_pred) + else: + ob_metric = accuracy_score(y_test, ob_pred) + xgb_metric = accuracy_score(y_test, xgb_pred) + + speedup = 
xgb_time / ob_time + results[task_name] = { + "ob_time": ob_time, "xgb_time": xgb_time, "speedup": speedup, + "ob_metric": float(ob_metric), "xgb_metric": float(xgb_metric), + "metric_label": metric_label, + } + + # Print results + print(f"\n{'='*70}") + print(f" OpenBoost vs XGBoost | {n_samples:,} samples, {n_trees} trees, depth {max_depth}") + print(f" Device: {'GPU' if use_gpu else 'CPU'}") + print(f"{'='*70}") + print(f" {'Task':<14} {'OB (s)':<10} {'XGB (s)':<10} {'Speedup':<10} {'OB metric':<12} {'XGB metric':<12}") + print(f" {'─'*66}") + for task_name, r in results.items(): + faster = "OB" if r["speedup"] > 1 else "XGB" + print( + f" {task_name:<14} {r['ob_time']:<10.3f} {r['xgb_time']:<10.3f} " + f"{r['speedup']:.2f}x {faster:<4} {r['ob_metric']:<12.4f} {r['xgb_metric']:<12.4f}" + ) + + return results + + +# ============================================================================= +# Benchmark 2: NaturalBoost vs NGBoost +# ============================================================================= + + +def bench_ngboost(n_samples=10_000, n_trees=100, use_gpu=False): + """Compare NaturalBoost vs NGBoost (distributional GBDT).""" + import numpy as np + from sklearn.datasets import fetch_california_housing + + import openboost as ob + + sync = use_gpu + + # Warmup + X_w, y_w = _generate_regression(500) + ob.NaturalBoostNormal(n_trees=3, max_depth=3, learning_rate=0.1).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + results = {} + + # --- Synthetic data --- + for n in [n_samples]: + X, y = _generate_regression(n) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # NGBoost + from ngboost import NGBRegressor + from ngboost.distns import Normal + + ngb = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) + t0 = time.perf_counter() + ngb.fit(X_train, y_train) + ngb_time = time.perf_counter() - t0 + + ngb_dist = ngb.pred_dist(X_test) + ngb_nll = float(-ngb_dist.logpdf(y_test).mean()) + 
ngb_lower = ngb_dist.ppf(0.05) + ngb_upper = ngb_dist.ppf(0.95) + ngb_coverage = float(np.mean((y_test >= ngb_lower) & (y_test <= ngb_upper))) + + # NaturalBoost + nb = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) + t0 = time.perf_counter() + nb.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + nb_time = time.perf_counter() - t0 + + nb_nll = float(nb.nll(X_test, y_test)) + nb_lower, nb_upper = nb.predict_interval(X_test, alpha=0.1) + nb_coverage = float(np.mean((y_test >= nb_lower) & (y_test <= nb_upper))) + + results[f"synthetic_{n}"] = { + "ngb_time": ngb_time, "nb_time": nb_time, + "speedup": ngb_time / nb_time, + "ngb_nll": ngb_nll, "nb_nll": nb_nll, + "ngb_coverage": ngb_coverage, "nb_coverage": nb_coverage, + } + + # --- California Housing --- + data = fetch_california_housing() + X = data.data.astype(np.float32) + y = data.target.astype(np.float32) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + ngb = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) + t0 = time.perf_counter() + ngb.fit(X_train, y_train) + ngb_time = time.perf_counter() - t0 + ngb_nll = float(-ngb.pred_dist(X_test).logpdf(y_test).mean()) + + nb = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) + t0 = time.perf_counter() + nb.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + nb_time = time.perf_counter() - t0 + nb_nll = float(nb.nll(X_test, y_test)) + + results["california_housing"] = { + "ngb_time": ngb_time, "nb_time": nb_time, + "speedup": ngb_time / nb_time, + "ngb_nll": ngb_nll, "nb_nll": nb_nll, + } + + # Print + print(f"\n{'='*70}") + print(f" NaturalBoost vs NGBoost | {n_trees} trees, Normal distribution") + print(f"{'='*70}") + print(f" {'Dataset':<22} {'NGBoost (s)':<14} {'NatBoost (s)':<14} {'Speedup':<10} {'NGB NLL':<10} {'NB NLL':<10}") + print(f" {'─'*78}") + for name, r in results.items(): + faster = "NB" if r["speedup"] 
> 1 else "NGB" + print( + f" {name:<22} {r['ngb_time']:<14.2f} {r['nb_time']:<14.2f} " + f"{r['speedup']:.2f}x {faster:<4} {r['ngb_nll']:<10.4f} {r['nb_nll']:<10.4f}" + ) + + return results + + +# ============================================================================= +# Benchmark 3: OpenBoostGAM vs InterpretML EBM +# ============================================================================= + + +def bench_ebm(n_samples=50_000, n_rounds=200, use_gpu=False): + """Compare OpenBoostGAM vs InterpretML EBM.""" + from sklearn.metrics import r2_score + + from openboost import OpenBoostGAM + + sync = use_gpu + + # Warmup + X_w, y_w = _generate_regression(500) + OpenBoostGAM(n_rounds=10).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + results = {} + + for n in [n_samples]: + X, y = _generate_regression(n) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # OpenBoostGAM + gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) + t0 = time.perf_counter() + gam.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + gam_time = time.perf_counter() - t0 + gam_r2 = float(r2_score(y_test, gam.predict(X_test))) + + # InterpretML EBM + from interpret.glassbox import ExplainableBoostingRegressor + + ebm = ExplainableBoostingRegressor( + max_rounds=n_rounds, learning_rate=0.05, + outer_bags=1, inner_bags=0, interactions=0, n_jobs=-1, + ) + t0 = time.perf_counter() + ebm.fit(X_train, y_train) + ebm_time = time.perf_counter() - t0 + ebm_r2 = float(r2_score(y_test, ebm.predict(X_test))) + + results[f"synthetic_{n}"] = { + "gam_time": gam_time, "ebm_time": ebm_time, + "speedup": ebm_time / gam_time, + "gam_r2": gam_r2, "ebm_r2": ebm_r2, + } + + # Print + print(f"\n{'='*70}") + print(f" OpenBoostGAM vs InterpretML EBM | {n_rounds} rounds") + print(f"{'='*70}") + print(f" {'Dataset':<22} {'GAM (s)':<12} {'EBM (s)':<12} {'Speedup':<10} {'GAM R²':<10} {'EBM R²':<10}") + print(f" {'─'*74}") + for name, r in 
results.items(): + print( + f" {name:<22} {r['gam_time']:<12.2f} {r['ebm_time']:<12.2f} " + f"{r['speedup']:.1f}x {r['gam_r2']:<10.4f} {r['ebm_r2']:<10.4f}" + ) + + return results + + +# ============================================================================= +# Run all benchmarks +# ============================================================================= + + +def run_all(use_gpu=False, bench=None, n_samples=None): + """Run selected or all benchmarks.""" + import sys + if use_gpu: + sys.path.insert(0, "/root") + + import openboost as ob + + if use_gpu: + ob.set_backend("cuda") + + print(f"OpenBoost backend: {ob.get_backend()}") + if use_gpu: + from numba import cuda + gpu_name = cuda.get_current_device().name + if isinstance(gpu_name, bytes): + gpu_name = gpu_name.decode() + print(f"GPU: {gpu_name}") + + all_results = {} + benches = [bench] if bench else ["xgboost", "ngboost", "ebm"] + + if "xgboost" in benches: + n = n_samples or (50_000 if use_gpu else 20_000) + all_results["xgboost"] = bench_xgboost(n_samples=n, use_gpu=use_gpu) + + if "ngboost" in benches: + try: + import ngboost # noqa: F401 + n = n_samples or (10_000 if not use_gpu else 50_000) + all_results["ngboost"] = bench_ngboost(n_samples=n, use_gpu=use_gpu) + except ImportError: + print("\n ngboost not installed, skipping. Install: pip install ngboost") + + if "ebm" in benches: + try: + import interpret # noqa: F401 + n = n_samples or (50_000 if use_gpu else 10_000) + all_results["ebm"] = bench_ebm(n_samples=n, use_gpu=use_gpu) + except ImportError: + print("\n interpret not installed, skipping. 
Install: pip install interpret") + + return all_results + + +# ============================================================================= +# Modal entry points +# ============================================================================= + +if modal is not None and app is not None: + + @app.function(gpu="A100", image=image, timeout=3600) + def _run_on_gpu(bench=None, n_samples=None): + return run_all(use_gpu=True, bench=bench, n_samples=n_samples) + + @app.local_entrypoint() + def main(bench: str = None, n_samples: int = None): + """Run benchmarks on Modal A100.""" + if bench: + print(f"Running '{bench}' benchmark on Modal A100...") + else: + print("Running all benchmarks on Modal A100...") + + results = _run_on_gpu.remote(bench=bench, n_samples=n_samples) + + # Save results + results_dir = PROJECT_ROOT / "benchmarks" / "results" + results_dir.mkdir(exist_ok=True) + timestamp = time.strftime("%Y%m%d_%H%M%S") + out_file = results_dir / f"gpu_benchmark_{timestamp}.json" + with open(out_file, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {out_file}") + + +# ============================================================================= +# Local execution +# ============================================================================= + +if __name__ == "__main__": + import argparse + import sys + + parser = argparse.ArgumentParser(description="OpenBoost GPU Benchmarks") + parser.add_argument("--local", action="store_true", help="Run locally on CPU") + parser.add_argument( + "--bench", choices=["xgboost", "ngboost", "ebm"], + help="Run a single benchmark (default: all)", + ) + parser.add_argument("--n-samples", type=int, help="Override dataset size") + args = parser.parse_args() + + if args.local: + sys.path.insert(0, str(PROJECT_ROOT / "src")) + run_all(use_gpu=False, bench=args.bench, n_samples=args.n_samples) + else: + print("Usage:") + print(" Modal: uv run modal run benchmarks/compare_gpu.py") + print(" Modal: uv run modal run 
benchmarks/compare_gpu.py --bench xgboost") + print(" Local: uv run python benchmarks/compare_gpu.py --local") + print(" Local: uv run python benchmarks/compare_gpu.py --local --bench ngboost") diff --git a/benchmarks/ebm_benchmark.py b/benchmarks/ebm_benchmark.py deleted file mode 100644 index 29c31d0..0000000 --- a/benchmarks/ebm_benchmark.py +++ /dev/null @@ -1,502 +0,0 @@ -"""Benchmark: OpenBoost GPU-GAM vs InterpretML EBM. - -Run locally: - cd openboost - uv run python benchmarks/ebm_benchmark.py --local - -Run on Modal (cloud A100): - cd openboost - uv run modal run benchmarks/ebm_benchmark.py -""" - -from __future__ import annotations - -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-ebm-bench") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "cupy-cuda12x>=13.0", - "scikit-learn>=1.0", - "interpret>=0.6", - "xgboost>=2.0", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -def generate_data(n_samples: int, n_features: int, task: str = "regression"): - """Generate synthetic data for benchmarking.""" - import numpy as np - - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - - if task == "regression": - # True model: additive (perfect for GAM) - y = ( - np.sin(X[:, 0] * 2) + # Non-linear effect - 0.5 * X[:, 1] + # Linear effect - np.where(X[:, 2] > 0, 0.3, -0.3) + # Step function - 0.1 * np.random.randn(n_samples) # Noise - ).astype(np.float32) - else: - # Classification - logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] - probs = 1 / (1 + np.exp(-logits)) - y = (np.random.rand(n_samples) < probs).astype(np.float32) - - return X, y - - -def benchmark_gam_vs_ebm( - n_samples: int = 100_000, - n_features: int = 20, - 
n_rounds: int = 500, -): - """Compare OpenBoost GPU-GAM vs InterpretML EBM. - - Args: - n_samples: Number of training samples - n_features: Number of features - n_rounds: Number of boosting rounds (for fair comparison) - - Returns: - Benchmark results dict - """ - import sys - sys.path.insert(0, "/root") - - import numpy as np - import time - from sklearn.metrics import mean_squared_error, r2_score - from sklearn.model_selection import train_test_split - - print("=" * 60) - print("OpenBoost GPU-GAM vs InterpretML EBM Benchmark") - print("=" * 60) - - # Check GPU - try: - from numba import cuda - print(f"GPU: {cuda.get_current_device().name}") - except Exception as e: - print(f"GPU not available: {e}") - - # Generate data - print(f"\nGenerating data: {n_samples:,} samples × {n_features} features") - X, y = generate_data(n_samples, n_features, task="regression") - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - results = { - "n_samples": n_samples, - "n_features": n_features, - "n_rounds": n_rounds, - } - - # ========================================================================= - # 1. OpenBoost GPU-GAM - # ========================================================================= - print("\n" + "-" * 40) - print("1. 
OpenBoost GPU-GAM") - print("-" * 40) - - try: - import openboost as ob - from openboost import OpenBoostGAM - - print(f"Backend: {ob.get_backend()}") - - gam = OpenBoostGAM( - n_rounds=n_rounds, - learning_rate=0.05, - reg_lambda=1.0, - loss='mse', - ) - - # Warmup (JIT compilation) - gam_warmup = OpenBoostGAM(n_rounds=10, learning_rate=0.1) - gam_warmup.fit(X_train[:1000], y_train[:1000]) - cuda.synchronize() - - # Benchmark training - start = time.perf_counter() - gam.fit(X_train, y_train) - cuda.synchronize() - ob_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_ob = gam.predict(X_test) - cuda.synchronize() - ob_pred_time = time.perf_counter() - start - - ob_mse = mean_squared_error(y_test, y_pred_ob) - ob_r2 = r2_score(y_test, y_pred_ob) - - print(f"Train time: {ob_train_time:.3f}s") - print(f"Predict time: {ob_pred_time*1000:.2f}ms") - print(f"MSE: {ob_mse:.6f}") - print(f"R²: {ob_r2:.4f}") - - results["openboost_gam"] = { - "train_time_s": ob_train_time, - "predict_time_ms": ob_pred_time * 1000, - "mse": ob_mse, - "r2": ob_r2, - } - - except Exception as e: - print(f"OpenBoost GAM failed: {e}") - import traceback - traceback.print_exc() - results["openboost_gam"] = {"error": str(e)} - - # ========================================================================= - # 2. InterpretML EBM - # ========================================================================= - print("\n" + "-" * 40) - print("2. 
InterpretML EBM") - print("-" * 40) - - try: - from interpret.glassbox import ExplainableBoostingRegressor - - # EBM with comparable settings - # Note: EBM's "outer_bags" and "inner_bags" add bagging overhead - # For fair comparison, we disable some features - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - min_samples_leaf=2, - max_bins=256, - outer_bags=1, # Disable bagging for speed comparison - inner_bags=0, - interactions=0, # No pairwise interactions (pure GAM) - n_jobs=-1, # Use all CPU cores - ) - - # Benchmark training - start = time.perf_counter() - ebm.fit(X_train, y_train) - ebm_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_ebm = ebm.predict(X_test) - ebm_pred_time = time.perf_counter() - start - - ebm_mse = mean_squared_error(y_test, y_pred_ebm) - ebm_r2 = r2_score(y_test, y_pred_ebm) - - print(f"Train time: {ebm_train_time:.3f}s") - print(f"Predict time: {ebm_pred_time*1000:.2f}ms") - print(f"MSE: {ebm_mse:.6f}") - print(f"R²: {ebm_r2:.4f}") - - results["interpretml_ebm"] = { - "train_time_s": ebm_train_time, - "predict_time_ms": ebm_pred_time * 1000, - "mse": ebm_mse, - "r2": ebm_r2, - } - - except Exception as e: - print(f"InterpretML EBM failed: {e}") - import traceback - traceback.print_exc() - results["interpretml_ebm"] = {"error": str(e)} - - # ========================================================================= - # 3. XGBoost (baseline, non-interpretable) - # ========================================================================= - print("\n" + "-" * 40) - print("3. 
XGBoost (baseline)") - print("-" * 40) - - try: - import xgboost as xgb - - xgb_model = xgb.XGBRegressor( - n_estimators=n_rounds, - learning_rate=0.05, - max_depth=6, - tree_method="hist", - device="cuda", - n_jobs=-1, - ) - - # Benchmark training - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - cuda.synchronize() - xgb_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - cuda.synchronize() - xgb_pred_time = time.perf_counter() - start - - xgb_mse = mean_squared_error(y_test, y_pred_xgb) - xgb_r2 = r2_score(y_test, y_pred_xgb) - - print(f"Train time: {xgb_train_time:.3f}s") - print(f"Predict time: {xgb_pred_time*1000:.2f}ms") - print(f"MSE: {xgb_mse:.6f}") - print(f"R²: {xgb_r2:.4f}") - - results["xgboost"] = { - "train_time_s": xgb_train_time, - "predict_time_ms": xgb_pred_time * 1000, - "mse": xgb_mse, - "r2": xgb_r2, - } - - except Exception as e: - print(f"XGBoost failed: {e}") - results["xgboost"] = {"error": str(e)} - - # ========================================================================= - # Summary - # ========================================================================= - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - - def safe_get(d, key, default="N/A"): - if "error" in d: - return default - return d.get(key, default) - - print(f"\n{'Model':<25} {'Train (s)':<12} {'Predict (ms)':<14} {'R²':<10}") - print("-" * 60) - - for name, key in [ - ("OpenBoost GPU-GAM", "openboost_gam"), - ("InterpretML EBM", "interpretml_ebm"), - ("XGBoost (GPU)", "xgboost"), - ]: - d = results.get(key, {}) - train = safe_get(d, "train_time_s") - pred = safe_get(d, "predict_time_ms") - r2 = safe_get(d, "r2") - - train_str = f"{train:.3f}" if isinstance(train, float) else train - pred_str = f"{pred:.2f}" if isinstance(pred, float) else pred - r2_str = f"{r2:.4f}" if isinstance(r2, float) else r2 - - print(f"{name:<25} {train_str:<12} {pred_str:<14} 
{r2_str:<10}") - - # Speedup calculation - if ("openboost_gam" in results and "interpretml_ebm" in results and - "error" not in results["openboost_gam"] and "error" not in results["interpretml_ebm"]): - speedup = results["interpretml_ebm"]["train_time_s"] / results["openboost_gam"]["train_time_s"] - print(f"\nOpenBoost GPU-GAM is {speedup:.1f}x faster than InterpretML EBM") - results["speedup_vs_ebm"] = speedup - - return results - - -def benchmark_scaling(max_samples: int = 1_000_000): - """Benchmark how both scale with data size.""" - import sys - sys.path.insert(0, "/root") - - import numpy as np - import time - from numba import cuda - - print("=" * 60) - print("Scaling Benchmark: OpenBoost GPU-GAM vs InterpretML EBM") - print("=" * 60) - print(f"GPU: {cuda.get_current_device().name}") - - import openboost as ob - from openboost import OpenBoostGAM - from interpret.glassbox import ExplainableBoostingRegressor - - n_features = 20 - n_rounds = 200 - - # Warmup - X_warm, y_warm = generate_data(1000, n_features) - OpenBoostGAM(n_rounds=10).fit(X_warm, y_warm) - cuda.synchronize() - - results = [] - - for n_samples in [10_000, 50_000, 100_000, 500_000, max_samples]: - if n_samples > max_samples: - break - - print(f"\n--- {n_samples:,} samples ---") - - X, y = generate_data(n_samples, n_features) - row = {"n_samples": n_samples} - - # OpenBoost GPU-GAM - try: - gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X, y) - cuda.synchronize() - row["openboost_time"] = time.perf_counter() - start - print(f" OpenBoost GPU-GAM: {row['openboost_time']:.2f}s") - except Exception as e: - print(f" OpenBoost failed: {e}") - row["openboost_time"] = None - - # InterpretML EBM (only for smaller sizes due to time) - if n_samples <= 100_000: - try: - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X, 
y) - row["ebm_time"] = time.perf_counter() - start - print(f" InterpretML EBM: {row['ebm_time']:.2f}s") - except Exception as e: - print(f" EBM failed: {e}") - row["ebm_time"] = None - else: - print(f" InterpretML EBM: (skipped - too slow)") - row["ebm_time"] = None - - if row.get("openboost_time") and row.get("ebm_time"): - row["speedup"] = row["ebm_time"] / row["openboost_time"] - print(f" Speedup: {row['speedup']:.1f}x") - - results.append(row) - - print("\n" + "=" * 60) - print("SCALING SUMMARY") - print("=" * 60) - print(f"\n{'Samples':<12} {'OpenBoost (s)':<15} {'EBM (s)':<12} {'Speedup':<10}") - print("-" * 50) - for r in results: - ob_str = f"{r['openboost_time']:.2f}" if r.get('openboost_time') else "N/A" - ebm_str = f"{r['ebm_time']:.2f}" if r.get('ebm_time') else "N/A" - sp_str = f"{r['speedup']:.1f}x" if r.get('speedup') else "N/A" - print(f"{r['n_samples']:<12,} {ob_str:<15} {ebm_str:<12} {sp_str:<10}") - - return results - - -if modal is not None and app is not None: - _benchmark_gam_vs_ebm_modal = app.function(gpu="A100", image=image, timeout=1800)(benchmark_gam_vs_ebm) - _benchmark_scaling_modal = app.function(gpu="A100", image=image, timeout=3600)(benchmark_scaling) - - @app.local_entrypoint() - def main(): - """Run benchmarks on Modal.""" - print("Running GAM vs EBM benchmark on Modal A100...") - - results = _benchmark_gam_vs_ebm_modal.remote( - n_samples=100_000, - n_features=20, - n_rounds=500, - ) - - print("\n\nFinal Results:") - print(results) - - -# For local execution without Modal -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally...") - - import numpy as np - import time - from sklearn.metrics import r2_score - from sklearn.model_selection import train_test_split - - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - n_samples = 50_000 - n_features = 20 - n_rounds = 200 - - X, y = generate_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split( 
- X, y, test_size=0.2, random_state=42 - ) - - print(f"Data: {n_samples:,} samples x {n_features} features") - - # OpenBoost GPU-GAM - try: - import openboost as ob - from openboost import OpenBoostGAM - - print(f"\nOpenBoost backend: {ob.get_backend()}") - - # Warmup JIT - OpenBoostGAM(n_rounds=10, learning_rate=0.1).fit( - X_train[:1000], y_train[:1000] - ) - - gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X_train, y_train) - train_time = time.perf_counter() - start - - y_pred = gam.predict(X_test) - print(f"OpenBoost GPU-GAM: {train_time:.2f}s, R²={r2_score(y_test, y_pred):.4f}") - except Exception as e: - print(f"OpenBoost failed: {e}") - import traceback - traceback.print_exc() - - # InterpretML EBM - try: - from interpret.glassbox import ExplainableBoostingRegressor - - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X_train, y_train) - train_time = time.perf_counter() - start - - y_pred = ebm.predict(X_test) - print(f"InterpretML EBM: {train_time:.2f}s, R²={r2_score(y_test, y_pred):.4f}") - except ImportError: - print("InterpretML not installed. Run: pip install interpret") - except Exception as e: - print(f"EBM failed: {e}") - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/ebm_benchmark.py") - print(" Local: uv run python benchmarks/ebm_benchmark.py --local") - diff --git a/benchmarks/ngboost_benchmark.py b/benchmarks/ngboost_benchmark.py deleted file mode 100644 index e8b827b..0000000 --- a/benchmarks/ngboost_benchmark.py +++ /dev/null @@ -1,315 +0,0 @@ -"""Benchmark: OpenBoost NaturalBoost vs Official NGBoost. - -Compare: -1. Training speed -2. Prediction speed -3. NLL (negative log-likelihood) - prediction quality -4. 
Calibration of prediction intervals - -Usage: - uv run python benchmarks/ngboost_benchmark.py -""" - -from __future__ import annotations - -import sys -import time -from pathlib import Path - -import numpy as np -from sklearn.datasets import fetch_california_housing, make_regression -from sklearn.model_selection import train_test_split - -PROJECT_ROOT = Path(__file__).parent.parent -sys.path.insert(0, str(PROJECT_ROOT / "src")) - - -def benchmark_synthetic(n_samples: int = 10000, n_features: int = 20, n_trees: int = 100): - """Benchmark on synthetic data.""" - print(f"\n{'='*70}") - print(f"SYNTHETIC DATA: {n_samples:,} samples, {n_features} features, {n_trees} trees") - print('='*70) - - # Generate data - X, y = make_regression(n_samples=n_samples, n_features=n_features, noise=10, random_state=42) - X = X.astype(np.float32) - y = y.astype(np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - results = {} - - # --- Official NGBoost --- - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - print("\n[Official NGBoost]") - model_official = NGBRegressor( - Dist=Normal, - n_estimators=n_trees, - learning_rate=0.1, - verbose=False, - ) - - t0 = time.perf_counter() - model_official.fit(X_train, y_train) - train_time_official = time.perf_counter() - t0 - - t0 = time.perf_counter() - pred_official = model_official.predict(X_test) - pred_time_official = time.perf_counter() - t0 - - # Get distribution params for NLL - dist_official = model_official.pred_dist(X_test) - nll_official = -dist_official.logpdf(y_test).mean() - - # Prediction intervals - lower_official = dist_official.ppf(0.05) - upper_official = dist_official.ppf(0.95) - coverage_official = np.mean((y_test >= lower_official) & (y_test <= upper_official)) - - rmse_official = np.sqrt(np.mean((pred_official - y_test)**2)) - - results['official'] = { - 'train_time': train_time_official, - 'pred_time': pred_time_official, - 'nll': 
nll_official, - 'rmse': rmse_official, - 'coverage_90': coverage_official, - } - - print(f" Train time: {train_time_official:.2f}s") - print(f" Pred time: {pred_time_official*1000:.1f}ms") - print(f" NLL: {nll_official:.4f}") - print(f" RMSE: {rmse_official:.4f}") - print(f" 90% coverage: {coverage_official:.1%}") - - except Exception as e: - print(f" Error: {e}") - results['official'] = None - - # --- OpenBoost NaturalBoost --- - try: - import openboost as ob - - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - print("\n[OpenBoost NaturalBoost]") - model_openboost = ob.NaturalBoostNormal( - n_trees=n_trees, - learning_rate=0.1, - max_depth=3, - ) - - t0 = time.perf_counter() - model_openboost.fit(X_train, y_train) - train_time_openboost = time.perf_counter() - t0 - - t0 = time.perf_counter() - pred_openboost = model_openboost.predict(X_test) - pred_time_openboost = time.perf_counter() - t0 - - # NLL - nll_openboost = model_openboost.score(X_test, y_test) - if hasattr(nll_openboost, 'mean'): - nll_openboost = nll_openboost.mean() - - # Prediction intervals - lower_openboost, upper_openboost = model_openboost.predict_interval(X_test, alpha=0.1) - coverage_openboost = np.mean((y_test >= lower_openboost) & (y_test <= upper_openboost)) - - rmse_openboost = np.sqrt(np.mean((pred_openboost - y_test)**2)) - - results['openboost'] = { - 'train_time': train_time_openboost, - 'pred_time': pred_time_openboost, - 'nll': nll_openboost, - 'rmse': rmse_openboost, - 'coverage_90': coverage_openboost, - } - - print(f" Train time: {train_time_openboost:.2f}s") - print(f" Pred time: {pred_time_openboost*1000:.1f}ms") - print(f" NLL: {nll_openboost:.4f}") - print(f" RMSE: {rmse_openboost:.4f}") - print(f" 90% coverage: {coverage_openboost:.1%}") - - except Exception as e: - print(f" Error: {e}") - import traceback - traceback.print_exc() - results['openboost'] = None - - # --- Comparison --- - if 
results.get('official') and results.get('openboost'): - print("\n[Comparison]") - speedup = results['official']['train_time'] / results['openboost']['train_time'] - print(f" Training speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(NGBoost faster)'}") - - pred_speedup = results['official']['pred_time'] / results['openboost']['pred_time'] - print(f" Prediction speedup: {pred_speedup:.2f}x") - - nll_diff = results['openboost']['nll'] - results['official']['nll'] - print(f" NLL difference: {nll_diff:+.4f} {'(OpenBoost better)' if nll_diff < 0 else '(NGBoost better)'}") - - return results - - -def benchmark_california_housing(n_trees: int = 100): - """Benchmark on California Housing dataset.""" - print(f"\n{'='*70}") - print(f"CALIFORNIA HOUSING DATASET: {n_trees} trees") - print('='*70) - - # Load data - data = fetch_california_housing() - X, y = data.data.astype(np.float32), data.target.astype(np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - print(f"Train: {len(X_train):,} samples, Test: {len(X_test):,} samples") - - results = {} - - # --- Official NGBoost --- - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - print("\n[Official NGBoost]") - model_official = NGBRegressor( - Dist=Normal, - n_estimators=n_trees, - learning_rate=0.1, - verbose=False, - ) - - t0 = time.perf_counter() - model_official.fit(X_train, y_train) - train_time = time.perf_counter() - t0 - - pred = model_official.predict(X_test) - dist = model_official.pred_dist(X_test) - nll = -dist.logpdf(y_test).mean() - rmse = np.sqrt(np.mean((pred - y_test)**2)) - - results['official'] = {'train_time': train_time, 'nll': nll, 'rmse': rmse} - print(f" Train time: {train_time:.2f}s | NLL: {nll:.4f} | RMSE: {rmse:.4f}") - - except Exception as e: - print(f" Error: {e}") - - # --- OpenBoost NaturalBoost --- - try: - import openboost as ob - - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, 
learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - print("\n[OpenBoost NaturalBoost]") - model = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) - - t0 = time.perf_counter() - model.fit(X_train, y_train) - train_time = time.perf_counter() - t0 - - pred = model.predict(X_test) - nll = model.score(X_test, y_test) - if hasattr(nll, 'mean'): - nll = nll.mean() - rmse = np.sqrt(np.mean((pred - y_test)**2)) - - results['openboost'] = {'train_time': train_time, 'nll': nll, 'rmse': rmse} - print(f" Train time: {train_time:.2f}s | NLL: {nll:.4f} | RMSE: {rmse:.4f}") - - except Exception as e: - print(f" Error: {e}") - import traceback - traceback.print_exc() - - # --- Comparison --- - if results.get('official') and results.get('openboost'): - print("\n[Comparison]") - speedup = results['official']['train_time'] / results['openboost']['train_time'] - print(f" Training speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(NGBoost faster)'}") - - return results - - -def benchmark_scaling(): - """Benchmark training time scaling with data size.""" - print(f"\n{'='*70}") - print("SCALING BENCHMARK") - print('='*70) - - sizes = [1000, 5000, 10000, 20000] - n_trees = 50 - - print(f"\n{'Size':<10} {'NGBoost':<12} {'OpenBoost':<12} {'Speedup':<10}") - print("-" * 44) - - # Warmup JIT on small data - try: - import openboost as ob - warmup_X, warmup_y = make_regression(n_samples=500, n_features=10, noise=10, random_state=0) - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - warmup_X.astype(np.float32), warmup_y.astype(np.float32) - ) - except Exception: - pass - - for n in sizes: - X, y = make_regression(n_samples=n, n_features=10, noise=10, random_state=42) - X = X.astype(np.float32) - y = y.astype(np.float32) - - # Official NGBoost - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - model = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) 
- t0 = time.perf_counter() - model.fit(X, y) - time_official = time.perf_counter() - t0 - except Exception: - time_official = float('nan') - - # OpenBoost - try: - import openboost as ob - - model = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) - t0 = time.perf_counter() - model.fit(X, y) - time_openboost = time.perf_counter() - t0 - except Exception: - time_openboost = float('nan') - - speedup = time_official / time_openboost if time_openboost > 0 else 0 - print(f"{n:<10} {time_official:<12.2f}s {time_openboost:<12.2f}s {speedup:<10.2f}x") - - -if __name__ == '__main__': - print("="*70) - print("OpenBoost NaturalBoost vs Official NGBoost Benchmark") - print("="*70) - - # Quick benchmark - benchmark_synthetic(n_samples=5000, n_features=10, n_trees=50) - - # Real dataset - benchmark_california_housing(n_trees=50) - - # Scaling - benchmark_scaling() - - print("\n" + "="*70) - print("BENCHMARK COMPLETE") - print("="*70) diff --git a/benchmarks/performance_report.py b/benchmarks/performance_report.py deleted file mode 100644 index 9c09165..0000000 --- a/benchmarks/performance_report.py +++ /dev/null @@ -1,436 +0,0 @@ -"""Generate comprehensive performance report for OpenBoost. - -GPU Performance Validation — compares: -1. NaturalBoost vs NGBoost (distributional GBDT) -2. 
OpenBoostGAM vs InterpretML EBM (interpretable models) - -Run on Modal: - uv run modal run benchmarks/performance_report.py - -Run locally (if you have GPU): - uv run python benchmarks/performance_report.py --local -""" - -from __future__ import annotations - -import json -import time -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-perf-report") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "scikit-learn>=1.0", - "ngboost>=0.5", - "interpret>=0.6", - "xgboost>=2.0", - "tabulate>=0.9", - "scipy>=1.10", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -def generate_data(n_samples: int, n_features: int, noise: float = 10.0, seed: int = 42): - """Generate synthetic regression data.""" - import numpy as np - - np.random.seed(seed) - X = np.random.randn(n_samples, n_features).astype(np.float32) - - # True model: linear + non-linear effects - y = ( - 2.0 * X[:, 0] + - np.sin(X[:, 1] * 2) + - 0.5 * X[:, 2] ** 2 + - noise * np.random.randn(n_samples) - ).astype(np.float32) - - return X, y - - -def run_performance_report(): - """Run all performance benchmarks.""" - import sys - sys.path.insert(0, "/root") - - import numpy as np - from sklearn.datasets import make_regression, fetch_california_housing - from sklearn.model_selection import train_test_split - from numba import cuda - - gpu_name = cuda.get_current_device().name - if isinstance(gpu_name, bytes): - gpu_name = gpu_name.decode() - - results = { - "gpu_device": gpu_name, - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "benchmarks": {} - } - - # ========================================================================= - # Benchmark 1: NaturalBoost vs NGBoost - # 
========================================================================= - print("=" * 70) - print("BENCHMARK 1: NaturalBoost vs NGBoost") - print("=" * 70) - - import openboost as ob - from ngboost import NGBRegressor - from ngboost.distns import Normal - - ob.set_backend("cuda") - print(f"OpenBoost backend: {ob.get_backend()}") - - ngboost_results = [] - - for n_samples in [250_000, 500_000, 1_000_000]: - print(f"\n{n_samples:,} samples:") - - X, y = generate_data(n_samples, n_features=20, noise=10.0) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - - # NGBoost - ngb = NGBRegressor( - Dist=Normal, - n_estimators=100, - learning_rate=0.1, - verbose=False - ) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time = time.perf_counter() - start - - # NGBoost predictions and metrics - ngb_pred = ngb.predict(X_test) - ngb_dist = ngb.pred_dist(X_test) - ngb_nll = float(-ngb_dist.logpdf(y_test).mean()) - ngb_lower = ngb_dist.ppf(0.05) - ngb_upper = ngb_dist.ppf(0.95) - ngb_coverage = float(np.mean((y_test >= ngb_lower) & (y_test <= ngb_upper))) - - # NaturalBoost - warmup first - nb_warmup = ob.NaturalBoostNormal(n_trees=2, max_depth=3) - nb_warmup.fit(X_train[:500], y_train[:500]) - cuda.synchronize() - - # NaturalBoost - nb = ob.NaturalBoostNormal(n_trees=100, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - cuda.synchronize() - nb_time = time.perf_counter() - start - - # NaturalBoost predictions and metrics - nb_pred = nb.predict(X_test) - nb_nll = float(nb.nll(X_test, y_test)) - nb_lower, nb_upper = nb.predict_interval(X_test, alpha=0.1) - nb_coverage = float(np.mean((y_test >= nb_lower) & (y_test <= nb_upper))) - - speedup = ngb_time / nb_time - - print(f" NGBoost: {ngb_time:.2f}s (NLL: {ngb_nll:.4f}, Coverage: {ngb_coverage:.1%})") - print(f" NaturalBoost: {nb_time:.2f}s (NLL: {nb_nll:.4f}, Coverage: {nb_coverage:.1%})") - print(f" Speedup: {speedup:.2f}x") - 
- ngboost_results.append({ - "samples": n_samples, - "ngboost_time": ngb_time, - "ngboost_nll": ngb_nll, - "ngboost_coverage": ngb_coverage, - "naturalboost_time": nb_time, - "naturalboost_nll": nb_nll, - "naturalboost_coverage": nb_coverage, - "speedup": speedup, - }) - - results["benchmarks"]["ngboost"] = ngboost_results - - # California Housing benchmark - print(f"\nCalifornia Housing Dataset:") - data = fetch_california_housing() - X, y = data.data.astype(np.float32), data.target.astype(np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - print(f" Samples: {len(X_train):,} train, {len(X_test):,} test") - - # NGBoost on California Housing - ngb = NGBRegressor(Dist=Normal, n_estimators=100, learning_rate=0.1, verbose=False) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time_cal = time.perf_counter() - start - - # NaturalBoost on California Housing - nb = ob.NaturalBoostNormal(n_trees=100, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - cuda.synchronize() - nb_time_cal = time.perf_counter() - start - - speedup_cal = ngb_time_cal / nb_time_cal - print(f" NGBoost: {ngb_time_cal:.2f}s") - print(f" NaturalBoost: {nb_time_cal:.2f}s") - print(f" Speedup: {speedup_cal:.2f}x") - - results["benchmarks"]["ngboost_california"] = { - "samples": len(X_train), - "ngboost_time": ngb_time_cal, - "naturalboost_time": nb_time_cal, - "speedup": speedup_cal, - } - - # ========================================================================= - # Benchmark 2: OpenBoostGAM vs InterpretML EBM - # ========================================================================= - print("\n" + "=" * 70) - print("BENCHMARK 2: OpenBoostGAM vs InterpretML EBM") - print("=" * 70) - - from interpret.glassbox import ExplainableBoostingRegressor - from openboost import OpenBoostGAM - from sklearn.metrics import r2_score - - ebm_results = [] - - for n_samples in [500_000, 1_000_000, 
2_000_000]: - print(f"\n{n_samples:,} samples:") - - X, y = generate_data(n_samples, n_features=20, noise=0.1) - - # InterpretML EBM - ebm = ExplainableBoostingRegressor( - max_rounds=200, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X, y) - ebm_time = time.perf_counter() - start - ebm_r2 = r2_score(y, ebm.predict(X)) - - # OpenBoostGAM - warmup - gam_warmup = OpenBoostGAM(n_rounds=10) - gam_warmup.fit(X[:1000], y[:1000]) - cuda.synchronize() - - # OpenBoostGAM - gam = OpenBoostGAM(n_rounds=200, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X, y) - cuda.synchronize() - gam_time = time.perf_counter() - start - gam_r2 = r2_score(y, gam.predict(X)) - - speedup = ebm_time / gam_time - - print(f" EBM: {ebm_time:.2f}s (R²: {ebm_r2:.4f})") - print(f" OpenBoostGAM: {gam_time:.2f}s (R²: {gam_r2:.4f})") - print(f" Speedup: {speedup:.1f}x") - - ebm_results.append({ - "samples": n_samples, - "ebm_time": ebm_time, - "ebm_r2": ebm_r2, - "openboostgam_time": gam_time, - "openboostgam_r2": gam_r2, - "speedup": speedup, - }) - - results["benchmarks"]["ebm"] = ebm_results - - # ========================================================================= - # Summary - # ========================================================================= - print("\n" + "=" * 70) - print("PERFORMANCE SUMMARY") - print("=" * 70) - - print("\nNaturalBoost vs NGBoost (100 trees, Normal distribution):") - print(f"{'Samples':<14} {'NGBoost (s)':<14} {'NaturalBoost (s)':<18} {'Speedup':<10}") - print("-" * 56) - for r in ngboost_results: - print(f"{r['samples']:<14,} {r['ngboost_time']:<14.2f} {r['naturalboost_time']:<18.2f} {r['speedup']:<10.2f}x") - - print("\nOpenBoostGAM vs InterpretML EBM (200 rounds, 20 features):") - print(f"{'Samples':<14} {'EBM (s)':<12} {'OpenBoostGAM (s)':<18} {'Speedup':<10}") - print("-" * 54) - for r in ebm_results: - print(f"{r['samples']:<14,} {r['ebm_time']:<12.2f} 
{r['openboostgam_time']:<18.2f} {r['speedup']:<10.1f}x") - - # Acceptance criteria check - print("\n" + "=" * 70) - print("ACCEPTANCE CRITERIA") - print("=" * 70) - - # Check NaturalBoost >1.3x faster - nb_faster = all(r['speedup'] > 1.0 for r in ngboost_results) - nb_speedup_1m = next(r['speedup'] for r in ngboost_results if r['samples'] == 1_000_000) - print(f"[{'✓' if nb_faster else '✗'}] NaturalBoost faster than NGBoost at all sizes") - print(f"[{'✓' if nb_speedup_1m > 1.3 else '✗'}] NaturalBoost >1.3x faster at 1M samples (actual: {nb_speedup_1m:.2f}x)") - - # Check OpenBoostGAM >10x faster at 2M - gam_speedup_2m = next(r['speedup'] for r in ebm_results if r['samples'] == 2_000_000) - print(f"[{'✓' if gam_speedup_2m > 10 else '✗'}] OpenBoostGAM >10x faster at 2M samples (actual: {gam_speedup_2m:.1f}x)") - - # Check comparable accuracy - gam_r2_comparable = all(abs(r['openboostgam_r2'] - r['ebm_r2']) < 0.1 for r in ebm_results) - print(f"[{'✓' if gam_r2_comparable else '✗'}] OpenBoostGAM comparable R² to EBM") - - results["acceptance_criteria"] = { - "naturalboost_faster_all": nb_faster, - "naturalboost_speedup_1m": nb_speedup_1m, - "openboostgam_speedup_2m": gam_speedup_2m, - "r2_comparable": gam_r2_comparable, - } - - return results - - -class BytesEncoder(json.JSONEncoder): - """Custom JSON encoder that handles bytes objects.""" - def default(self, obj): - if isinstance(obj, bytes): - return obj.decode('utf-8', errors='replace') - return super().default(obj) - - -def convert_bytes_in_dict(obj): - """Recursively convert bytes to strings in a dictionary.""" - if isinstance(obj, bytes): - return obj.decode('utf-8', errors='replace') - elif isinstance(obj, dict): - return {k: convert_bytes_in_dict(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [convert_bytes_in_dict(item) for item in obj] - return obj - - -if modal is not None and app is not None: - _run_performance_report_modal = app.function( - gpu="A100", image=image, timeout=14400 - 
)(run_performance_report) - - @app.local_entrypoint() - def main(): - """Run performance report.""" - print("Running OpenBoost Performance Report on Modal A100...") - print("This may take 10-20 minutes.\n") - - results = _run_performance_report_modal.remote() - - results = convert_bytes_in_dict(results) - - results_dir = PROJECT_ROOT / "benchmarks" / "results" - results_dir.mkdir(exist_ok=True) - - timestamp = time.strftime("%Y%m%d_%H%M%S") - results_file = results_dir / f"performance_report_{timestamp}.json" - - with open(results_file, "w") as f: - json.dump(results, f, indent=2, cls=BytesEncoder) - - print(f"\nResults saved to: {results_file}") - - print("\n" + "=" * 70) - print("README MARKDOWN (copy-paste ready):") - print("=" * 70) - - print(""" -### NaturalBoost vs NGBoost - -| Samples | NGBoost | NaturalBoost (GPU) | Speedup | -|---------|---------|-------------------|---------|""") - for r in results["benchmarks"]["ngboost"]: - print(f"| {r['samples']:,} | {r['ngboost_time']:.1f}s | {r['naturalboost_time']:.1f}s | {r['speedup']:.1f}x |") - - print(""" -*Benchmark: Normal distribution, 100 trees, 20 features, A100 GPU* - -### OpenBoostGAM vs InterpretML EBM - -| Samples | EBM (CPU) | OpenBoostGAM (GPU) | Speedup | -|---------|-----------|-------------------|---------|""") - for r in results["benchmarks"]["ebm"]: - print(f"| {r['samples']:,} | {r['ebm_time']:.0f}s | {r['openboostgam_time']:.1f}s | {r['speedup']:.0f}x |") - - print(""" -*Benchmark: 200 rounds, 20 features, pure GAM (no interactions), A100 GPU* -""") - - -# For local execution without Modal -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally (requires GPU)...") - - import numpy as np - from sklearn.model_selection import train_test_split - - # Add openboost to path - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - import openboost as ob - print(f"OpenBoost backend: {ob.get_backend()}") - - # Quick benchmark - print("\nQuick 
NaturalBoost vs NGBoost benchmark (5K samples):") - X, y = generate_data(5000, 20, noise=10.0) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - ngb = NGBRegressor(Dist=Normal, n_estimators=50, learning_rate=0.1, verbose=False) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time = time.perf_counter() - start - print(f" NGBoost: {ngb_time:.2f}s") - except ImportError: - print(" NGBoost not installed. Run: pip install ngboost") - ngb_time = None - - try: - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - nb = ob.NaturalBoostNormal(n_trees=50, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - nb_time = time.perf_counter() - start - print(f" NaturalBoost: {nb_time:.2f}s") - - if ngb_time: - print(f" Speedup: {ngb_time / nb_time:.2f}x") - except Exception as e: - print(f" NaturalBoost failed: {e}") - - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/performance_report.py") - print(" Local: uv run python benchmarks/performance_report.py --local") diff --git a/benchmarks/modal_bench.py b/benchmarks/profile_kernels.py similarity index 99% rename from benchmarks/modal_bench.py rename to benchmarks/profile_kernels.py index c9971c1..3ed5bf7 100644 --- a/benchmarks/modal_bench.py +++ b/benchmarks/profile_kernels.py @@ -2,7 +2,7 @@ Run from Mac with: cd openboost - uv run modal run benchmarks/modal_bench.py + uv run modal run benchmarks/profile_kernels.py This will execute benchmarks on a cloud A100 GPU. """ diff --git a/benchmarks/profile_loop.py b/benchmarks/profile_loop.py new file mode 100644 index 0000000..2a17e6f --- /dev/null +++ b/benchmarks/profile_loop.py @@ -0,0 +1,100 @@ +"""Profile OpenBoost training and identify bottlenecks. + +Part of the self-recursive improvement loop: + 1. 
Run this script to profile and identify the top bottleneck + 2. Optimize the target code + 3. Re-run to verify improvement and compare with previous run + +Usage: + uv run python benchmarks/profile_loop.py + uv run python benchmarks/profile_loop.py --n-samples 200000 --n-features 50 + uv run python benchmarks/profile_loop.py --n-trees 200 --max-depth 8 + uv run python benchmarks/profile_loop.py --summarize + uv run python benchmarks/profile_loop.py --growth leafwise +""" + +from __future__ import annotations + +import argparse +import sys +import time + +import numpy as np + + +def main(): + parser = argparse.ArgumentParser(description="Profile OpenBoost training loop") + parser.add_argument("--n-samples", type=int, default=50_000) + parser.add_argument("--n-features", type=int, default=20) + parser.add_argument("--n-trees", type=int, default=100) + parser.add_argument("--max-depth", type=int, default=6) + parser.add_argument("--learning-rate", type=float, default=0.1) + parser.add_argument("--loss", type=str, default="mse") + parser.add_argument("--output-dir", type=str, default="logs/") + parser.add_argument("--summarize", action="store_true", + help="Print machine-readable summary for improvement loops") + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + # Generate synthetic dataset + rng = np.random.RandomState(args.seed) + X = rng.randn(args.n_samples, args.n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + + rng.randn(args.n_samples).astype(np.float32) * 0.1).astype(np.float32) + + import openboost as ob + from openboost._profiler import ProfilingCallback, print_profile_summary + + profiler = ProfilingCallback(output_dir=args.output_dir) + model = ob.GradientBoosting( + n_trees=args.n_trees, + max_depth=args.max_depth, + learning_rate=args.learning_rate, + loss=args.loss, + ) + + print(f"Profiling: {args.n_samples:,} samples, {args.n_features} features, " + f"{args.n_trees} trees, 
depth={args.max_depth}") + + # Warmup JIT (first fit compiles Numba kernels) + warmup_model = ob.GradientBoosting(n_trees=2, max_depth=args.max_depth, loss=args.loss) + warmup_model.fit(X[:1000], y[:1000]) + + wall_start = time.perf_counter() + model.fit(X, y, callbacks=[profiler]) + wall_time = time.perf_counter() - wall_start + + print(f"Wall time: {wall_time:.2f}s") + print(f"Report: {profiler.report_path}") + + if args.summarize: + print() + report = profiler.report + report["_path"] = str(profiler.report_path) + print_profile_summary(report) + else: + # Print compact phase table + report = profiler.report + print(f"\n{'Phase':<20} {'Time (s)':>10} {'%':>8} {'Calls':>8}") + print("-" * 50) + for phase, data in report["phases"].items(): + calls = str(data["calls"]) if data["calls"] is not None else "-" + print(f"{phase:<20} {data['total_s']:>10.3f} {data['pct']:>7.1f}% {calls:>8}") + print("-" * 50) + print(f"{'TOTAL':<20} {report['total_time_s']:>10.3f}") + + if report.get("bottlenecks"): + print(f"\nTop bottleneck: {report['bottlenecks'][0]['phase']} " + f"({report['bottlenecks'][0]['pct']}%)") + print(f" Target: {report['bottlenecks'][0]['target']}") + print(f" Recommendation: {report['bottlenecks'][0]['recommendation']}") + + if report.get("comparison"): + comp = report["comparison"] + delta = comp["delta_total_pct"] + sign = "+" if delta > 0 else "" + print(f"\nvs previous run: {sign}{delta}% total time") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/openml_integration.py b/benchmarks/validate_datasets.py similarity index 98% rename from benchmarks/openml_integration.py rename to benchmarks/validate_datasets.py index 5edd495..0f48c15 100644 --- a/benchmarks/openml_integration.py +++ b/benchmarks/validate_datasets.py @@ -3,19 +3,19 @@ Pre-release validation of OpenBoost vs XGBoost performance on real-world datasets. 
Run on Modal (GPU): - uv run modal run benchmarks/openml_integration.py + uv run modal run benchmarks/validate_datasets.py Run locally (small datasets only): - uv run python benchmarks/openml_integration.py --local + uv run python benchmarks/validate_datasets.py --local Run specific datasets: - uv run modal run benchmarks/openml_integration.py --datasets cpu_act higgs + uv run modal run benchmarks/validate_datasets.py --datasets cpu_act higgs Run specific configs: - uv run modal run benchmarks/openml_integration.py --configs baseline deep_tree + uv run modal run benchmarks/validate_datasets.py --configs baseline deep_tree Run extended suite: - uv run modal run benchmarks/openml_integration.py --extended + uv run modal run benchmarks/validate_datasets.py --extended """ from __future__ import annotations @@ -1269,8 +1269,8 @@ def main( print(f"\nResults saved to: {results_file}") else: print("Usage:") - print(" Modal: uv run modal run benchmarks/openml_integration.py") - print(" Local: uv run python benchmarks/openml_integration.py --local") + print(" Modal: uv run modal run benchmarks/validate_datasets.py") + print(" Local: uv run python benchmarks/validate_datasets.py --local") print("") print("Options:") print(" --datasets cpu_act higgs Run specific datasets") diff --git a/benchmarks/xgboost_benchmark.py b/benchmarks/xgboost_benchmark.py deleted file mode 100644 index cf16644..0000000 --- a/benchmarks/xgboost_benchmark.py +++ /dev/null @@ -1,594 +0,0 @@ -"""Benchmark: OpenBoost vs XGBoost on Multiple Tasks. 
- -Run locally: - uv run python benchmarks/xgboost_benchmark.py --local - -Run on Modal (cloud A100): - uv run modal run benchmarks/xgboost_benchmark.py -""" - -from __future__ import annotations - -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-xgboost-bench") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "scikit-learn>=1.0", - "xgboost>=2.0", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -# ============================================================================= -# Data Generators -# ============================================================================= - -def generate_regression_data(n_samples: int, n_features: int, noise: float = 0.1): - """Generate regression data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Non-linear target with interactions - y = ( - np.sin(X[:, 0] * 2) + - 0.5 * X[:, 1] ** 2 + - 0.3 * X[:, 2] * X[:, 3] + - noise * np.random.randn(n_samples) - ).astype(np.float32) - return X, y - - -def generate_binary_data(n_samples: int, n_features: int): - """Generate binary classification data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + 0.2 * X[:, 3] - probs = 1 / (1 + np.exp(-logits)) - y = (np.random.rand(n_samples) < probs).astype(np.float32) - return X, y - - -def generate_multiclass_data(n_samples: int, n_features: int, n_classes: int = 5): - """Generate multi-class classification data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Create class boundaries based on first few 
features - scores = np.zeros((n_samples, n_classes)) - for k in range(n_classes): - scores[:, k] = X[:, k % n_features] + 0.5 * X[:, (k + 1) % n_features] - y = np.argmax(scores + 0.5 * np.random.randn(n_samples, n_classes), axis=1) - return X, y.astype(np.int32) - - -def generate_quantile_data(n_samples: int, n_features: int): - """Generate heteroscedastic data for quantile regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Heteroscedastic noise: variance depends on X[:, 0] - noise_std = 0.5 + np.abs(X[:, 0]) - y = (X[:, 0] + 0.5 * X[:, 1] + noise_std * np.random.randn(n_samples)).astype(np.float32) - return X, y - - -def generate_poisson_data(n_samples: int, n_features: int): - """Generate count data for Poisson regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Log-linear model - log_mu = 1.0 + 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] - mu = np.exp(np.clip(log_mu, -5, 5)) - y = np.random.poisson(mu).astype(np.float32) - return X, y - - -def generate_gamma_data(n_samples: int, n_features: int): - """Generate positive continuous data for Gamma regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Log-linear model for mean - log_mu = 2.0 + 0.3 * X[:, 0] + 0.2 * X[:, 1] - mu = np.exp(np.clip(log_mu, -3, 5)) - # Gamma with shape=2 - shape = 2.0 - scale = mu / shape - y = np.random.gamma(shape, scale).astype(np.float32) - return X, y - - -# ============================================================================= -# Benchmark Functions -# ============================================================================= - -def benchmark_regression(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark regression task.""" - import numpy as np - import time - from sklearn.metrics import mean_squared_error, r2_score 
- - results = {} - - # OpenBoost - import openboost as ob - model = ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='mse', - ) - - # Warmup - ob.GradientBoosting(n_trees=5, max_depth=3).fit(X_train[:1000], y_train[:1000]) - if use_gpu: - from numba import cuda - cuda.synchronize() - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'r2': r2_score(y_test, y_pred), - 'rmse': np.sqrt(mean_squared_error(y_test, y_pred)), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBRegressor( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'r2': r2_score(y_test, y_pred_xgb), - 'rmse': np.sqrt(mean_squared_error(y_test, y_pred_xgb)), - } - - return results - - -def benchmark_binary(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark binary classification task.""" - import numpy as np - import time - from sklearn.metrics import roc_auc_score, accuracy_score - - results = {} - - # OpenBoost - import openboost as ob - - # Warmup JIT - ob.GradientBoosting(n_trees=5, max_depth=3, loss='logloss').fit( - X_train[:1000], y_train[:1000] - ) - if use_gpu: - from numba import cuda - cuda.synchronize() - - model = 
ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='logloss', - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_raw = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - # Convert to probabilities - y_pred_prob = 1 / (1 + np.exp(-y_pred_raw)) - y_pred = (y_pred_prob > 0.5).astype(int) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'auc': roc_auc_score(y_test, y_pred_prob), - 'accuracy': accuracy_score(y_test, y_pred), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBClassifier( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb_prob = xgb_model.predict_proba(X_test)[:, 1] - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred_xgb = (y_pred_xgb_prob > 0.5).astype(int) - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'auc': roc_auc_score(y_test, y_pred_xgb_prob), - 'accuracy': accuracy_score(y_test, y_pred_xgb), - } - - return results - - -def benchmark_multiclass(X_train, X_test, y_train, y_test, n_classes=5, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark multi-class classification task.""" - import numpy as np - import time - from sklearn.metrics import accuracy_score, log_loss - - results = {} - - # OpenBoost MultiClass - import openboost as ob - - # Warmup JIT - ob.MultiClassGradientBoosting( - n_classes=n_classes, n_trees=5, max_depth=3 - ).fit(X_train[:1000], y_train[:1000]) - if use_gpu: - from numba import 
cuda - cuda.synchronize() - - model = ob.MultiClassGradientBoosting( - n_classes=n_classes, - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_prob = model.predict_proba(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred = np.argmax(y_pred_prob, axis=1) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'accuracy': accuracy_score(y_test, y_pred), - 'logloss': log_loss(y_test, y_pred_prob), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBClassifier( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - objective='multi:softprob', - num_class=n_classes, - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb_prob = xgb_model.predict_proba(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred_xgb = np.argmax(y_pred_xgb_prob, axis=1) - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'accuracy': accuracy_score(y_test, y_pred_xgb), - 'logloss': log_loss(y_test, y_pred_xgb_prob), - } - - return results - - -def benchmark_poisson(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark Poisson regression task.""" - import numpy as np - import time - - def poisson_deviance(y_true, y_pred): - """Compute Poisson deviance.""" - y_pred = np.maximum(y_pred, 1e-8) - return 2 * np.mean(y_pred - y_true + y_true * np.log(np.maximum(y_true, 1e-8) / y_pred)) - - results = {} - - # OpenBoost - import openboost as ob - - # Warmup JIT - 
ob.GradientBoosting(n_trees=5, max_depth=3, loss='poisson').fit( - X_train[:1000], y_train[:1000] - ) - if use_gpu: - from numba import cuda - cuda.synchronize() - - model = ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='poisson', - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_raw = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - # Poisson uses log link - y_pred = np.exp(y_pred_raw) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'deviance': poisson_deviance(y_test, y_pred), - 'mean_pred': np.mean(y_pred), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBRegressor( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - objective='count:poisson', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'deviance': poisson_deviance(y_test, y_pred_xgb), - 'mean_pred': np.mean(y_pred_xgb), - } - - return results - - -# ============================================================================= -# Main Benchmark Runner -# ============================================================================= - -def print_results(task_name, results, metric1_name, metric2_name): - """Print formatted results.""" - print(f"\n{'─' * 60}") - print(f"Task: {task_name}") - print(f"{'─' * 60}") - print(f"{'Model':<15} {'Train (s)':<12} {'Pred (ms)':<12} {metric1_name:<12} 
{metric2_name:<12}") - print(f"{'─' * 60}") - - metric_keys = [k for k in list(results['openboost'].keys()) if k not in ('train_time', 'pred_time')] - for name in ['openboost', 'xgboost']: - r = results[name] - m1 = r[metric_keys[0]] if metric_keys else 0.0 - m2 = r[metric_keys[1]] if len(metric_keys) > 1 else 0.0 - print(f"{name:<15} {r['train_time']:<12.3f} {r['pred_time']:<12.2f} {m1:<12.4f} {m2:<12.4f}") - - # Speedup - speedup = results['xgboost']['train_time'] / results['openboost']['train_time'] - print(f"{'─' * 60}") - print(f"Speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(XGBoost faster)'}") - - -def run_all_benchmarks(n_samples=50_000, n_features=20, n_trees=100, max_depth=6, use_gpu=False): - """Run all benchmark tasks.""" - from sklearn.model_selection import train_test_split - - print("=" * 60) - print("OPENBOOST vs XGBOOST BENCHMARK") - print("=" * 60) - print(f"Config: {n_samples:,} samples, {n_features} features, {n_trees} trees, depth {max_depth}") - print(f"Device: {'GPU' if use_gpu else 'CPU'}") - - all_results = {} - - # 1. Regression - print("\n[1/5] Regression (MSE)...") - X, y = generate_regression_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_regression(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Regression (MSE)", results, "R²", "RMSE") - all_results['regression'] = results - - # 2. Binary Classification - print("\n[2/5] Binary Classification (LogLoss)...") - X, y = generate_binary_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_binary(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Binary Classification", results, "AUC", "Accuracy") - all_results['binary'] = results - - # 3. 
Multi-class Classification - print("\n[3/5] Multi-class Classification (Softmax)...") - n_classes = 5 - X, y = generate_multiclass_data(n_samples, n_features, n_classes) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_multiclass(X_train, X_test, y_train, y_test, n_classes, n_trees, max_depth, use_gpu) - print_results("Multi-class (5 classes)", results, "Accuracy", "LogLoss") - all_results['multiclass'] = results - - # 4. Poisson Regression - print("\n[4/5] Poisson Regression...") - X, y = generate_poisson_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_poisson(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Poisson Regression", results, "Deviance", "Mean Pred") - all_results['poisson'] = results - - # Summary - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - print(f"\n{'Task':<25} {'OpenBoost (s)':<15} {'XGBoost (s)':<15} {'Speedup':<10}") - print("─" * 65) - - for task, res in all_results.items(): - ob_time = res['openboost']['train_time'] - xgb_time = res['xgboost']['train_time'] - speedup = xgb_time / ob_time - faster = "OB" if speedup > 1 else "XGB" - print(f"{task:<25} {ob_time:<15.3f} {xgb_time:<15.3f} {speedup:.2f}x ({faster})") - - return all_results - - -# ============================================================================= -# Modal Entry Points (only defined when modal is installed) -# ============================================================================= - -if modal is not None and app is not None: - - @app.function(gpu="A100", image=image, timeout=1800) - def benchmark_gpu(n_samples: int = 100_000, n_features: int = 20, n_trees: int = 100): - """Run benchmark on GPU.""" - import sys - sys.path.insert(0, "/root") - - from numba import cuda - print(f"GPU: {cuda.get_current_device().name}") - - return run_all_benchmarks( - 
n_samples=n_samples, - n_features=n_features, - n_trees=n_trees, - max_depth=6, - use_gpu=True, - ) - - @app.local_entrypoint() - def main(): - """Run benchmark on Modal.""" - print("Running OpenBoost vs XGBoost benchmark on Modal A100...") - results = benchmark_gpu.remote(n_samples=100_000, n_features=20, n_trees=100) - print("\n\nFinal Results:") - print(results) - - -# ============================================================================= -# Local Execution -# ============================================================================= - -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally on CPU...") - - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - run_all_benchmarks( - n_samples=20_000, # Smaller for CPU - n_features=20, - n_trees=50, - max_depth=6, - use_gpu=False, - ) - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/xgboost_benchmark.py") - print(" Local: uv run python benchmarks/xgboost_benchmark.py --local") diff --git a/pyproject.toml b/pyproject.toml index 9136a45..87a468a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ sklearn = [ test = [ "pytest>=7.0", "pytest-cov>=4.0", + "pytest-xdist>=3.0", ] # Benchmarking bench = [ @@ -92,7 +93,16 @@ ignore = ["E501"] # Line length handled separately [tool.pytest.ini_options] testpaths = ["tests"] -addopts = "-v --tb=short" +addopts = "-v --tb=short -n auto --dist loadfile" +markers = [ + "slow: marks tests that take >10s (deselect with '-m \"not slow\"')", + "gpu: marks tests requiring CUDA GPU", + "xgboost: marks tests requiring xgboost package", + "lightgbm: marks tests requiring lightgbm package", + "numerical: marks numerical agreement tests against reference implementations", + "parity: marks CPU/GPU parity tests", + "benchmark: marks performance benchmark tests (not run in CI)", +] [dependency-groups] dev = [ @@ -104,6 +114,7 @@ dev = [ "mypy>=1.19.1", "pytest>=7.0", "pytest-cov>=4.0", 
+ "pytest-xdist>=3.0", "ruff>=0.4", "xgboost>=2.0", ] diff --git a/src/openboost/__init__.py b/src/openboost/__init__.py index 4bbbc56..206e100 100644 --- a/src/openboost/__init__.py +++ b/src/openboost/__init__.py @@ -161,6 +161,7 @@ CallbackManager, TrainingState, ) +from ._profiler import ProfilingCallback # ============================================================================= # Feature Importance (Phase 13) @@ -352,6 +353,7 @@ def __getattr__(name: str): "HistoryCallback", "CallbackManager", "TrainingState", + "ProfilingCallback", # Feature importance (Phase 13) "compute_feature_importances", "get_feature_importance_dict", diff --git a/src/openboost/_core/_growth.py b/src/openboost/_core/_growth.py index e9cb773..b45cf67 100644 --- a/src/openboost/_core/_growth.py +++ b/src/openboost/_core/_growth.py @@ -214,6 +214,9 @@ class TreeStructure: # Phase 14.3: Categorical split support is_categorical_split: NDArray | None = None # (n_nodes,) bool - True if categorical split cat_bitsets: NDArray | None = None # (n_nodes,) uint64 - bitmask for categories going left + + # Cached GPU arrays for fast repeated prediction (avoids re-transferring) + _gpu_arrays: dict | None = field(default=None, repr=False) def get_leaf_values(self, leaf_ids: NDArray) -> NDArray: """Get leaf values for given leaf IDs. 
@@ -335,38 +338,51 @@ def _predict_standard_cpu(self, binned: NDArray) -> NDArray: # Get leaf values (works with both NDArray and LeafValues) return self.get_leaf_values(leaf_ids) - def _predict_standard_gpu(self, binned) -> NDArray: - """GPU prediction for standard trees.""" - from numba import cuda - from .._backends._cuda import predict_cuda, predict_with_categorical_cuda, to_device + def _ensure_gpu_arrays(self): + """Cache tree structure arrays on GPU to avoid repeated transfers.""" + if self._gpu_arrays is not None: + return self._gpu_arrays + from .._backends._cuda import to_device + self._gpu_arrays = { + 'features': to_device(self.features), + 'thresholds': to_device(self.thresholds.astype(np.uint8)), + 'values': to_device(self.values if isinstance(self.values, np.ndarray) else self.leaf_values_array), + 'left': to_device(self.left_children), + 'right': to_device(self.right_children), + 'missing_left': to_device(self.missing_go_left) if self.missing_go_left is not None else None, + } has_categorical = ( self.is_categorical_split is not None and self.cat_bitsets is not None and np.any(self.is_categorical_split) ) - if has_categorical: + self._gpu_arrays['is_categorical'] = to_device(self.is_categorical_split) + self._gpu_arrays['cat_bitsets'] = to_device(self.cat_bitsets) + return self._gpu_arrays + + def _predict_standard_gpu(self, binned) -> NDArray: + """GPU prediction for standard trees.""" + from .._backends._cuda import predict_cuda, predict_with_categorical_cuda + + ga = self._ensure_gpu_arrays() + + if 'is_categorical' in ga: return predict_with_categorical_cuda( binned, - to_device(self.features), - to_device(self.thresholds.astype(np.uint8)), - to_device(self.values), - to_device(self.left_children), - to_device(self.right_children), - tree_missing_left=to_device(self.missing_go_left) if self.missing_go_left is not None else None, - is_categorical_split=to_device(self.is_categorical_split), - cat_bitsets=to_device(self.cat_bitsets), + 
ga['features'], ga['thresholds'], ga['values'], + ga['left'], ga['right'], + tree_missing_left=ga['missing_left'], + is_categorical_split=ga['is_categorical'], + cat_bitsets=ga['cat_bitsets'], ) return predict_cuda( binned, - to_device(self.features), - to_device(self.thresholds.astype(np.uint8)), - to_device(self.values), - to_device(self.left_children), - to_device(self.right_children), - tree_missing_left=to_device(self.missing_go_left) if self.missing_go_left is not None else None, + ga['features'], ga['thresholds'], ga['values'], + ga['left'], ga['right'], + tree_missing_left=ga['missing_left'], ) def _predict_symmetric(self, binned: NDArray) -> NDArray: @@ -517,20 +533,34 @@ def grow( # Build level by level for depth in range(config.max_depth): nodes_at_level = get_nodes_at_depth(depth) - + # Filter to nodes that have samples active_nodes = self._get_active_nodes(sample_node_ids, nodes_at_level) if not active_nodes: break - # Build histograms for active nodes - histograms = build_node_histograms( - binned, grad, hess, sample_node_ids, active_nodes - ) + # Histogram subtraction: build only smaller children, subtract for larger + if depth > 0 and parent_histograms: + build_nodes, subtract_info = self._plan_histogram_subtraction( + active_nodes, sample_node_ids, parent_histograms + ) + # Build histograms only for the subset that needs full computation + histograms = build_node_histograms( + binned, grad, hess, sample_node_ids, build_nodes + ) if build_nodes else {} + # Derive larger children's histograms via subtraction (O(features*bins)) + for child_id, (parent_hist, sibling_id) in subtract_info.items(): + histograms[child_id] = subtract_histogram( + parent_hist, histograms[sibling_id], child_id + ) + else: + histograms = build_node_histograms( + binned, grad, hess, sample_node_ids, active_nodes + ) # Column subsampling: zero out non-selected feature histograms if col_mask is not None: - for node_id, hist in histograms.items(): + for _node_id, hist in 
histograms.items(): hist.hist_grad[~col_mask] = 0.0 hist.hist_hess[~col_mask] = 0.0 @@ -544,7 +574,7 @@ def grow( is_categorical=is_categorical, # Phase 14.3 n_categories=n_categories, # Phase 14.3 ) - + # Only update depth if at least one valid split was found if splits: actual_depth = depth + 1 @@ -558,15 +588,15 @@ def grow( missing_go_left[node_id] = node_split.missing_go_left # Phase 14 is_categorical_split[node_id] = node_split.is_categorical # Phase 14.3 cat_bitsets[node_id] = node_split.cat_bitset # Phase 14.3 - + # Partition samples (handles missing via learned direction) if splits: sample_node_ids = partition_samples( - binned, sample_node_ids, splits, + binned, sample_node_ids, splits, missing_go_left=missing_go_left # Phase 14 ) - - # Store histograms for potential subtraction (future optimization) + + # Store histograms for subtraction at next level parent_histograms = histograms # Compute leaf values for all leaf nodes @@ -598,6 +628,69 @@ def grow( cat_bitsets=cat_bitsets[:n_nodes] if any_cat else None, # Phase 14.3 ) + def _plan_histogram_subtraction( + self, + active_nodes: list[int], + sample_node_ids, + parent_histograms: dict[int, NodeHistogram], + ) -> tuple[list[int], dict[int, tuple[NodeHistogram, int]]]: + """Plan which children to build vs subtract. + + For each parent that split, build the histogram for the smaller child + and derive the larger child via subtraction: larger = parent - smaller. + This halves histogram computation on average. + + Returns: + build_nodes: Nodes whose histograms must be built from samples. 
+ subtract_info: {child_id: (parent_histogram, sibling_id_to_subtract_from)} + """ + if hasattr(sample_node_ids, 'copy_to_host'): + ids_cpu = sample_node_ids.copy_to_host() + else: + ids_cpu = np.asarray(sample_node_ids) + + # Count samples per node + node_counts: dict[int, int] = {} + for nid in active_nodes: + node_counts[nid] = int(np.sum(ids_cpu == nid)) + + build_nodes: list[int] = [] + subtract_info: dict[int, tuple[NodeHistogram, int]] = {} + + # Group children by parent + processed_parents: set[int] = set() + for nid in active_nodes: + parent_id = (nid - 1) // 2 + if parent_id in processed_parents: + continue + if parent_id not in parent_histograms: + # No parent histogram — must build from samples + build_nodes.append(nid) + continue + + # Find sibling + left_child = 2 * parent_id + 1 + right_child = 2 * parent_id + 2 + sibling = right_child if nid == left_child else left_child + + # Both children must be active for subtraction + if sibling not in node_counts: + build_nodes.append(nid) + continue + + processed_parents.add(parent_id) + parent_hist = parent_histograms[parent_id] + + # Build the smaller child, subtract for the larger + if node_counts[left_child] <= node_counts[right_child]: + build_nodes.append(left_child) + subtract_info[right_child] = (parent_hist, left_child) + else: + build_nodes.append(right_child) + subtract_info[left_child] = (parent_hist, right_child) + + return build_nodes, subtract_info + def _get_active_nodes(self, sample_node_ids, candidate_nodes: list[int]) -> list[int]: """Get nodes that have samples assigned to them.""" if hasattr(sample_node_ids, 'copy_to_host'): diff --git a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index 0ecc6c2..a9a1e7a 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -14,6 +14,7 @@ from __future__ import annotations +import os from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Callable, 
Literal @@ -227,6 +228,12 @@ def fit( """ # Clear any previous fit self.trees_ = [] + + # Auto-enable profiling via env var + if os.environ.get("OPENBOOST_PROFILE"): + from .._profiler import ProfilingCallback + _profile_dir = os.environ.get("OPENBOOST_PROFILE_DIR", "logs/") + callbacks = list(callbacks or []) + [ProfilingCallback(output_dir=_profile_dir)] # Validate inputs (Phase 20.3) X = validate_X(X, allow_nan=True, context="fit") @@ -700,45 +707,72 @@ def _fit_gpu( colsample_bytree=self.colsample_bytree, ) else: - # Standard training (with optional row/col subsampling in fit_tree) - tree = fit_tree( - self.X_binned_, - grad_gpu, - hess_gpu, - max_depth=self.max_depth, - min_child_weight=self.min_child_weight, - reg_lambda=self.reg_lambda, - reg_alpha=self.reg_alpha, - gamma=self.gamma, - subsample=self.subsample, - colsample_bytree=self.colsample_bytree, + # Use GPU-native tree builder when no features require the + # growth-strategy path (reg_alpha, colsample, subsample, etc.) + use_gpu_native = ( + is_cuda() + and self.reg_alpha == 0.0 + and self.colsample_bytree >= 1.0 + and self.subsample >= 1.0 ) + if use_gpu_native: + from .._core._tree import fit_tree_gpu_native + tree = fit_tree_gpu_native( + self.X_binned_, + grad_gpu, + hess_gpu, + max_depth=self.max_depth, + min_child_weight=self.min_child_weight, + reg_lambda=self.reg_lambda, + min_gain=self.gamma, + ) + else: + tree = fit_tree( + self.X_binned_, + grad_gpu, + hess_gpu, + max_depth=self.max_depth, + min_child_weight=self.min_child_weight, + reg_lambda=self.reg_lambda, + reg_alpha=self.reg_alpha, + gamma=self.gamma, + subsample=self.subsample, + colsample_bytree=self.colsample_bytree, + ) - # Update predictions - tree_pred = tree(self.X_binned_) - if hasattr(tree_pred, 'copy_to_host'): - tree_pred_cpu = tree_pred.copy_to_host() + # Update predictions on GPU + from .._core._tree import Tree + if isinstance(tree, Tree) and tree.on_gpu: + # Fused traversal + add: single kernel, no intermediate array + 
from .._core._predict import predict_tree_add_gpu + predict_tree_add_gpu(tree, self.X_binned_, pred_gpu, self.learning_rate) else: - tree_pred_cpu = tree_pred - - # Update GPU predictions - pred_cpu = pred_gpu.copy_to_host() - pred_cpu += self.learning_rate * tree_pred_cpu - cuda.to_device(pred_cpu, to=pred_gpu) - + tree_pred = tree(self.X_binned_) + if hasattr(tree_pred, '__cuda_array_interface__'): + from .._core._predict import _add_inplace_cuda + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + else: + if hasattr(tree_pred, 'copy_to_host'): + tree_pred = tree_pred.copy_to_host() + pred_cpu = pred_gpu.copy_to_host() + pred_cpu += self.learning_rate * tree_pred + cuda.to_device(pred_cpu, to=pred_gpu) + self.trees_.append(tree) - - # Compute losses for callbacks using actual loss function - state.train_loss = _compute_loss_value(self.loss, pred_cpu, y, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) - if eval_set: - X_val, y_val = eval_set[0] - val_pred = self.predict(X_val) - state.val_loss = _compute_loss_value(self.loss, val_pred, y_val, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) + # Only compute loss and copy to CPU when callbacks need it + if cb_manager.callbacks: + pred_cpu = pred_gpu.copy_to_host() + state.train_loss = _compute_loss_value(self.loss, pred_cpu, y, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) - # Check if callbacks want to stop - if not cb_manager.on_round_end(state): - break + if eval_set: + X_val, y_val = eval_set[0] + val_pred = self.predict(X_val) + state.val_loss = _compute_loss_value(self.loss, val_pred, y_val, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) + + # Check if callbacks want to stop + if not cb_manager.on_round_end(state): + break cb_manager.on_train_end(state) @@ -907,6 +941,19 @@ def predict(self, X: NDArray | BinnedArray) -> NDArray: # Accumulate tree predictions with base score base = getattr(self, 'base_score_', np.float32(0.0)) 
+ + # Use GPU accumulation when data is on GPU + if is_cuda() and hasattr(X_binned.data, '__cuda_array_interface__'): + from numba import cuda + + from .._core._predict import _add_inplace_cuda, _fill_cuda + pred_gpu = cuda.device_array(n_samples, dtype=np.float32) + _fill_cuda(pred_gpu, float(base)) + for tree in self.trees_: + tree_pred = tree(X_binned) + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + return pred_gpu.copy_to_host() + pred = np.full(n_samples, base, dtype=np.float32) for tree in self.trees_: tree_pred = tree(X_binned) diff --git a/src/openboost/_profiler.py b/src/openboost/_profiler.py new file mode 100644 index 0000000..b3e429f --- /dev/null +++ b/src/openboost/_profiler.py @@ -0,0 +1,495 @@ +"""Profiling callback for OpenBoost training. + +Instruments the training loop to produce structured JSON reports that +break down time by phase (histogram building, split finding, partitioning, +etc.). Designed for self-recursive improvement loops: profile → identify +bottleneck → optimize → re-profile → verify improvement. 
+ +Usage: + # Explicit callback + from openboost import GradientBoosting, ProfilingCallback + profiler = ProfilingCallback(output_dir="logs/") + model = GradientBoosting(n_trees=100) + model.fit(X, y, callbacks=[profiler]) + print(profiler.report_path) + + # Environment variable (zero-code-change) + OPENBOOST_PROFILE=1 uv run python train.py +""" + +from __future__ import annotations + +import json +import os +import platform +import subprocess +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from ._callbacks import Callback, TrainingState + + +# ============================================================================= +# Phase timer +# ============================================================================= + +class PhaseTimer: + """Accumulates time for a named phase across multiple calls.""" + + __slots__ = ("name", "_use_cuda", "_times", "_start") + + def __init__(self, name: str, use_cuda: bool = False): + self.name = name + self._use_cuda = use_cuda + self._times: list[float] = [] + self._start: float | None = None + + def start(self) -> None: + if self._use_cuda: + from numba import cuda + cuda.synchronize() + self._start = time.perf_counter() + + def stop(self) -> float: + if self._use_cuda: + from numba import cuda + cuda.synchronize() + elapsed = time.perf_counter() - self._start + self._times.append(elapsed) + self._start = None + return elapsed + + @property + def total(self) -> float: + return sum(self._times) + + @property + def count(self) -> int: + return len(self._times) + + @property + def mean(self) -> float: + return self.total / self.count if self._times else 0.0 + + def to_dict(self, total_time: float) -> dict: + return { + "total_s": round(self.total, 6), + "pct": round(100 * self.total / total_time, 2) if total_time > 0 else 0, + "calls": self.count, + "mean_s": round(self.mean, 6), + } + + +# ============================================================================= +# 
Bottleneck recommendations +# ============================================================================= + +PHASE_RECOMMENDATIONS: dict[str, tuple[str, str]] = { + "histogram_build": ( + "_backends/_cpu.py:build_histogram_cpu, _backends/_cuda.py:_build_histogram_shared_kernel", + "shared-memory tiling, feature batching, reducing n_bins", + ), + "split_find": ( + "_core/_primitives.py:find_node_splits, _core/_split.py:find_best_split", + "GPU parallel scan, vectorized prefix-sum split evaluation", + ), + "partition": ( + "_core/_primitives.py:partition_samples", + "radix-sort-based partitioning, sorted index schemes", + ), + "gradient_compute": ( + "_loss.py loss functions", + "fused GPU kernels, avoiding CPU-GPU copies for custom losses", + ), + "prediction_update": ( + "_models/_boosting.py prediction update loop", + "fusing tree traversal + add, batching prediction updates", + ), + "leaf_values": ( + "_core/_primitives.py:compute_leaf_values", + "GPU reduction kernel, batch leaf computation", + ), + "tree_overhead": ( + "_core/_tree.py:fit_tree, _core/_growth.py:LevelWiseGrowth.grow", + "reduce Python overhead in growth loop, minimize object allocation", + ), + "grad_pred_loss": ( + "_models/_boosting.py training loop (loss_fn, tree predict, loss eval)", + "fuse gradient+prediction, skip loss eval when no callbacks need it", + ), +} + + +# ============================================================================= +# Hardware info +# ============================================================================= + +def _collect_hardware_info() -> dict: + info: dict[str, Any] = { + "cpu": platform.processor() or platform.machine(), + "cpu_cores": os.cpu_count(), + "ram_gb": None, + "gpu": None, + "gpu_memory_gb": None, + } + if platform.system() == "Darwin": + try: + result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, text=True, timeout=5, + ) + info["ram_gb"] = round(int(result.stdout.strip()) / (1024**3), 1) + except Exception: + 
pass + elif platform.system() == "Linux": + try: + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal"): + info["ram_gb"] = round(int(line.split()[1]) / (1024**2), 1) + break + except Exception: + pass + try: + from numba import cuda + if cuda.is_available(): + dev = cuda.get_current_device() + info["gpu"] = dev.name.decode() if isinstance(dev.name, bytes) else str(dev.name) + except Exception: + pass + return info + + +def _get_git_sha() -> str | None: + try: + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=5, + ) + return result.stdout.strip() if result.returncode == 0 else None + except Exception: + return None + + +# ============================================================================= +# Profiling callback +# ============================================================================= + +_PRIMITIVES_TO_WRAP = [ + "build_node_histograms", + "find_node_splits", + "partition_samples", + "compute_leaf_values", +] + +_PHASE_NAMES = { + "build_node_histograms": "histogram_build", + "find_node_splits": "split_find", + "partition_samples": "partition", + "compute_leaf_values": "leaf_values", +} + +# Modules where fit_tree is imported and needs wrapping +_FIT_TREE_MODULES = [ + "openboost._core._tree", + "openboost._models._boosting", +] + + +class ProfilingCallback(Callback): + """Profile training phases and produce structured JSON reports. + + Wraps core primitive functions with timers during training to measure + per-phase time breakdown. Writes a JSON report to output_dir on completion. + + Args: + output_dir: Directory for profile JSON files. Created if missing. + compare_last: If True, compare with the most recent previous profile. 
+ """ + + def __init__(self, output_dir: str = "logs/", compare_last: bool = True): + self.output_dir = Path(output_dir) + self.compare_last = compare_last + + self._timers: dict[str, PhaseTimer] = {} + self._tree_timers: list[dict[str, float]] = [] + self._round_start: float = 0.0 + self._round_phase_snapshot: dict[str, float] = {} + self._train_start: float = 0.0 + self._originals: dict[str, Any] = {} + self._use_cuda: bool = False + + self.report_path: Path | None = None + self.report: dict | None = None + + def _get_timer(self, name: str) -> PhaseTimer: + if name not in self._timers: + self._timers[name] = PhaseTimer(name, use_cuda=self._use_cuda) + return self._timers[name] + + # ----- wrapping / unwrapping ----- + + def _wrap_primitives(self) -> None: + import sys + import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + + # Wrap the 4 core primitives + for func_name in _PRIMITIVES_TO_WRAP: + original = getattr(prims_mod, func_name) + self._originals[("prim", func_name)] = original + phase_name = _PHASE_NAMES[func_name] + timer = self._get_timer(phase_name) + + def make_wrapper(orig, tmr): + def wrapper(*args, **kwargs): + tmr.start() + result = orig(*args, **kwargs) + tmr.stop() + return result + return wrapper + + wrapped = make_wrapper(original, timer) + setattr(prims_mod, func_name, wrapped) + if hasattr(growth_mod, func_name): + setattr(growth_mod, func_name, wrapped) + + # Wrap fit_tree to capture total tree-building time (includes orchestration) + fit_tree_timer = self._get_timer("fit_tree") + for mod_name in _FIT_TREE_MODULES: + mod = sys.modules.get(mod_name) + if mod and hasattr(mod, "fit_tree"): + original_ft = getattr(mod, "fit_tree") + self._originals[("fit_tree", mod_name)] = original_ft + wrapped_ft = make_wrapper(original_ft, fit_tree_timer) + setattr(mod, "fit_tree", wrapped_ft) + + def _unwrap_primitives(self) -> None: + import sys + import openboost._core._primitives as prims_mod + import 
openboost._core._growth as growth_mod + + for key, original in self._originals.items(): + kind, name = key + if kind == "prim": + setattr(prims_mod, name, original) + if hasattr(growth_mod, name): + setattr(growth_mod, name, original) + elif kind == "fit_tree": + mod = sys.modules.get(name) + if mod: + setattr(mod, "fit_tree", original) + self._originals.clear() + + # ----- callback hooks ----- + + def on_train_begin(self, state: TrainingState) -> None: + from ._backends import is_cuda + self._use_cuda = is_cuda() + self._timers.clear() + self._tree_timers.clear() + self._wrap_primitives() + self._train_start = time.perf_counter() + + def on_round_begin(self, state: TrainingState) -> None: + self._round_start = time.perf_counter() + self._round_phase_snapshot = { + name: timer.total for name, timer in self._timers.items() + } + + def on_round_end(self, state: TrainingState) -> bool: + round_total = time.perf_counter() - self._round_start + tree_entry: dict[str, float] = {"round": state.round_idx, "total_s": round_total} + for name, timer in self._timers.items(): + prev = self._round_phase_snapshot.get(name, 0.0) + tree_entry[f"{name}_s"] = round(timer.total - prev, 6) + # Compute per-tree derived phases + ft = tree_entry.get("fit_tree_s", 0) + prims = sum(tree_entry.get(f"{p}_s", 0) for p in + ("histogram_build", "split_find", "partition", "leaf_values")) + tree_entry["tree_overhead_s"] = round(max(0.0, ft - prims), 6) + tree_entry["grad_pred_loss_s"] = round(max(0.0, round_total - ft), 6) + self._tree_timers.append(tree_entry) + return True + + def on_train_end(self, state: TrainingState) -> None: + total_time = time.perf_counter() - self._train_start + self._unwrap_primitives() + + # Compute derived phases from per-tree data + # round_total = gradient_compute + fit_tree + prediction_update + loss_eval + # fit_tree = primitives + orchestration_overhead + total_round_time = sum(t["total_s"] for t in self._tree_timers) + fit_tree_total = 
self._timers["fit_tree"].total if "fit_tree" in self._timers else 0 + # Time outside fit_tree but inside rounds = grad compute + pred update + loss eval + outside_tree = max(0.0, total_round_time - fit_tree_total) + # Primitives total + prims_total = sum( + self._timers[p].total for p in ("histogram_build", "split_find", "partition", "leaf_values") + if p in self._timers + ) + # Orchestration = fit_tree - primitives (Python overhead in growth strategies) + orchestration = max(0.0, fit_tree_total - prims_total) + + # Build phases dict (show the most useful breakdown) + phases = {} + for name in ("histogram_build", "split_find", "partition", "leaf_values"): + if name in self._timers: + phases[name] = self._timers[name].to_dict(total_time) + # Add fit_tree orchestration overhead + if orchestration > 0: + n_trees = self._timers["fit_tree"].count if "fit_tree" in self._timers else 0 + phases["tree_overhead"] = { + "total_s": round(orchestration, 6), + "pct": round(100 * orchestration / total_time, 2) if total_time > 0 else 0, + "calls": n_trees, + "mean_s": round(orchestration / n_trees, 6) if n_trees > 0 else 0, + } + # Add outside-tree time (gradient + prediction + loss eval) + if outside_tree > 0: + n_rounds = len(self._tree_timers) + phases["grad_pred_loss"] = { + "total_s": round(outside_tree, 6), + "pct": round(100 * outside_tree / total_time, 2) if total_time > 0 else 0, + "calls": n_rounds, + "mean_s": round(outside_tree / n_rounds, 6) if n_rounds > 0 else 0, + } + # Other: time outside the training loop entirely (setup, teardown) + accounted = fit_tree_total + outside_tree + other_time = max(0.0, total_time - accounted) + if other_time > 0.001: + phases["other"] = { + "total_s": round(other_time, 6), + "pct": round(100 * other_time / total_time, 2) if total_time > 0 else 0, + "calls": None, + "mean_s": None, + } + + # Bottlenecks: top 3 phases by pct (excluding "other") + ranked = sorted( + [(name, data) for name, data in phases.items() if name != "other"], + 
key=lambda x: x[1]["pct"], + reverse=True, + ) + bottlenecks = [] + for rank, (phase, data) in enumerate(ranked[:3], 1): + target, rec = PHASE_RECOMMENDATIONS.get(phase, ("unknown", "investigate")) + bottlenecks.append({ + "rank": rank, + "phase": phase, + "pct": data["pct"], + "target": target, + "recommendation": rec, + }) + + # Dataset / model info + model = state.model + n_trees_actual = len(getattr(model, "trees_", [])) + dataset_info = { + "n_samples": (model.X_binned_.n_samples + if getattr(model, "X_binned_", None) else None), + "n_features": getattr(model, "n_features_in_", None), + "n_trees": n_trees_actual, + "max_depth": getattr(model, "max_depth", None), + "learning_rate": getattr(model, "learning_rate", None), + "loss": str(getattr(model, "loss", None)), + "backend": "cuda" if self._use_cuda else "cpu", + } + + report = { + "version": "1.0", + "timestamp": datetime.now(timezone.utc).isoformat(), + "git_sha": _get_git_sha(), + "hardware": _collect_hardware_info(), + "dataset": dataset_info, + "total_time_s": round(total_time, 6), + "phases": phases, + "per_tree": self._tree_timers, + "bottlenecks": bottlenecks, + } + + # Comparison with previous run + self.output_dir.mkdir(parents=True, exist_ok=True) + if self.compare_last: + comparison = self._compare_with_previous(report) + if comparison: + report["comparison"] = comparison + + # Write report + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + self.report_path = self.output_dir / f"profile_{ts}.json" + with open(self.report_path, "w") as f: + json.dump(report, f, indent=2) + + self.report = report + + # ----- comparison ----- + + def _compare_with_previous(self, current: dict) -> dict | None: + existing = sorted(self.output_dir.glob("profile_*.json")) + if not existing: + return None + prev_path = existing[-1] + try: + with open(prev_path) as f: + prev = json.load(f) + except (json.JSONDecodeError, OSError): + return None + + prev_total = prev.get("total_time_s", 0) + cur_total = 
current["total_time_s"] + + comparison: dict[str, Any] = { + "previous_run": str(prev_path), + "delta_total_pct": _pct_delta(prev_total, cur_total), + "phase_deltas": {}, + } + for phase in current["phases"]: + if phase in prev.get("phases", {}): + prev_s = prev["phases"][phase]["total_s"] + cur_s = current["phases"][phase]["total_s"] + comparison["phase_deltas"][phase] = { + "previous_s": prev_s, + "current_s": cur_s, + "delta_pct": _pct_delta(prev_s, cur_s), + } + return comparison + + +def _pct_delta(old: float, new: float) -> float: + if old == 0: + return 0.0 + return round(100 * (new - old) / old, 2) + + +# ============================================================================= +# Summary printer (machine-readable for improvement loops) +# ============================================================================= + +def print_profile_summary(report: dict) -> None: + """Print a machine-readable summary of a profile report.""" + print("=== PROFILE SUMMARY ===") + print(f"TOTAL: {report['total_time_s']:.2f}s") + print(f"BACKEND: {report['dataset'].get('backend', 'unknown')}") + + if report.get("bottlenecks"): + top = report["bottlenecks"][0] + print(f"TOP BOTTLENECK: {top['phase']} ({top['pct']}%)") + print(f"TARGET: {top['target']}") + print(f"RECOMMENDATION: {top['recommendation']}") + + if report.get("comparison"): + comp = report["comparison"] + delta = comp["delta_total_pct"] + sign = "+" if delta > 0 else "" + print(f"DELTA vs PREVIOUS: {sign}{delta}% total") + for phase, pd in comp.get("phase_deltas", {}).items(): + if abs(pd["delta_pct"]) >= 5: + s = "+" if pd["delta_pct"] > 0 else "" + print(f" {phase}: {s}{pd['delta_pct']}%") + else: + print("DELTA vs PREVIOUS: (no previous run)") + + print(f"REPORT: {report.get('_path', 'N/A')}") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..13bf9a4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,188 @@ +"""Shared test fixtures for OpenBoost. 
+ +Centralizes dataset generation, pre-binned arrays, and gradient fixtures +to eliminate duplication across test files. All data fixtures use explicit +RandomState objects (not np.random.seed) to avoid cross-test contamination. +""" + +import numpy as np +import pytest + +import openboost as ob + +# ============================================================================= +# CUDA detection and auto-skip +# ============================================================================= + +try: + from numba import cuda + CUDA_AVAILABLE = cuda.is_available() +except Exception: + CUDA_AVAILABLE = False + + +def pytest_collection_modifyitems(config, items): + """Auto-skip GPU and parity tests when CUDA is unavailable.""" + if not CUDA_AVAILABLE: + skip_gpu = pytest.mark.skip(reason="CUDA not available") + for item in items: + if "gpu" in item.keywords or "parity" in item.keywords: + item.add_marker(skip_gpu) + + +# ============================================================================= +# Regression datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def regression_100x5(): + """Small regression dataset: 100 samples, 5 features, linear target. + + y = X[:,0] + 0.5 * X[:,1] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] + rng.randn(100).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def regression_200x10(): + """Medium regression dataset: 200 samples, 10 features, linear target. + + y = X[:,0] + 0.5 * X[:,1] - 0.3 * X[:,2] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(200, 10).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + rng.randn(200).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def regression_500x10(): + """Larger regression dataset: 500 samples, 10 features. 
+ + y = X[:,0] + 0.5 * X[:,1] - 0.3 * X[:,2] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(500, 10).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + rng.randn(500).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +# ============================================================================= +# Classification datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def binary_500x10(): + """Binary classification dataset: 500 samples, 10 features. + + Labels derived from a linear boundary on first two features. + """ + rng = np.random.RandomState(42) + X = rng.randn(500, 10).astype(np.float32) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (logits > 0).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def multiclass_300x5(): + """3-class classification dataset: 300 samples, 5 features.""" + rng = np.random.RandomState(42) + X = rng.randn(300, 5).astype(np.float32) + # 3 classes based on which of 3 linear combos is largest + scores = np.column_stack([X[:, 0], X[:, 1], X[:, 2]]) + y = scores.argmax(axis=1).astype(np.float32) + return X, y + + +# ============================================================================= +# Specialized datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def count_data_200x5(): + """Poisson count data: 200 samples, 5 features. + + y ~ Poisson(exp(0.5 * X[:,0] + 0.3 * X[:,1])) + """ + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + rate = np.exp(0.5 * X[:, 0] + 0.3 * X[:, 1]) + y = rng.poisson(rate).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def positive_continuous_200x5(): + """Positive continuous data for Gamma/Tweedie: 200 samples, 5 features. 
+
+    y = exp(0.5 * X[:,0] + 0.3 * X[:,1]) + noise
+    """
+    rng = np.random.RandomState(42)
+    X = rng.randn(200, 5).astype(np.float32)
+    y = (np.exp(0.5 * X[:, 0] + 0.3 * X[:, 1]) + rng.exponential(0.1, 200)).astype(np.float32)
+    return X, y
+
+
+# =============================================================================
+# Pre-binned datasets
+# =============================================================================
+
+@pytest.fixture(scope="session")
+def binned_100x5(regression_100x5):
+    """Pre-binned version of regression_100x5."""
+    X, y = regression_100x5
+    return ob.array(X), y
+
+
+@pytest.fixture(scope="session")
+def binned_200x10(regression_200x10):
+    """Pre-binned version of regression_200x10."""
+    X, y = regression_200x10
+    return ob.array(X), y
+
+
+# =============================================================================
+# Gradient/hessian fixtures
+# =============================================================================
+
+@pytest.fixture
+def mse_grads_100(regression_100x5):
+    """MSE gradients from zero predictions for regression_100x5.
+
+    Returns (grad, hess) with grad = 2*(0-y) = -2y, hess = 2.
+    """
+    _, y = regression_100x5
+    pred = np.zeros(100, dtype=np.float32)
+    grad = (2 * (pred - y)).astype(np.float32)
+    hess = np.ones(100, dtype=np.float32) * 2
+    return grad, hess
+
+
+@pytest.fixture
+def mse_grads_200(regression_200x10):
+    """MSE gradients from zero predictions for regression_200x10."""
+    _, y = regression_200x10
+    pred = np.zeros(200, dtype=np.float32)
+    grad = (2 * (pred - y)).astype(np.float32)
+    hess = np.ones(200, dtype=np.float32) * 2
+    return grad, hess
+
+
+# =============================================================================
+# Pre-fitted model fixtures
+# =============================================================================
+
+@pytest.fixture
+def fitted_regressor(regression_500x10):
+    """Pre-fitted GradientBoosting regressor (20 trees, max_depth=4).
+ + Function-scoped: fresh model for each test. + """ + X, y = regression_500x10 + model = ob.GradientBoosting(n_trees=20, max_depth=4, learning_rate=0.1) + model.fit(X, y) + return model, X, y diff --git a/tests/test_binning_correctness.py b/tests/test_binning_correctness.py new file mode 100644 index 0000000..b116338 --- /dev/null +++ b/tests/test_binning_correctness.py @@ -0,0 +1,170 @@ +"""BinnedArray correctness tests for OpenBoost. + +Verifies that data binning (quantization to uint8) is correct, +consistent between fit and transform, and handles edge cases. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestBinningConsistency: + """Verify that binning is consistent between fit and transform.""" + + def test_transform_matches_training_bins(self): + """Re-binning training data with transform should reproduce original bins.""" + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + + binned = ob.array(X) + # Transform the same data using training bin edges + re_binned = binned.transform(X) + + np.testing.assert_array_equal( + binned.data, re_binned.data, + err_msg="Re-binning training data should produce identical bins" + ) + + def test_transform_preserves_shape(self): + """Transform output should have correct shape.""" + rng = np.random.RandomState(42) + X_train = rng.randn(100, 5).astype(np.float32) + X_test = rng.randn(50, 5).astype(np.float32) + + binned = ob.array(X_train) + test_binned = binned.transform(X_test) + + assert test_binned.n_samples == 50 + assert test_binned.n_features == 5 + assert test_binned.data.shape == (5, 50) + assert test_binned.data.dtype == np.uint8 + + def test_transform_out_of_range_values(self): + """Values outside training range should be clipped to valid bins.""" + X_train = np.array([[0.0], [1.0], [2.0], [3.0]], dtype=np.float32) + X_test = np.array([[-10.0], [10.0]], dtype=np.float32) + + binned = ob.array(X_train) + test_binned = binned.transform(X_test) + + # Should be 
valid bin values (not 255 since no NaN) + assert np.all(test_binned.data < 255), "Out-of-range values should not be missing bin" + assert np.all(test_binned.data >= 0), "Bins should be non-negative" + + +class TestBinEdges: + """Verify bin edge properties.""" + + def test_bin_edges_monotonic(self): + """Bin edges should be strictly increasing per feature.""" + rng = np.random.RandomState(42) + X = rng.randn(500, 5).astype(np.float32) + binned = ob.array(X) + + for feat_idx, edges in enumerate(binned.bin_edges): + edges_arr = np.array(edges) + if len(edges_arr) > 1: + diffs = np.diff(edges_arr) + assert np.all(diffs > 0), ( + f"Feature {feat_idx}: bin edges not strictly increasing" + ) + + def test_bin_count_respects_max(self): + """Number of bins should not exceed 254 (bin 255 is reserved).""" + rng = np.random.RandomState(42) + X = rng.randn(1000, 3).astype(np.float32) + binned = ob.array(X) + + # No sample should have bin 255 (no NaN in this data) + assert np.max(binned.data) < 255, "Max bin should be < 255 when no NaN" + + +class TestMissingValues: + """Verify NaN handling in binning.""" + + def test_nan_maps_to_missing_bin(self): + """NaN values should be binned as 255 (MISSING_BIN).""" + X = np.array([ + [1.0, np.nan], + [2.0, 3.0], + [np.nan, 4.0], + ], dtype=np.float32) + + binned = ob.array(X) + + # Feature 0, sample 2 should be 255 + assert binned.data[0, 2] == 255, "NaN in feature 0, sample 2 should be bin 255" + # Feature 1, sample 0 should be 255 + assert binned.data[1, 0] == 255, "NaN in feature 1, sample 0 should be bin 255" + # Non-NaN values should not be 255 + assert binned.data[0, 0] != 255, "Non-NaN value should not be bin 255" + assert binned.data[0, 1] != 255, "Non-NaN value should not be bin 255" + + def test_no_nan_means_no_missing_bin(self): + """Without NaN, no sample should land in bin 255.""" + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + + binned = ob.array(X) + + assert np.all(binned.data != 255), "No bin 
255 when no NaN in data" + + def test_all_nan_feature(self): + """A feature that is all NaN should have all samples in bin 255.""" + X = np.array([ + [1.0, np.nan], + [2.0, np.nan], + [3.0, np.nan], + ], dtype=np.float32) + + binned = ob.array(X) + + assert np.all(binned.data[1, :] == 255), "All-NaN feature should be all bin 255" + + +class TestBinningEdgeCases: + """Edge cases for binning.""" + + def test_constant_feature(self): + """Constant feature should produce valid binning.""" + X = np.ones((50, 2), dtype=np.float32) + X[:, 1] = np.arange(50, dtype=np.float32) # Feature 1 varies + + binned = ob.array(X) + + # Feature 0 (constant) should have all samples in the same bin + unique_bins = np.unique(binned.data[0, :]) + assert len(unique_bins) == 1, f"Constant feature should have 1 bin, got {len(unique_bins)}" + + def test_two_unique_values(self): + """Two distinct values should produce two bins.""" + X = np.array([[0.0], [0.0], [1.0], [1.0]], dtype=np.float32) + + binned = ob.array(X) + + unique_bins = np.unique(binned.data[0, :]) + assert len(unique_bins) == 2, f"Two values should produce 2 bins, got {len(unique_bins)}" + + def test_very_large_values(self): + """Large values should not cause overflow.""" + X = np.array([[1e10, -1e10], [1e15, -1e15]], dtype=np.float32) + + binned = ob.array(X) + + assert binned.data.dtype == np.uint8 + assert np.all(np.isfinite(binned.data.astype(float))) + + def test_single_sample(self): + """Single sample should bin correctly.""" + X = np.array([[1.0, 2.0, 3.0]], dtype=np.float32) + + binned = ob.array(X) + + assert binned.n_samples == 1 + assert binned.n_features == 3 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py new file mode 100644 index 0000000..955b4d7 --- /dev/null +++ b/tests/test_callbacks.py @@ -0,0 +1,193 @@ +"""Tests for the callback system. 
+ +Verifies EarlyStopping, Logger, ModelCheckpoint, LearningRateScheduler, +HistoryCallback, and custom callbacks work correctly. +""" + +import pytest + +import openboost as ob + + +class TestEarlyStopping: + """Tests for EarlyStopping callback.""" + + def test_stops_when_val_loss_plateaus(self, regression_500x10): + """Training should stop before n_trees when loss plateaus.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5) + model = ob.GradientBoosting(n_trees=1000, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es], eval_set=[(X_val, y_val)]) + + # Should have stopped early (well before 1000 trees) + assert len(model.trees_) < 1000, ( + f"Should stop early but trained all {len(model.trees_)} trees" + ) + + def test_patience_respected(self, regression_500x10): + """With higher patience, training should run longer.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es_short = ob.EarlyStopping(patience=3) + model_short = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model_short.fit(X, y, callbacks=[es_short], eval_set=[(X_val, y_val)]) + + es_long = ob.EarlyStopping(patience=20) + model_long = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model_long.fit(X, y, callbacks=[es_long], eval_set=[(X_val, y_val)]) + + # Longer patience should train at least as many trees + assert len(model_long.trees_) >= len(model_short.trees_) + + def test_restore_best(self, regression_500x10): + """With restore_best=True, model should use best iteration's trees.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5, restore_best=True) + model = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es], eval_set=[(X_val, y_val)]) + + # If early stopping fired, best_iteration should be set + if hasattr(model, 'best_iteration_') and model.best_iteration_ is not None: + assert 
len(model.trees_) <= model.best_iteration_ + 1 + 5 + + +class TestHistoryCallback: + """Tests for HistoryCallback.""" + + def test_records_train_loss(self, regression_200x10): + """Should record training loss each round.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history]) + + assert 'train_loss' in history.history + assert len(history.history['train_loss']) == 10 + + def test_records_val_loss(self, regression_200x10): + """Should record validation loss when eval_set provided.""" + X, y = regression_200x10 + X_val, y_val = X[:50], y[:50] + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history], eval_set=[(X_val, y_val)]) + + assert 'val_loss' in history.history + assert len(history.history['val_loss']) == 10 + + def test_train_loss_decreases(self, regression_200x10): + """Recorded training loss should generally decrease.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=20, max_depth=3, learning_rate=0.1) + model.fit(X, y, callbacks=[history]) + + losses = history.history['train_loss'] + # First loss should be > last loss + assert losses[-1] < losses[0], ( + f"Training loss should decrease: first={losses[0]:.4f}, last={losses[-1]:.4f}" + ) + + +class TestLoggerCallback: + """Tests for Logger callback.""" + + def test_logger_does_not_crash(self, regression_100x5, capsys): + """Logger should print without crashing.""" + X, y = regression_100x5 + + logger = ob.Logger(period=5) + model = ob.GradientBoosting(n_trees=10, max_depth=2) + model.fit(X, y, callbacks=[logger]) + + # Just verify it didn't crash — output format may vary + + +class TestMultipleCallbacks: + """Tests for using multiple callbacks together.""" + + def test_early_stopping_and_history(self, regression_500x10): + """EarlyStopping + HistoryCallback should work together.""" + X, 
y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5) + history = ob.HistoryCallback() + + model = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es, history], eval_set=[(X_val, y_val)]) + + # History should only have as many entries as rounds trained + n_trained = len(model.trees_) + assert len(history.history['train_loss']) == n_trained + + def test_all_callbacks_together(self, regression_200x10): + """Multiple callbacks should all receive events.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + logger = ob.Logger(period=100) # Don't spam output + + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history, logger]) + + assert len(history.history['train_loss']) == 10 + + +class TestCustomCallback: + """Tests for custom callback classes.""" + + def test_custom_callback_receives_events(self, regression_100x5): + """Custom callback should receive on_train_begin and on_round_end.""" + class EventTracker(ob.Callback): + def __init__(self): + self.began = False + self.round_count = 0 + self.ended = False + + def on_train_begin(self, state): + self.began = True + + def on_round_end(self, state): + self.round_count += 1 + return True + + def on_train_end(self, state): + self.ended = True + + X, y = regression_100x5 + tracker = EventTracker() + + model = ob.GradientBoosting(n_trees=5, max_depth=2) + model.fit(X, y, callbacks=[tracker]) + + assert tracker.began, "on_train_begin should be called" + assert tracker.round_count == 5, f"on_round_end called {tracker.round_count} times, expected 5" + assert tracker.ended, "on_train_end should be called" + + def test_custom_callback_can_stop_training(self, regression_100x5): + """Custom callback returning False should stop training.""" + class StopAtThree(ob.Callback): + def on_round_end(self, state): + return state.round_idx < 2 # Stop after 3 rounds (0, 1, 2) + + X, y = regression_100x5 + + 
model = ob.GradientBoosting(n_trees=100, max_depth=2) + model.fit(X, y, callbacks=[StopAtThree()]) + + assert len(model.trees_) <= 3, ( + f"Should stop at 3 trees, got {len(model.trees_)}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_core.py b/tests/test_core.py index 23ab40e..a7ce7d6 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,7 @@ """Core tests for OpenBoost. These tests run on CPU (for Mac development) and verify basic functionality. +Uses shared fixtures from conftest.py for data generation. """ import numpy as np @@ -11,125 +12,110 @@ class TestArray: """Tests for ob.array() and binning.""" - - def test_basic_binning(self): + + def test_basic_binning(self, regression_100x5): """Test that array() bins data correctly.""" - np.random.seed(42) - X = np.random.randn(100, 5) - + X, _ = regression_100x5 binned = ob.array(X, n_bins=256) - + assert binned.n_samples == 100 assert binned.n_features == 5 assert binned.data.shape == (5, 100) # Feature-major assert binned.data.dtype == np.uint8 - + def test_bin_range(self): """Test that bin values are in valid range.""" - X = np.random.randn(1000, 10) + rng = np.random.RandomState(99) + X = rng.randn(1000, 10) binned = ob.array(X) - + assert binned.data.min() >= 0 assert binned.data.max() <= 255 - + def test_bin_edges_stored(self): """Test that bin edges are stored for inverse transform.""" - X = np.random.randn(100, 3) + rng = np.random.RandomState(99) + X = rng.randn(100, 3) binned = ob.array(X) - + assert len(binned.bin_edges) == 3 for edges in binned.bin_edges: assert len(edges) > 0 # At least some bin edges - + def test_n_bins_capped_at_255(self): - """Test that n_bins > 255 is capped (Phase 14: bin 255 reserved for NaN).""" - X = np.random.randn(10, 2) - - # n_bins > 255 should be silently capped to 255 - # (bin 255 is reserved for missing values) + """Test that n_bins > 255 is capped (bin 255 reserved for NaN).""" + rng = np.random.RandomState(99) 
+ X = rng.randn(10, 2) + binned = ob.array(X, n_bins=300) - + # Data should not contain bin 255 (no NaN in this data) assert np.max(binned.data) < 255 - + def test_invalid_shape(self): """Test that 1D input raises error.""" - X = np.random.randn(100) - + rng = np.random.RandomState(99) + X = rng.randn(100) + with pytest.raises(ValueError, match="must be 2D"): ob.array(X) class TestFitTree: """Tests for ob.fit_tree().""" - - def test_basic_fit(self): + + def test_basic_fit(self, binned_100x5, mse_grads_100): """Test basic tree fitting.""" - np.random.seed(42) - X = np.random.randn(100, 5) - y = X[:, 0] + 0.5 * X[:, 1] # Simple linear target - - binned = ob.array(X) - - # MSE gradients - pred = np.zeros(100, dtype=np.float32) - grad = (2 * (pred - y)).astype(np.float32) - hess = np.ones(100, dtype=np.float32) * 2 - + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + tree = ob.fit_tree(binned, grad, hess, max_depth=3) - + assert tree.n_nodes > 0 assert tree.depth <= 3 - - def test_tree_prediction(self): + + def test_tree_prediction(self, regression_100x5): """Test that tree predictions work.""" - np.random.seed(42) - X = np.random.randn(100, 5) - y = X[:, 0] # Simple target - + X, y = regression_100x5 binned = ob.array(X) - + pred = np.zeros(100, dtype=np.float32) grad = (2 * (pred - y)).astype(np.float32) hess = np.ones(100, dtype=np.float32) * 2 - + tree = ob.fit_tree(binned, grad, hess, max_depth=4) predictions = tree(binned) - + assert predictions.shape == (100,) assert predictions.dtype == np.float32 - - def test_training_reduces_loss(self): + + def test_training_reduces_loss(self, regression_200x10): """Test that multiple rounds reduce loss.""" - np.random.seed(42) - X = np.random.randn(200, 10) - y = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] - y = y.astype(np.float32) - + X, y = regression_200x10 binned = ob.array(X) pred = np.zeros(200, dtype=np.float32) - + initial_loss = np.mean((pred - y) ** 2) - - # Train for a few rounds + for _ in range(10): grad = 
(2 * (pred - y)).astype(np.float32) hess = np.ones(200, dtype=np.float32) * 2 tree = ob.fit_tree(binned, grad, hess, max_depth=4) pred = pred + 0.3 * tree(binned) - + final_loss = np.mean((pred - y) ** 2) - + assert final_loss < initial_loss, f"Loss should decrease: {final_loss} < {initial_loss}" - + def test_max_depth_respected(self): """Test that max_depth is respected.""" - X = np.random.randn(100, 5) + rng = np.random.RandomState(99) + X = rng.randn(100, 5) binned = ob.array(X) - - grad = np.random.randn(100).astype(np.float32) + + grad = rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) - + for depth in [1, 2, 3, 5]: tree = ob.fit_tree(binned, grad, hess, max_depth=depth) assert tree.depth <= depth, f"Tree depth {tree.depth} > max_depth {depth}" @@ -137,12 +123,12 @@ def test_max_depth_respected(self): class TestBackend: """Tests for backend detection and dispatch.""" - + def test_get_backend(self): """Test that get_backend returns valid value.""" backend = ob.get_backend() assert backend in ("cuda", "cpu") - + def test_set_backend_cpu(self): """Test forcing CPU backend.""" original = ob.get_backend() @@ -152,13 +138,11 @@ def test_set_backend_cpu(self): assert ob.is_cpu() assert not ob.is_cuda() finally: - # Restore if original == "cuda": - try: + import contextlib + with contextlib.suppress(RuntimeError): ob.set_backend("cuda") - except RuntimeError: - pass # CUDA not available - + def test_invalid_backend(self): """Test that invalid backend raises error.""" with pytest.raises(ValueError, match="must be 'cuda' or 'cpu'"): @@ -167,122 +151,115 @@ def test_invalid_backend(self): class TestEdgeCases: """Tests for edge cases.""" - + def test_constant_feature(self): """Test handling of constant features.""" - X = np.random.randn(100, 3) + rng = np.random.RandomState(99) + X = rng.randn(100, 3) X[:, 1] = 5.0 # Constant feature - + binned = ob.array(X) - - # Should still work - grad = np.random.randn(100).astype(np.float32) + + grad = 
rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) tree = ob.fit_tree(binned, grad, hess, max_depth=3) - + assert tree.n_nodes > 0 - + def test_small_dataset(self): """Test with very small dataset.""" - X = np.random.randn(10, 2) + rng = np.random.RandomState(99) + X = rng.randn(10, 2) binned = ob.array(X) - - grad = np.random.randn(10).astype(np.float32) + + grad = rng.randn(10).astype(np.float32) hess = np.ones(10, dtype=np.float32) - + tree = ob.fit_tree(binned, grad, hess, max_depth=2) pred = tree(binned) - + assert pred.shape == (10,) - + def test_single_feature(self): """Test with single feature.""" - X = np.random.randn(100, 1) + rng = np.random.RandomState(99) + X = rng.randn(100, 1) binned = ob.array(X) - - grad = np.random.randn(100).astype(np.float32) + + grad = rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) - + tree = ob.fit_tree(binned, grad, hess, max_depth=3) pred = tree(binned) - + assert pred.shape == (100,) class TestGradientBoosting: """Tests for high-level GradientBoosting API.""" - + def test_import(self): """Test that openboost can be imported.""" assert hasattr(ob, "GradientBoosting") assert ob.__version__ == "1.0.0rc1" - - def test_fit_predict_small(self): + + def test_fit_predict_small(self, regression_100x5): """Test fit/predict on small data.""" - np.random.seed(42) - X = np.random.randn(100, 5).astype(np.float32) - y = np.random.randn(100).astype(np.float32) - + X, y = regression_100x5 + model = ob.GradientBoosting(n_trees=3, max_depth=3) model.fit(X, y) - + pred = model.predict(X) - + assert pred.shape == y.shape assert pred.dtype == np.float32 - - def test_mse_decreases(self): + + def test_mse_decreases(self, regression_200x10): """Test that MSE decreases with more trees.""" - np.random.seed(42) - X = np.random.randn(200, 10).astype(np.float32) - y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(200).astype(np.float32) * 0.1 - + X, y = regression_200x10 + model_few = 
ob.GradientBoosting(n_trees=2, max_depth=3) model_few.fit(X, y) mse_few = np.mean((model_few.predict(X) - y) ** 2) - + model_many = ob.GradientBoosting(n_trees=10, max_depth=3) model_many.fit(X, y) mse_many = np.mean((model_many.predict(X) - y) ** 2) - + assert mse_many < mse_few, f"More trees should reduce MSE: {mse_many} >= {mse_few}" - - def test_learning_rate_effect(self): + + def test_learning_rate_effect(self, regression_100x5): """Test that lower learning rate requires more trees.""" - np.random.seed(42) - X = np.random.randn(100, 5).astype(np.float32) - y = X[:, 0] * 2 + np.random.randn(100).astype(np.float32) * 0.1 - - # High LR, few trees + X, y = regression_100x5 + model_high_lr = ob.GradientBoosting(n_trees=5, max_depth=3, learning_rate=0.5) model_high_lr.fit(X, y) pred_high = model_high_lr.predict(X) - - # Low LR, few trees - should fit worse + model_low_lr = ob.GradientBoosting(n_trees=5, max_depth=3, learning_rate=0.01) model_low_lr.fit(X, y) pred_low = model_low_lr.predict(X) - + mse_high = np.mean((pred_high - y) ** 2) mse_low = np.mean((pred_low - y) ** 2) - - # With same trees, high LR should fit better (for small n_trees) + assert mse_high < mse_low - + def test_deterministic(self): """Test that results are deterministic.""" - np.random.seed(42) - X = np.random.randn(50, 3).astype(np.float32) - y = np.random.randn(50).astype(np.float32) - + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + model1 = ob.GradientBoosting(n_trees=3, max_depth=2) model1.fit(X, y) pred1 = model1.predict(X) - + model2 = ob.GradientBoosting(n_trees=3, max_depth=2) model2.fit(X, y) pred2 = model2.predict(X) - + np.testing.assert_array_equal(pred1, pred2) diff --git a/tests/test_gam.py b/tests/test_gam.py new file mode 100644 index 0000000..a9fe1e6 --- /dev/null +++ b/tests/test_gam.py @@ -0,0 +1,192 @@ +"""Tests for OpenBoostGAM model. 
+ +Verifies that the GPU-accelerated Generalized Additive Model works +correctly on CPU. These are the first CPU tests for this model variant. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestGAMBasic: + """Basic functionality tests.""" + + def test_basic_fit_predict(self, regression_200x10): + """Fit and predict should produce correct shapes.""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=50, learning_rate=0.05, reg_lambda=1.0) + gam.fit(X, y) + pred = gam.predict(X) + + assert pred.shape == y.shape + assert pred.dtype == np.float32 + assert np.all(np.isfinite(pred)) + + def test_shape_values_shape(self, regression_200x10): + """shape_values_ should be (n_features, 256).""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam.fit(X, y) + + assert gam.shape_values_ is not None + assert gam.shape_values_.shape == (10, 256), ( + f"Expected shape (10, 256), got {gam.shape_values_.shape}" + ) + + def test_training_reduces_loss(self, regression_200x10): + """Training should reduce loss compared to baseline.""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=100, learning_rate=0.05) + gam.fit(X, y) + pred = gam.predict(X) + + mse = np.mean((pred - y) ** 2) + baseline_mse = np.var(y) + + assert mse < baseline_mse * 0.5, ( + f"GAM MSE ({mse:.4f}) should be well below baseline ({baseline_mse:.4f})" + ) + + def test_deterministic(self, regression_100x5): + """Same input should produce identical output.""" + X, y = regression_100x5 + + gam1 = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam1.fit(X, y) + pred1 = gam1.predict(X) + + gam2 = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam2.fit(X, y) + pred2 = gam2.predict(X) + + np.testing.assert_array_equal(pred1, pred2) + + +class TestGAMInterpretability: + """Verify GAM interpretability properties.""" + + def test_shape_functions_capture_correct_features(self): + """When y = f(X[:,0]), feature 0's shape 
function should be most active.""" + rng = np.random.RandomState(42) + X = rng.randn(300, 5).astype(np.float32) + y = np.sin(X[:, 0]).astype(np.float32) + + gam = ob.OpenBoostGAM(n_rounds=500, learning_rate=0.03) + gam.fit(X, y) + + # Feature 0 should have the largest shape function range + ranges = [np.ptp(gam.shape_values_[f]) for f in range(5)] + assert np.argmax(ranges) == 0, ( + f"Feature 0 should have largest range but ranges are: {ranges}" + ) + + def test_additive_prediction_structure(self, regression_100x5): + """Predictions should be sum of shape functions + base score.""" + X, y = regression_100x5 + + gam = ob.OpenBoostGAM(n_rounds=30, learning_rate=0.05) + gam.fit(X, y) + + # Get predictions the normal way + pred_normal = gam.predict(X) + + # Manually compute from shape functions + binned = gam.X_binned_ + binned_data = binned.data + if hasattr(binned_data, 'copy_to_host'): + binned_data = binned_data.copy_to_host() + binned_data = np.asarray(binned_data) + + base = getattr(gam, 'base_score_', np.float32(0.0)) + pred_manual = np.full(len(y), base, dtype=np.float32) + for f in range(X.shape[1]): + pred_manual += gam.shape_values_[f, binned_data[f, :]] + + np.testing.assert_allclose(pred_normal, pred_manual, atol=1e-5) + + +class TestGAMClassification: + """GAM with classification loss.""" + + def test_logloss(self, binary_500x10): + """GAM should work with logloss for binary classification.""" + X, y = binary_500x10 + + gam = ob.OpenBoostGAM(n_rounds=100, learning_rate=0.05, loss='logloss') + gam.fit(X, y) + pred_raw = gam.predict(X) + + # Convert to probabilities + prob = 1.0 / (1.0 + np.exp(-pred_raw)) + labels = (prob > 0.5).astype(float) + accuracy = np.mean(labels == y) + + assert accuracy > 0.70, f"GAM classification accuracy {accuracy:.3f} < 0.70" + + +class TestGAMEdgeCases: + """Edge cases for OpenBoostGAM.""" + + def test_predict_before_fit_raises(self): + """Predict on unfitted model should raise.""" + gam = ob.OpenBoostGAM(n_rounds=10) + rng 
= np.random.RandomState(42) + X = rng.randn(10, 3).astype(np.float32) + + with pytest.raises(RuntimeError, match="not fitted"): + gam.predict(X) + + def test_single_round(self): + """Should work with a single boosting round.""" + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + + gam = ob.OpenBoostGAM(n_rounds=1, learning_rate=0.1) + gam.fit(X, y) + pred = gam.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + def test_constant_target(self): + """GAM with constant target should predict that constant.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 3).astype(np.float32) + y = np.full(100, 2.5, dtype=np.float32) + + gam = ob.OpenBoostGAM(n_rounds=50, learning_rate=0.1) + gam.fit(X, y) + pred = gam.predict(X) + + np.testing.assert_allclose(pred, 2.5, atol=0.2, + err_msg="Should converge to constant target") + + +class TestGAMPersistence: + """Save/load functionality.""" + + def test_save_load_roundtrip(self, regression_100x5, tmp_path): + """Predictions should match after save/load.""" + X, y = regression_100x5 + + gam = ob.OpenBoostGAM(n_rounds=10, learning_rate=0.05) + gam.fit(X, y) + pred_before = gam.predict(X) + + path = str(tmp_path / "gam_model.json") + gam.save(path) + + loaded = ob.OpenBoostGAM.load(path) + pred_after = loaded.predict(X) + + np.testing.assert_array_equal(pred_before, pred_after) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_kernel_correctness.py b/tests/test_kernel_correctness.py new file mode 100644 index 0000000..8c7ea85 --- /dev/null +++ b/tests/test_kernel_correctness.py @@ -0,0 +1,475 @@ +"""Kernel-level correctness tests for OpenBoost. + +Verifies that the lowest-level computational kernels (histograms, split finding, +leaf values) produce correct results against hand-computed reference values. +These tests catch bugs in the core algorithms that affect all models. 
+""" + +import numpy as np +import pytest + +import openboost as ob +from openboost._core._split import compute_leaf_value, find_best_split + +# ============================================================================= +# Histogram Correctness +# ============================================================================= + + +class TestHistogramCorrectness: + """Verify histogram building produces correct aggregations.""" + + def test_histogram_sum_equals_gradient_sum(self, binned_100x5, mse_grads_100): + """Sum of histogram gradients must equal sum of input gradients.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + # Sum across all bins for each feature should equal total gradient + for feat in range(5): + feat_grad_sum = np.sum(hist.hist_grad[feat, :]) + feat_hess_sum = np.sum(hist.hist_hess[feat, :]) + np.testing.assert_almost_equal( + feat_grad_sum, np.sum(grad), decimal=4, + err_msg=f"Feature {feat}: hist grad sum != input grad sum" + ) + np.testing.assert_almost_equal( + feat_hess_sum, np.sum(hess), decimal=4, + err_msg=f"Feature {feat}: hist hess sum != input hess sum" + ) + + def test_histogram_per_bin_counts(self): + """Hand-crafted data: verify each bin's grad/hess matches manual sum.""" + # Create data where we know exactly which sample goes to which bin + # 10 samples, 2 features, carefully crafted to land in known bins + rng = np.random.RandomState(42) + n_samples = 20 + X = rng.randn(n_samples, 2).astype(np.float32) + binned = ob.array(X) + + # Known gradients + grad = np.arange(n_samples, dtype=np.float32) + hess = np.ones(n_samples, dtype=np.float32) + + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # For each feature, verify that the 
samples in each bin sum correctly + for feat in range(2): + bin_values = binned.data[feat, :] # bin assignment for each sample + for b in range(256): + mask = bin_values == b + expected_grad = np.sum(grad[mask]) + expected_hess = np.sum(hess[mask]) + np.testing.assert_almost_equal( + hist.hist_grad[feat, b], expected_grad, decimal=5, + err_msg=f"Feature {feat}, bin {b}: grad mismatch" + ) + np.testing.assert_almost_equal( + hist.hist_hess[feat, b], expected_hess, decimal=5, + err_msg=f"Feature {feat}, bin {b}: hess mismatch" + ) + + def test_histogram_with_missing_bin_isolated(self): + """NaN samples must accumulate only in bin 255 (MISSING_BIN).""" + rng = np.random.RandomState(42) + n_samples = 50 + X = rng.randn(n_samples, 3).astype(np.float32) + # Inject NaN in feature 0 for first 10 samples + X[:10, 0] = np.nan + + binned = ob.array(X) + grad = np.ones(n_samples, dtype=np.float32) + hess = np.ones(n_samples, dtype=np.float32) + + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # For feature 0: bin 255 should have grad=10 (10 NaN samples * grad=1) + np.testing.assert_almost_equal( + hist.hist_grad[0, 255], 10.0, decimal=4, + err_msg="Missing bin should accumulate exactly NaN samples" + ) + np.testing.assert_almost_equal( + hist.hist_hess[0, 255], 10.0, decimal=4, + ) + + # Non-NaN features should have 0 in bin 255 + np.testing.assert_almost_equal( + hist.hist_grad[1, 255], 0.0, decimal=4, + err_msg="Feature without NaN should have 0 in missing bin" + ) + + def test_histogram_constant_feature(self): + """A constant feature should have all samples in a single bin.""" + n_samples = 50 + X = np.zeros((n_samples, 2), dtype=np.float32) + X[:, 0] = 5.0 # Constant + X[:, 1] = np.arange(n_samples, dtype=np.float32) # Varying + + binned = ob.array(X) + grad = np.ones(n_samples, dtype=np.float32) * 3.0 + hess = np.ones(n_samples, dtype=np.float32) 
+ + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # Feature 0 (constant): exactly one bin should have all grad/hess + nonzero_bins = np.sum(hist.hist_hess[0, :] > 0) + assert nonzero_bins == 1, f"Constant feature should have 1 non-zero bin, got {nonzero_bins}" + np.testing.assert_almost_equal( + np.sum(hist.hist_grad[0, :]), 3.0 * n_samples, decimal=4 + ) + + def test_histogram_subtraction(self, binned_100x5, mse_grads_100): + """Parent histogram - left child histogram = right child histogram.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + + # Build parent histogram + parent_hists = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + parent = parent_hists[0] + + # Do a split to create children + splits = ob.find_node_splits(parent_hists) + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + left_id = splits[0].left_child + right_id = splits[0].right_child + + child_hists = ob.build_node_histograms( + binned.data, grad, hess, new_node_ids, [left_id, right_id] + ) + + if left_id in child_hists and right_id in child_hists: + left = child_hists[left_id] + right = child_hists[right_id] + + # Parent = left + right + np.testing.assert_almost_equal( + parent.hist_grad, left.hist_grad + right.hist_grad, + decimal=4, err_msg="Parent grad != left + right" + ) + np.testing.assert_almost_equal( + parent.hist_hess, left.hist_hess + right.hist_hess, + decimal=4, err_msg="Parent hess != left + right" + ) + + +# ============================================================================= +# Split Finding Correctness +# ============================================================================= + + +class TestSplitFindingCorrectness: + """Verify split finding selects the optimal 
split.""" + + def test_split_gain_formula_exact(self): + """Verify split gain matches the formula: left_score + right_score - parent_score.""" + # Construct a histogram with known values + n_features = 2 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Feature 0: bins 0-9 have grad=1, hess=1 each; bins 10-19 have grad=-1, hess=1 + for b in range(10): + hist_grad[0, b] = 1.0 + hist_hess[0, b] = 1.0 + for b in range(10, 20): + hist_grad[0, b] = -1.0 + hist_hess[0, b] = 1.0 + + # Feature 1: spread evenly (poor split) + for b in range(20): + hist_grad[1, b] = 0.0 + hist_hess[1, b] = 1.0 + + total_grad = float(np.sum(hist_grad[0])) # 0.0 + total_hess = float(np.sum(hist_hess[0])) # 20.0 + + reg_lambda = 1.0 + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=reg_lambda, min_child_weight=0.0, + ) + + assert split.feature == 0, f"Should split on feature 0, got {split.feature}" + + # Manual gain computation for the best split on feature 0 at threshold=9 + # Left: grad=10, hess=10 -> score = 10^2/(10+1) = 100/11 + # Right: grad=-10, hess=10 -> score = (-10)^2/(10+1) = 100/11 + # Parent: grad=0, hess=20 -> score = 0^2/(20+1) = 0 + # Gain = 100/11 + 100/11 - 0 = 200/11 ≈ 18.18 + expected_gain = 100.0 / 11.0 + 100.0 / 11.0 - 0.0 + np.testing.assert_almost_equal( + split.gain, expected_gain, decimal=3, + err_msg=f"Gain should be {expected_gain}, got {split.gain}" + ) + + def test_split_selects_optimal_feature(self): + """Feature with highest gain should be selected.""" + n_features = 3 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Feature 0: weak split + hist_grad[0, :5] = 0.1 + hist_hess[0, :5] = 1.0 + hist_grad[0, 5:10] = -0.1 + hist_hess[0, 5:10] = 1.0 + + # Feature 1: NO split possible (constant) + hist_grad[1, 0] = 0.0 + hist_hess[1, 0] = 10.0 + + # Feature 2: strong split (large 
gradient difference) + hist_grad[2, :5] = 5.0 + hist_hess[2, :5] = 1.0 + hist_grad[2, 5:10] = -5.0 + hist_hess[2, 5:10] = 1.0 + + total_grad = float(np.sum(hist_grad[0])) + total_hess = float(np.sum(hist_hess[0])) + + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=1.0, min_child_weight=0.0, + ) + + assert split.feature == 2, f"Should pick feature 2 (strongest), got {split.feature}" + + def test_split_min_child_weight_enforcement(self): + """Splits that violate min_child_weight should be rejected.""" + n_features = 1 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Only one sample in bin 0, rest in bin 1 + hist_grad[0, 0] = 5.0 + hist_hess[0, 0] = 0.5 # Below min_child_weight=1.0 + hist_grad[0, 1] = -5.0 + hist_hess[0, 1] = 10.0 + + total_grad = 0.0 + total_hess = 10.5 + + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=1.0, min_child_weight=1.0, + ) + + # Split at threshold=0 would put hess=0.5 in left, violating min_child_weight=1.0 + # Should either find no split or a different threshold + if split.is_valid and split.threshold == 0: + pytest.fail("Split at threshold=0 should be rejected (hess=0.5 < min_child_weight=1.0)") + + +# ============================================================================= +# Leaf Value Correctness +# ============================================================================= + + +class TestLeafValueCorrectness: + """Verify leaf value computation follows Newton-Raphson formula.""" + + def test_newton_raphson_formula(self): + """leaf_value = -sum_grad / (sum_hess + lambda).""" + # Case 1: simple + val = compute_leaf_value(sum_grad=6.0, sum_hess=3.0, reg_lambda=1.0) + expected = -6.0 / (3.0 + 1.0) # -1.5 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Case 2: negative gradient + val = compute_leaf_value(sum_grad=-3.0, sum_hess=2.0, reg_lambda=1.0) + expected = 
3.0 / (2.0 + 1.0) # 1.0 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Case 3: zero gradient + val = compute_leaf_value(sum_grad=0.0, sum_hess=5.0, reg_lambda=1.0) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + # Case 4: large lambda + val = compute_leaf_value(sum_grad=10.0, sum_hess=2.0, reg_lambda=100.0) + expected = -10.0 / (2.0 + 100.0) # -0.098... + np.testing.assert_almost_equal(val, expected, decimal=10) + + def test_l1_soft_thresholding_below_threshold(self): + """When |sum_grad| <= reg_alpha, leaf value should be 0.""" + val = compute_leaf_value(sum_grad=0.5, sum_hess=5.0, reg_lambda=1.0, reg_alpha=1.0) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + val = compute_leaf_value(sum_grad=-0.3, sum_hess=5.0, reg_lambda=1.0, reg_alpha=0.5) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + def test_l1_soft_thresholding_above_threshold(self): + """When |sum_grad| > reg_alpha, apply soft-thresholding.""" + # Positive gradient above threshold + val = compute_leaf_value(sum_grad=2.0, sum_hess=3.0, reg_lambda=1.0, reg_alpha=0.5) + expected = -(2.0 - 0.5) / (3.0 + 1.0) # -0.375 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Negative gradient above threshold + val = compute_leaf_value(sum_grad=-2.0, sum_hess=3.0, reg_lambda=1.0, reg_alpha=0.5) + expected = -(-2.0 + 0.5) / (3.0 + 1.0) # 0.375 + np.testing.assert_almost_equal(val, expected, decimal=10) + + +# ============================================================================= +# Partition Correctness +# ============================================================================= + + +class TestPartitionCorrectness: + """Verify sample partitioning preserves counts and is consistent.""" + + def test_partition_conserves_samples(self, binned_100x5, mse_grads_100): + """After partitioning, n_left + n_right = n_total.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + + histograms = 
ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + splits = ob.find_node_splits(histograms) + + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + left_id = splits[0].left_child + right_id = splits[0].right_child + + n_left = np.sum(new_node_ids == left_id) + n_right = np.sum(new_node_ids == right_id) + + assert n_left + n_right == 100, ( + f"Partition should conserve samples: {n_left} + {n_right} != 100" + ) + assert n_left > 0, "Left child should have at least 1 sample" + assert n_right > 0, "Right child should have at least 1 sample" + + def test_partition_deterministic(self, binned_100x5, mse_grads_100): + """Same data should produce same partition.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + results = [] + for _ in range(2): + sample_node_ids = ob.init_sample_node_ids(100) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + splits = ob.find_node_splits(histograms) + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + results.append(new_node_ids.copy()) + + if len(results) == 2: + np.testing.assert_array_equal(results[0], results[1]) + + def test_tree_depth_matches_max_depth(self, regression_100x5): + """Trees must respect max_depth constraint.""" + X, y = regression_100x5 + binned = ob.array(X) + grad = (2 * (np.zeros(100, dtype=np.float32) - y)).astype(np.float32) + hess = np.ones(100, dtype=np.float32) * 2 + + for depth in [1, 2, 3, 4, 5]: + tree = ob.fit_tree(binned, grad, hess, max_depth=depth) + assert tree.depth <= depth, f"Tree depth {tree.depth} > max_depth {depth}" + + +# ============================================================================= +# End-to-End Algorithmic Correctness +# ============================================================================= + + +class 
TestAlgorithmicCorrectness: + """End-to-end correctness of the boosting algorithm.""" + + def test_boosting_monotonic_loss_decrease(self, regression_200x10): + """Loss should decrease every round (for reasonable settings).""" + X, y = regression_200x10 + binned = ob.array(X) + pred = np.zeros(200, dtype=np.float32) + + losses = [] + for _ in range(5): + loss = float(np.mean((pred - y) ** 2)) + losses.append(loss) + grad = (2 * (pred - y)).astype(np.float32) + hess = np.ones(200, dtype=np.float32) * 2 + tree = ob.fit_tree(binned, grad, hess, max_depth=4) + pred = pred + 0.1 * tree(binned) + + # Each subsequent loss should be lower + for i in range(1, len(losses)): + assert losses[i] < losses[i - 1], ( + f"Loss should decrease monotonically: round {i}: {losses[i]} >= {losses[i-1]}" + ) + + def test_converges_to_mean_for_constant_target(self): + """For constant y, predictions should converge to that constant.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = np.full(100, 3.7, dtype=np.float32) + + model = ob.GradientBoosting(n_trees=50, max_depth=2, learning_rate=0.3) + model.fit(X, y) + pred = model.predict(X) + + # Predictions should be very close to 3.7 + np.testing.assert_allclose(pred, 3.7, atol=0.1, + err_msg="Predictions should converge to constant target value") + + def test_single_split_tree_matches_manual(self): + """A depth-1 tree with simple data should produce predictable splits.""" + # Feature 0 clearly splits the target + X = np.array([ + [-2.0, 0.0], + [-1.0, 0.0], + [1.0, 0.0], + [2.0, 0.0], + ], dtype=np.float32) + y = np.array([-1.0, -1.0, 1.0, 1.0], dtype=np.float32) + + binned = ob.array(X) + grad = (2 * (np.zeros(4, dtype=np.float32) - y)).astype(np.float32) # [-2, -2, 2, 2] * -1 = [2, 2, -2, -2] + hess = np.ones(4, dtype=np.float32) * 2 + + tree = ob.fit_tree(binned, grad, hess, max_depth=1) + + # Should split on feature 0 + assert tree.n_nodes >= 3, "Depth-1 tree should have at least 3 nodes (root + 2 
leaves)" + assert tree.depth == 1 + + # Predictions for left vs right should have opposite signs + pred = tree(binned) + assert pred[0] * pred[2] < 0 or np.abs(pred[0] - pred[2]) > 0.1, ( + "Left and right predictions should differ for this clear split" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_linear_leaf.py b/tests/test_linear_leaf.py new file mode 100644 index 0000000..e2dea7a --- /dev/null +++ b/tests/test_linear_leaf.py @@ -0,0 +1,180 @@ +"""Tests for LinearLeafGBDT model. + +Verifies that gradient boosting with linear models in leaves works correctly +on CPU. These are the first CPU tests for this model variant. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestLinearLeafBasic: + """Basic functionality tests.""" + + def test_basic_fit_predict(self, regression_200x10): + """Fit and predict should produce correct shapes and dtypes.""" + X, y = regression_200x10 + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=3, learning_rate=0.1) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape, f"Expected shape {y.shape}, got {pred.shape}" + assert pred.dtype == np.float32 + assert np.all(np.isfinite(pred)), "Predictions should be finite" + + def test_training_reduces_loss(self, regression_200x10): + """More trees should reduce training loss.""" + X, y = regression_200x10 + + model_few = ob.LinearLeafGBDT(n_trees=5, max_depth=3) + model_few.fit(X, y) + mse_few = np.mean((model_few.predict(X) - y) ** 2) + + model_many = ob.LinearLeafGBDT(n_trees=30, max_depth=3) + model_many.fit(X, y) + mse_many = np.mean((model_many.predict(X) - y) ** 2) + + assert mse_many < mse_few, ( + f"More trees should reduce MSE: {mse_many} >= {mse_few}" + ) + + def test_deterministic(self, regression_100x5): + """Same input should produce identical output.""" + X, y = regression_100x5 + + model1 = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model1.fit(X, y) + pred1 = model1.predict(X) + + 
model2 = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model2.fit(X, y) + pred2 = model2.predict(X) + + np.testing.assert_array_equal(pred1, pred2) + + def test_predict_before_fit_raises(self): + """Predict on unfitted model should raise.""" + model = ob.LinearLeafGBDT(n_trees=5) + rng = np.random.RandomState(42) + X = rng.randn(10, 3).astype(np.float32) + + with pytest.raises((RuntimeError, AttributeError)): + model.predict(X) + + +class TestLinearLeafExtrapolation: + """Verify that linear leaves improve extrapolation.""" + + def test_extrapolation_on_linear_target(self): + """LinearLeaf should extrapolate better than standard GBDT on linear data.""" + rng = np.random.RandomState(42) + # Training: X in [-2, 2] + X_train = rng.uniform(-2, 2, (200, 3)).astype(np.float32) + y_train = (2 * X_train[:, 0] + X_train[:, 1]).astype(np.float32) + + # Test: X in [3, 5] (extrapolation region) + X_test = rng.uniform(3, 5, (50, 3)).astype(np.float32) + y_test = (2 * X_test[:, 0] + X_test[:, 1]).astype(np.float32) + + # Standard GBDT + standard = ob.GradientBoosting(n_trees=50, max_depth=4, learning_rate=0.1) + standard.fit(X_train, y_train) + std_pred = standard.predict(X_test) + _ = np.mean((std_pred - y_test) ** 2) + + # Linear Leaf GBDT + linear = ob.LinearLeafGBDT(n_trees=50, max_depth=3, learning_rate=0.1) + linear.fit(X_train, y_train) + lin_pred = linear.predict(X_test) + _ = np.mean((lin_pred - y_test) ** 2) + + # Linear leaf should extrapolate better (or at least comparably) + # We don't assert strict superiority since it depends on the data + assert np.all(np.isfinite(lin_pred)), "Linear leaf predictions should be finite" + # At minimum, linear leaf predictions should be in a reasonable range + assert np.max(np.abs(lin_pred)) < 100, "Predictions shouldn't explode" + + +class TestLinearLeafEdgeCases: + """Edge cases for LinearLeafGBDT.""" + + def test_with_constant_features(self): + """Should handle constant features gracefully.""" + rng = np.random.RandomState(42) + X = 
rng.randn(100, 3).astype(np.float32) + X[:, 1] = 5.0 # Constant feature + y = X[:, 0].copy() + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=2) + model.fit(X, y) + pred = model.predict(X) + + assert np.all(np.isfinite(pred)) + + def test_with_missing_values(self): + """Should handle NaN in features.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 3).astype(np.float32) + X[:5, 0] = np.nan + y = rng.randn(100).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=2) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + def test_shallow_trees_with_linear_leaves(self): + """Shallow trees (depth 1-2) should still work with linear leaves.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1]).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=20, max_depth=1, learning_rate=0.1) + model.fit(X, y) + pred = model.predict(X) + + mse = np.mean((pred - y) ** 2) + baseline_mse = np.var(y) + assert mse < baseline_mse, "Model should fit better than mean prediction" + + def test_single_tree(self): + """Should work with a single tree.""" + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=1, max_depth=3) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + +class TestLinearLeafPersistence: + """Save/load functionality.""" + + def test_save_load_roundtrip(self, regression_100x5, tmp_path): + """Predictions should match after save/load.""" + X, y = regression_100x5 + + model = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model.fit(X, y) + pred_before = model.predict(X) + + path = str(tmp_path / "linear_leaf_model.json") + model.save(path) + + loaded = ob.LinearLeafGBDT.load(path) + pred_after = loaded.predict(X) + + np.testing.assert_array_equal(pred_before, pred_after) 
+ + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_loss_correctness.py b/tests/test_loss_correctness.py new file mode 100644 index 0000000..62dc08c --- /dev/null +++ b/tests/test_loss_correctness.py @@ -0,0 +1,298 @@ +"""Loss function correctness tests for OpenBoost. + +Verifies all loss functions using two approaches: +1. Analytical: compare gradient/hessian against independently computed formulas +2. Numerical differentiation: central differences vs returned gradient + +This catches sign errors, missing factors, numerical instability. +""" + +import numpy as np +import pytest + +from openboost._loss import ( + compute_loss_value, + gamma_gradient, + get_loss_function, + huber_gradient, # noqa: F401 + logloss_gradient, + mae_gradient, + mse_gradient, + poisson_gradient, + quantile_gradient, # noqa: F401 + tweedie_gradient, +) + + +def _numerical_gradient(loss_name, pred, y, eps=1e-5, **kwargs): + """Compute gradient numerically via central differences. 
+ + grad_i ≈ (L(pred+eps) - L(pred-eps)) / (2*eps) + """ + pred = np.asarray(pred, dtype=np.float64) + y = np.asarray(y, dtype=np.float64) + n = len(pred) + num_grad = np.zeros(n, dtype=np.float64) + + for i in range(n): + pred_plus = pred.copy() + pred_minus = pred.copy() + pred_plus[i] += eps + pred_minus[i] -= eps + loss_plus = compute_loss_value(loss_name, pred_plus, y, **kwargs) * n + loss_minus = compute_loss_value(loss_name, pred_minus, y, **kwargs) * n + num_grad[i] = (loss_plus - loss_minus) / (2 * eps) + + return num_grad + + +# ============================================================================= +# Analytical gradient verification +# ============================================================================= + + +class TestAnalyticalGradients: + """Verify gradients against independently computed formulas.""" + + def test_mse_gradient_analytical(self): + """MSE: grad = 2*(pred - y), hess = 2.""" + pred = np.array([1.0, 2.0, 3.0], dtype=np.float32) + y = np.array([0.5, 2.5, 1.0], dtype=np.float32) + + grad, hess = mse_gradient(pred, y) + + expected_grad = 2.0 * (pred - y) + np.testing.assert_allclose(grad, expected_grad, atol=1e-6) + np.testing.assert_allclose(hess, 2.0, atol=1e-6) + + def test_mse_gradient_zero_at_match(self): + """MSE gradient should be zero when pred == y.""" + y = np.array([1.0, 2.0, 3.0], dtype=np.float32) + grad, hess = mse_gradient(y.copy(), y) + np.testing.assert_allclose(grad, 0.0, atol=1e-6) + + def test_logloss_gradient_analytical(self): + """LogLoss: grad = sigmoid(pred) - y, hess = p*(1-p).""" + pred = np.array([0.0, 1.0, -1.0, 2.0], dtype=np.float32) + y = np.array([1.0, 0.0, 1.0, 1.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + # Independently compute sigmoid + p = 1.0 / (1.0 + np.exp(-pred.astype(np.float64))) + expected_grad = (p - y).astype(np.float32) + expected_hess = np.clip((p * (1 - p)).astype(np.float32), 1e-6, 1.0 - 1e-6) + + np.testing.assert_allclose(grad, expected_grad, 
atol=1e-5) + np.testing.assert_allclose(hess, expected_hess, atol=1e-5) + + def test_mae_gradient_sign(self): + """MAE: grad = sign(pred - y).""" + pred = np.array([1.0, 2.0, 3.0], dtype=np.float32) + y = np.array([0.5, 3.0, 1.0], dtype=np.float32) + + grad, hess = mae_gradient(pred, y) + + expected_sign = np.sign(pred - y) + np.testing.assert_allclose(grad, expected_sign, atol=1e-5) + + def test_poisson_gradient_analytical(self): + """Poisson: grad = exp(pred) - y.""" + pred = np.array([0.0, 1.0, -0.5], dtype=np.float32) + y = np.array([2.0, 1.0, 3.0], dtype=np.float32) + + grad, hess = poisson_gradient(pred, y) + + expected_grad = np.exp(pred) - y + np.testing.assert_allclose(grad, expected_grad, atol=1e-4) + + def test_gamma_gradient_analytical(self): + """Gamma: grad = 1 - y*exp(-pred), hess = y*exp(-pred).""" + pred = np.array([1.0, 0.5, 2.0], dtype=np.float32) + y = np.array([2.0, 1.0, 3.0], dtype=np.float32) + + grad, hess = gamma_gradient(pred, y) + + expected_grad = 1.0 - y * np.exp(-pred) + np.testing.assert_allclose(grad, expected_grad, atol=1e-4) + + +# ============================================================================= +# Numerical differentiation verification +# ============================================================================= + + +class TestNumericalGradients: + """Verify gradients match numerical differentiation (central differences).""" + + def _check_gradient(self, loss_name, pred, y, atol=1e-3, **kwargs): + """Helper: compare analytical gradient against numerical gradient.""" + loss_fn = get_loss_function(loss_name, **kwargs) + grad, _ = loss_fn( + np.asarray(pred, dtype=np.float32), + np.asarray(y, dtype=np.float32), + ) + grad = np.asarray(grad, dtype=np.float64) + + num_grad = _numerical_gradient(loss_name, pred, y, **kwargs) + + np.testing.assert_allclose( + grad, num_grad, atol=atol, + err_msg=f"Gradient mismatch for {loss_name}" + ) + + def test_mse_gradient_numerical(self): + pred = np.array([1.0, 2.5, -0.3]) + y = 
np.array([0.5, 3.0, 1.0]) + self._check_gradient('mse', pred, y) + + def test_logloss_gradient_numerical(self): + pred = np.array([0.5, -1.0, 2.0]) + y = np.array([1.0, 0.0, 1.0]) + self._check_gradient('logloss', pred, y) + + def test_huber_gradient_numerical(self): + pred = np.array([1.0, 5.0, -2.0]) + y = np.array([0.5, 0.0, 1.0]) + self._check_gradient('huber', pred, y, huber_delta=1.0) + + def test_poisson_gradient_numerical(self): + pred = np.array([0.5, 1.0, -0.5]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('poisson', pred, y, atol=1e-2) + + def test_gamma_gradient_numerical(self): + pred = np.array([0.5, 1.0, 1.5]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('gamma', pred, y, atol=1e-2) + + @pytest.mark.parametrize("rho", [1.1, 1.5, 1.9]) + def test_tweedie_gradient_numerical(self, rho): + pred = np.array([0.5, 1.0, 0.2]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('tweedie', pred, y, tweedie_rho=rho, atol=1e-2) + + @pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9]) + def test_quantile_gradient_numerical(self, alpha): + """Quantile loss gradient via numerical differentiation. + + Note: quantile loss has discontinuous gradient at pred=y, + so we avoid exact match points. 
+ """ + pred = np.array([1.5, 0.3, -0.7]) + y = np.array([1.0, 2.0, 0.5]) + self._check_gradient('quantile', pred, y, quantile_alpha=alpha, atol=0.1) + + +# ============================================================================= +# Edge cases and numerical stability +# ============================================================================= + + +class TestLossEdgeCases: + """Edge cases that can cause NaN, overflow, or incorrect behavior.""" + + def test_logloss_extreme_negative_pred(self): + """Very negative predictions should not produce NaN.""" + pred = np.array([-500.0, -100.0], dtype=np.float32) + y = np.array([1.0, 0.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"NaN in logloss grad: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in logloss hess: {hess}" + + def test_logloss_extreme_positive_pred(self): + """Very positive predictions should not produce NaN.""" + pred = np.array([500.0, 100.0], dtype=np.float32) + y = np.array([0.0, 1.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"NaN in logloss grad: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in logloss hess: {hess}" + + def test_poisson_large_pred_no_overflow(self): + """exp(pred) should not overflow for large predictions.""" + pred = np.array([15.0, 18.0], dtype=np.float32) + y = np.array([1.0, 2.0], dtype=np.float32) + + grad, hess = poisson_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"Overflow in poisson grad: {grad}" + assert np.all(np.isfinite(hess)), f"Overflow in poisson hess: {hess}" + + def test_tweedie_zero_y(self): + """y=0 is valid for Tweedie — should not produce NaN.""" + pred = np.array([0.5, 1.0], dtype=np.float32) + y = np.array([0.0, 0.0], dtype=np.float32) + + grad, hess = tweedie_gradient(pred, y, rho=1.5) + + assert np.all(np.isfinite(grad)), f"NaN in tweedie grad with y=0: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in tweedie hess 
with y=0: {hess}" + + def test_all_losses_finite_on_normal_input(self): + """Every built-in loss should produce finite grad/hess on normal inputs.""" + rng = np.random.RandomState(42) + pred = rng.randn(20).astype(np.float32) + y_reg = rng.randn(20).astype(np.float32) + y_bin = (rng.rand(20) > 0.5).astype(np.float32) + y_pos = np.abs(y_reg) + 0.1 # Positive for Poisson/Gamma + + losses_and_data = [ + ('mse', pred, y_reg), + ('mae', pred, y_reg), + ('huber', pred, y_reg), + ('logloss', pred, y_bin), + ('poisson', pred * 0.5, y_pos), # Smaller pred to avoid overflow + ('gamma', np.abs(pred) + 0.1, y_pos), + ] + + for loss_name, p, y in losses_and_data: + loss_fn = get_loss_function(loss_name) + grad, hess = loss_fn(p, y) + assert np.all(np.isfinite(grad)), f"{loss_name}: NaN/inf in grad" + assert np.all(np.isfinite(hess)), f"{loss_name}: NaN/inf in hess" + + +# ============================================================================= +# Loss value computation +# ============================================================================= + + +class TestLossValueComputation: + """Verify compute_loss_value returns correct scalar losses.""" + + def test_mse_loss_value(self): + pred = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 2.0]) + loss = compute_loss_value('mse', pred, y) + expected = np.mean((pred - y) ** 2) + np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_mae_loss_value(self): + pred = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 2.0]) + loss = compute_loss_value('mae', pred, y) + expected = np.mean(np.abs(pred - y)) + np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_logloss_value(self): + pred = np.array([2.0, -1.0]) + y = np.array([1.0, 0.0]) + loss = compute_loss_value('logloss', pred, y) + # Manual: p = sigmoid(pred), -mean(y*log(p) + (1-y)*log(1-p)) + p = 1.0 / (1.0 + np.exp(-pred.astype(np.float64))) + expected = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p)) + 
np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_loss_zero_when_perfect(self): + """MSE loss should be zero when predictions are perfect.""" + y = np.array([1.0, 2.0, 3.0]) + loss = compute_loss_value('mse', y, y) + np.testing.assert_almost_equal(loss, 0.0, decimal=10) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_numerical_agreement.py b/tests/test_numerical_agreement.py new file mode 100644 index 0000000..e79f595 --- /dev/null +++ b/tests/test_numerical_agreement.py @@ -0,0 +1,268 @@ +"""Numerical agreement tests: OpenBoost vs XGBoost. + +For matched hyperparameters, OpenBoost predictions should be very close +to XGBoost predictions. This is the strongest end-to-end correctness signal. + +All tests are marked @pytest.mark.xgboost and skip if xgboost is not installed. +""" + +import numpy as np +import pytest + +import openboost as ob + +xgb = pytest.importorskip("xgboost") + + +def _matched_params(n_trees=50, max_depth=4): + """Hyperparameters that align OpenBoost and XGBoost behavior.""" + return dict( + ob_params=dict( + n_trees=n_trees, + max_depth=max_depth, + learning_rate=0.1, + reg_lambda=1.0, + min_child_weight=1.0, + subsample=1.0, + colsample_bytree=1.0, + ), + xgb_params=dict( + n_estimators=n_trees, + max_depth=max_depth, + learning_rate=0.1, + reg_lambda=1.0, + reg_alpha=0.0, + min_child_weight=1.0, + subsample=1.0, + colsample_bytree=1.0, + tree_method='hist', + max_bin=255, + random_state=42, + ), + ) + + +@pytest.mark.xgboost +class TestXGBoostRegressionAgreement: + """Regression prediction agreement between OpenBoost and XGBoost.""" + + def test_single_tree_very_close(self, regression_500x10): + """Single depth-1 tree should produce very similar predictions.""" + X, y = regression_500x10 + params = _matched_params(n_trees=1, max_depth=1) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = 
xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + # Single tree, depth 1: very few ways to differ + rmse_diff = np.sqrt(np.mean((ob_pred - xgb_pred) ** 2)) + target_std = np.std(y) + + assert rmse_diff / target_std < 0.05, ( + f"Single tree predictions differ too much: RMSE diff = {rmse_diff:.4f}, " + f"target std = {target_std:.4f} (ratio = {rmse_diff/target_std:.3f})" + ) + + def test_regression_predictions_close(self, regression_500x10): + """50-tree predictions should be within 10% relative RMSE.""" + X, y = regression_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + rmse_diff = np.sqrt(np.mean((ob_pred - xgb_pred) ** 2)) + rmse_target = np.sqrt(np.mean((y - np.mean(y)) ** 2)) + + assert rmse_diff / rmse_target < 0.10, ( + f"Prediction RMSE diff {rmse_diff:.4f} is >{10}% of target RMSE {rmse_target:.4f}" + ) + + def test_predictions_same_direction(self, regression_500x10): + """Predictions should agree on relative ordering (correlation > 0.95).""" + X, y = regression_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + correlation = np.corrcoef(ob_pred, xgb_pred)[0, 1] + assert correlation > 0.95, ( + f"Prediction correlation should be > 0.95, got {correlation:.4f}" + ) + + +@pytest.mark.xgboost +class TestXGBoostClassificationAgreement: + """Classification prediction agreement.""" + + def test_classification_probabilities_close(self, binary_500x10): + """Predicted probabilities should be within 0.10 of each other.""" + X, y = binary_500x10 + params 
= _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X, y) + ob_raw = ob_model.predict(X) + # Convert logits to probabilities + ob_prob = 1.0 / (1.0 + np.exp(-ob_raw)) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + eval_metric='logloss', + **params['xgb_params'], + ) + xgb_model.fit(X, y) + xgb_prob = xgb_model.predict_proba(X)[:, 1] + + mean_diff = np.mean(np.abs(ob_prob - xgb_prob)) + + assert mean_diff < 0.10, ( + f"Mean probability difference {mean_diff:.4f} > 0.10" + ) + + def test_classification_accuracy_comparable(self, binary_500x10): + """Both models should achieve similar accuracy.""" + X, y = binary_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X, y) + ob_raw = ob_model.predict(X) + ob_labels = (ob_raw > 0).astype(float) + ob_acc = np.mean(ob_labels == y) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + **params['xgb_params'], + ) + xgb_model.fit(X, y) + xgb_labels = xgb_model.predict(X) + xgb_acc = np.mean(xgb_labels == y) + + # Accuracies should be within 5 percentage points + assert abs(ob_acc - xgb_acc) < 0.05, ( + f"Accuracy gap too large: OB={ob_acc:.3f}, XGB={xgb_acc:.3f}" + ) + + +@pytest.mark.xgboost +class TestXGBoostQualityParity: + """Model quality should be competitive with XGBoost.""" + + def test_regression_r2_competitive(self, regression_500x10): + """OpenBoost R2 should be within 15% of XGBoost R2.""" + X, y = regression_500x10 + params = _matched_params(n_trees=100, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + ss_res_ob = np.sum((y - ob_pred) ** 2) + ss_tot = np.sum((y - np.mean(y)) ** 2) + ob_r2 = 1 - ss_res_ob / ss_tot + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = 
xgb_model.predict(X) + ss_res_xgb = np.sum((y - xgb_pred) ** 2) + xgb_r2 = 1 - ss_res_xgb / ss_tot + + assert ob_r2 > xgb_r2 * 0.85, ( + f"OpenBoost R2 ({ob_r2:.4f}) should be within 15% of XGBoost R2 ({xgb_r2:.4f})" + ) + + @pytest.mark.slow + def test_regression_california_housing(self): + """Real dataset: California Housing regression.""" + pytest.importorskip("sklearn") + from sklearn.datasets import fetch_california_housing + from sklearn.model_selection import train_test_split + + try: + data = fetch_california_housing() + except Exception: + pytest.skip("Could not download California Housing dataset") + + X_train, X_test, y_train, y_test = train_test_split( + data.data.astype(np.float32), + data.target.astype(np.float32), + test_size=0.2, random_state=42, + ) + + params = _matched_params(n_trees=100, max_depth=6) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X_train, y_train) + ob_pred = ob_model.predict(X_test) + ob_rmse = np.sqrt(np.mean((ob_pred - y_test) ** 2)) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X_train, y_train) + xgb_pred = xgb_model.predict(X_test) + xgb_rmse = np.sqrt(np.mean((xgb_pred - y_test) ** 2)) + + # OpenBoost RMSE should be within 15% of XGBoost RMSE + assert ob_rmse < xgb_rmse * 1.15, ( + f"OB RMSE ({ob_rmse:.4f}) > 1.15x XGB RMSE ({xgb_rmse:.4f})" + ) + + @pytest.mark.slow + def test_binary_breast_cancer(self): + """Real dataset: Breast Cancer binary classification.""" + pytest.importorskip("sklearn") + from sklearn.datasets import load_breast_cancer + from sklearn.model_selection import train_test_split + + data = load_breast_cancer() + X_train, X_test, y_train, y_test = train_test_split( + data.data.astype(np.float32), + data.target.astype(np.float32), + test_size=0.2, random_state=42, + ) + + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X_train, y_train) + ob_pred = 
ob_model.predict(X_test) + ob_acc = np.mean((ob_pred > 0).astype(float) == y_test) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + **params['xgb_params'], + ) + xgb_model.fit(X_train, y_train) + xgb_acc = np.mean(xgb_model.predict(X_test) == y_test) + + # Both should achieve > 90% accuracy + assert ob_acc > 0.90, f"OB accuracy {ob_acc:.3f} < 0.90" + # Within 5 points of each other + assert abs(ob_acc - xgb_acc) < 0.05, ( + f"Accuracy gap: OB={ob_acc:.3f}, XGB={xgb_acc:.3f}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock index 077322d..f9bcdcf 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -615,6 +615,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + [[package]] name = "fastrlock" version = "0.8.3" @@ -2405,6 +2414,7 @@ all = [ { name = "numba-cuda" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, 
{ name = "ray", extra = ["default"] }, { name = "ruff" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -2426,6 +2436,7 @@ dev = [ { name = "modal" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -2445,6 +2456,7 @@ sklearn = [ test = [ { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, ] torch = [ { name = "torch" }, @@ -2460,6 +2472,7 @@ dev = [ { name = "mypy" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "xgboost" }, ] @@ -2477,6 +2490,7 @@ requires-dist = [ { name = "openboost", extras = ["test", "bench", "sklearn"], marker = "extra == 'dev'" }, { name = "pytest", marker = "extra == 'test'", specifier = ">=7.0" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0" }, + { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.0" }, { name = "ray", extras = ["default"], marker = "extra == 'distributed'", specifier = ">=2.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" }, { name = "scikit-learn", marker = "extra == 'bench'", specifier = ">=1.0" }, @@ -2494,10 +2508,11 @@ dev = [ { name = "mkdocs-section-index", specifier = ">=0.3.10" }, { name = "mkdocstrings", extras = ["python"], specifier = ">=1.0.1" }, { name = "mypy", specifier = ">=1.19.1" }, - { name = "pytest", specifier = ">=8.0" }, + { name = "pytest", specifier = ">=7.0" }, { name = "pytest-cov", specifier = ">=4.0" }, + { name = "pytest-xdist", specifier = ">=3.0" }, { name = "ruff", specifier = ">=0.4" }, - { name = "xgboost", specifier = ">=3.1.3" 
}, + { name = "xgboost", specifier = ">=2.0" }, ] [[package]] @@ -3016,6 +3031,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From c2a4e84293b876cc995a83423dc1f657bb764768 Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:32:48 -0700 Subject: [PATCH 2/6] Fix all ruff lint errors (276 errors resolved) - Add E402, F821 to ruff ignore list (intentional patterns) - Auto-fix: unused imports, f-string placeholders, nested ifs, ternaries - Add `from err` to raise-in-except clauses (B904) - Suppress intentional ABC no-op methods (B024/B027) - Rename unused loop variables (B007) Co-Authored-By: Claude Opus 4.6 --- .claude/worktrees/silly-kirch | 1 + pyproject.toml | 6 +- src/openboost/__init__.py | 285 ++++++++++++----------- src/openboost/_array.py | 10 +- src/openboost/_backends/_cpu.py | 26 +-- src/openboost/_backends/_cuda.py | 51 ++-- src/openboost/_batch.py | 3 +- 
src/openboost/_boosting.py | 6 +- src/openboost/_callbacks.py | 22 +- src/openboost/_core/__init__.py | 53 ++--- src/openboost/_core/_growth.py | 25 +- src/openboost/_core/_histogram.py | 1 + src/openboost/_core/_predict.py | 13 +- src/openboost/_core/_primitives.py | 28 +-- src/openboost/_core/_split.py | 3 +- src/openboost/_core/_tree.py | 38 +-- src/openboost/_distributed/__init__.py | 7 +- src/openboost/_distributed/_multigpu.py | 58 ++--- src/openboost/_distributed/_ray.py | 9 +- src/openboost/_distributed/_tree.py | 29 +-- src/openboost/_distributions.py | 18 +- src/openboost/_histogram.py | 2 +- src/openboost/_importance.py | 9 +- src/openboost/_kernels.py | 14 +- src/openboost/_loss.py | 22 +- src/openboost/_models/__init__.py | 28 +-- src/openboost/_models/_batch.py | 3 +- src/openboost/_models/_boosting.py | 34 +-- src/openboost/_models/_dart.py | 15 +- src/openboost/_models/_distributional.py | 10 +- src/openboost/_models/_gam.py | 9 +- src/openboost/_models/_linear_leaf.py | 16 +- src/openboost/_models/_sklearn.py | 18 +- src/openboost/_persistence.py | 12 +- src/openboost/_predict.py | 8 +- src/openboost/_profiler.py | 13 +- src/openboost/_sampling.py | 15 +- src/openboost/_split.py | 2 +- src/openboost/_training.py | 5 +- src/openboost/_utils.py | 29 ++- src/openboost/_validation.py | 15 +- 41 files changed, 442 insertions(+), 529 deletions(-) create mode 160000 .claude/worktrees/silly-kirch diff --git a/.claude/worktrees/silly-kirch b/.claude/worktrees/silly-kirch new file mode 160000 index 0000000..7450b78 --- /dev/null +++ b/.claude/worktrees/silly-kirch @@ -0,0 +1 @@ +Subproject commit 7450b7841f2af64569a9cc573946c42b0fcae851 diff --git a/pyproject.toml b/pyproject.toml index 87a468a..4390e3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,7 +89,11 @@ src = ["src"] [tool.ruff.lint] select = ["E", "F", "I", "UP", "B", "SIM"] -ignore = ["E501"] # Line length handled separately +ignore = [ + "E501", # Line length handled separately + "E402", 
# Imports organized by section in __init__.py + "F821", # DeviceNDArray type hints for optional CUDA (not available at lint time) +] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/openboost/__init__.py b/src/openboost/__init__.py index 206e100..8e21fb9 100644 --- a/src/openboost/__init__.py +++ b/src/openboost/__init__.py @@ -39,7 +39,7 @@ # ============================================================================= # Data Layer # ============================================================================= -from ._array import BinnedArray, array, as_numba_array, MISSING_BIN +from ._array import MISSING_BIN, BinnedArray, array, as_numba_array # ============================================================================= # Core (Foundation) @@ -48,41 +48,43 @@ # Growth strategies (Phase 8.2) GrowthConfig, GrowthStrategy, - TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, - get_growth_strategy, # Leaf value abstractions (Phase 9.0) LeafValues, + LeafWiseGrowth, + LevelWiseGrowth, + # Primitives (Phase 8.1) + NodeHistogram, + NodeSplit, ScalarLeaves, + SymmetricGrowth, + # Symmetric trees + SymmetricTree, + TreeNode, + TreeStructure, VectorLeaves, + build_node_histograms, + compute_leaf_values, + find_node_splits, # Tree building fit_tree, - fit_trees_batch, - Tree as LegacyTree, - TreeNode, fit_tree_gpu_native, - predict_tree, - # Symmetric trees - SymmetricTree, fit_tree_symmetric, fit_tree_symmetric_gpu_native, - predict_symmetric_tree, - # Primitives (Phase 8.1) - NodeHistogram, - NodeSplit, - build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, - compute_leaf_values, - init_sample_node_ids, - get_nodes_at_depth, + fit_trees_batch, get_children, + get_growth_strategy, + get_nodes_at_depth, get_parent, + init_sample_node_ids, + partition_samples, # Prediction predict_ensemble, + predict_symmetric_tree, + predict_tree, + subtract_histogram, +) +from ._core import ( + Tree as LegacyTree, ) # 
Phase 8: TreeStructure is the new Tree @@ -91,78 +93,56 @@ # ============================================================================= # Models (High-Level) # ============================================================================= -from ._models import ( - GradientBoosting, - MultiClassGradientBoosting, - DART, - OpenBoostGAM, - ConfigBatch, - BatchTrainingState, - # Phase 13: sklearn-compatible wrappers - OpenBoostRegressor, - OpenBoostClassifier, - # Phase 15: sklearn wrappers for new models - OpenBoostDistributionalRegressor, - OpenBoostLinearLeafRegressor, - # Phase 15/16: Distributional GBDT (NaturalBoost) - DistributionalGBDT, - NaturalBoost, - NaturalBoostNormal, - NaturalBoostLogNormal, - NaturalBoostGamma, - NaturalBoostPoisson, - NaturalBoostStudentT, - NaturalBoostTweedie, - NaturalBoostNegBin, - # Backward compatibility aliases (deprecated, accessed via __getattr__) - NGBoost as _NGBoost, - NGBoostNormal as _NGBoostNormal, - NGBoostLogNormal as _NGBoostLogNormal, - NGBoostGamma as _NGBoostGamma, - NGBoostPoisson as _NGBoostPoisson, - NGBoostStudentT as _NGBoostStudentT, - NGBoostTweedie as _NGBoostTweedie, - NGBoostNegBin as _NGBoostNegBin, - # Phase 15: Linear Leaf GBDT - LinearLeafGBDT, +# ============================================================================= +# Backend Control +# ============================================================================= +from ._backends import get_backend, is_cpu, is_cuda, set_backend + +# ============================================================================= +# Callbacks (Phase 13) +# ============================================================================= +from ._callbacks import ( + Callback, + CallbackManager, + EarlyStopping, + HistoryCallback, + LearningRateScheduler, + Logger, + ModelCheckpoint, + TrainingState, +) + +# ============================================================================= +# Multi-GPU Training (Phase 18) +# 
============================================================================= +from ._distributed import ( + GPUWorker, + GPUWorkerBase, + MultiGPUContext, + fit_tree_multigpu, ) # ============================================================================= # Distributions (Phase 15) # ============================================================================= from ._distributions import ( + # Custom distributions with autodiff + CustomDistribution, Distribution, DistributionOutput, - Normal, - LogNormal, Gamma, + LogNormal, + NegativeBinomial, + Normal, Poisson, StudentT, # Kaggle competition favorites Tweedie, - NegativeBinomial, - # Custom distributions with autodiff - CustomDistribution, create_custom_distribution, get_distribution, list_distributions, ) -# ============================================================================= -# Callbacks (Phase 13) -# ============================================================================= -from ._callbacks import ( - Callback, - EarlyStopping, - Logger, - ModelCheckpoint, - LearningRateScheduler, - HistoryCallback, - CallbackManager, - TrainingState, -) -from ._profiler import ProfilingCallback - # ============================================================================= # Feature Importance (Phase 13) # ============================================================================= @@ -176,93 +156,124 @@ # Loss Functions # ============================================================================= from ._loss import ( - mse_gradient, - logloss_gradient, - huber_gradient, - mae_gradient, # Phase 9.1 - quantile_gradient, # Phase 9.1 - poisson_gradient, # Phase 9.3 - gamma_gradient, # Phase 9.3 - tweedie_gradient, # Phase 9.3 - softmax_gradient, # Phase 9.2 + gamma_gradient, # Phase 9.3 get_loss_function, + huber_gradient, + logloss_gradient, + mae_gradient, # Phase 9.1 + mse_gradient, + poisson_gradient, # Phase 9.3 + quantile_gradient, # Phase 9.1 + softmax_gradient, # Phase 9.2 + tweedie_gradient, # Phase 
9.3 ) - -# ============================================================================= -# Backend Control -# ============================================================================= -from ._backends import get_backend, set_backend, is_cuda, is_cpu +from ._models import ( + DART, + BatchTrainingState, + ConfigBatch, + # Phase 15/16: Distributional GBDT (NaturalBoost) + DistributionalGBDT, + GradientBoosting, + # Phase 15: Linear Leaf GBDT + LinearLeafGBDT, + MultiClassGradientBoosting, + NaturalBoost, + NaturalBoostGamma, + NaturalBoostLogNormal, + NaturalBoostNegBin, + NaturalBoostNormal, + NaturalBoostPoisson, + NaturalBoostStudentT, + NaturalBoostTweedie, + OpenBoostClassifier, + # Phase 15: sklearn wrappers for new models + OpenBoostDistributionalRegressor, + OpenBoostGAM, + OpenBoostLinearLeafRegressor, + # Phase 13: sklearn-compatible wrappers + OpenBoostRegressor, +) +from ._models import ( + # Backward compatibility aliases (deprecated, accessed via __getattr__) + NGBoost as _NGBoost, +) +from ._models import ( + NGBoostGamma as _NGBoostGamma, +) +from ._models import ( + NGBoostLogNormal as _NGBoostLogNormal, +) +from ._models import ( + NGBoostNegBin as _NGBoostNegBin, +) +from ._models import ( + NGBoostNormal as _NGBoostNormal, +) +from ._models import ( + NGBoostPoisson as _NGBoostPoisson, +) +from ._models import ( + NGBoostStudentT as _NGBoostStudentT, +) +from ._models import ( + NGBoostTweedie as _NGBoostTweedie, +) +from ._profiler import ProfilingCallback # ============================================================================= # Sampling Strategies (Phase 17) # ============================================================================= from ._sampling import ( - SamplingStrategy, GOSSConfig, MiniBatchConfig, - SamplingResult, - goss_sample, - random_sample, - apply_sampling, MiniBatchIterator, + SamplingResult, + SamplingStrategy, accumulate_histograms_minibatch, + apply_sampling, create_memmap_binned, + goss_sample, 
load_memmap_binned, -) - -# ============================================================================= -# Multi-GPU Training (Phase 18) -# ============================================================================= -from ._distributed import ( - MultiGPUContext, - GPUWorkerBase, - GPUWorker, - fit_tree_multigpu, + random_sample, ) # ============================================================================= # Utilities (Phase 20.6) # ============================================================================= -from ._utils import ( - suggest_params, - cross_val_predict, - cross_val_predict_proba, - cross_val_predict_interval, - evaluate_coverage, - get_param_grid, - PARAM_GRID_REGRESSION, - PARAM_GRID_CLASSIFICATION, - PARAM_GRID_DISTRIBUTIONAL, -) - # ============================================================================= # Evaluation Metrics (Phase 22) # ============================================================================= -from ._utils import ( - roc_auc_score, - accuracy_score, - log_loss_score, - mse_score, - r2_score, - mae_score, - rmse_score, - f1_score, - precision_score, - recall_score, -) - # ============================================================================= # Probabilistic/Distributional Metrics (Phase 22 Sprint 2) # ============================================================================= from ._utils import ( - crps_gaussian, - crps_empirical, + PARAM_GRID_CLASSIFICATION, + PARAM_GRID_DISTRIBUTIONAL, + PARAM_GRID_REGRESSION, + accuracy_score, brier_score, - pinball_loss, - interval_score, - expected_calibration_error, calibration_curve, + cross_val_predict, + cross_val_predict_interval, + cross_val_predict_proba, + crps_empirical, + crps_gaussian, + evaluate_coverage, + expected_calibration_error, + f1_score, + get_param_grid, + interval_score, + log_loss_score, + mae_score, + mse_score, negative_log_likelihood, + pinball_loss, + precision_score, + r2_score, + recall_score, + rmse_score, + roc_auc_score, + 
suggest_params, ) _DEPRECATED_ALIASES = { @@ -312,6 +323,7 @@ def __getattr__(name: str): "NaturalBoostStudentT", "NaturalBoostTweedie", "NaturalBoostNegBin", + "LegacyTree", # Backward compatibility (deprecated) "NGBoost", "NGBoostNormal", @@ -377,6 +389,7 @@ def __getattr__(name: str): "fit_tree_symmetric", "fit_tree_symmetric_gpu_native", "SymmetricTree", + "TreeNode", "predict_symmetric_tree", # Training (batch, low-level) "fit_trees_batch", diff --git a/src/openboost/_array.py b/src/openboost/_array.py index 1607813..5370a13 100644 --- a/src/openboost/_array.py +++ b/src/openboost/_array.py @@ -9,8 +9,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np @@ -70,7 +71,7 @@ def any_categorical(self) -> bool: """Check if any feature is categorical.""" return len(self.is_categorical) > 0 and np.any(self.is_categorical) - def transform(self, X: ArrayLike) -> "BinnedArray": + def transform(self, X: ArrayLike) -> BinnedArray: """Transform new data using the bin edges from this BinnedArray. 
Use this method to transform test/validation data using the same @@ -420,10 +421,7 @@ def _bin_categorical_feature( has_nan = bool(np.any(nan_mask)) # Get unique non-missing values - if has_nan: - valid_values = col[~nan_mask] - else: - valid_values = col + valid_values = col[~nan_mask] if has_nan else col unique_vals = np.unique(valid_values) n_categories = len(unique_vals) diff --git a/src/openboost/_backends/_cpu.py b/src/openboost/_backends/_cpu.py index 5c2c7cc..d160e48 100644 --- a/src/openboost/_backends/_cpu.py +++ b/src/openboost/_backends/_cpu.py @@ -5,7 +5,6 @@ import numpy as np from numba import jit, prange - # ============================================================================= # Histogram Functions # ============================================================================= @@ -589,10 +588,7 @@ def _predict_cpu( threshold = tree_thresholds[node] bin_value = binned[feature, i] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[i] = tree_values[node] @@ -671,10 +667,7 @@ def _predict_cpu_with_missing( # Check for missing value if bin_value == MISSING_BIN: # Use learned direction - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif bin_value <= threshold: node = tree_left[node] else: @@ -716,24 +709,15 @@ def _predict_cpu_with_categorical( # Check for missing value if bin_value == MISSING_BIN: - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif is_categorical_split[node]: # Categorical split: use bitmask bitset = cat_bitsets[node] - if (np.int64(1) << bin_value) & bitset: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if np.int64(1) << bin_value & bitset else 
tree_right[node] else: # Numeric split: use threshold threshold = tree_thresholds[node] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[i] = tree_values[node] diff --git a/src/openboost/_backends/_cuda.py b/src/openboost/_backends/_cuda.py index d7e5ba3..c6f9685 100644 --- a/src/openboost/_backends/_cuda.py +++ b/src/openboost/_backends/_cuda.py @@ -406,10 +406,9 @@ def _argmax_with_values_kernel( # Tree reduction to find global max s = block_size // 2 while s > 0: - if thread_idx < s: - if shared_vals[thread_idx + s] > shared_vals[thread_idx]: - shared_vals[thread_idx] = shared_vals[thread_idx + s] - shared_idxs[thread_idx] = shared_idxs[thread_idx + s] + if thread_idx < s and shared_vals[thread_idx + s] > shared_vals[thread_idx]: + shared_vals[thread_idx] = shared_vals[thread_idx + s] + shared_idxs[thread_idx] = shared_idxs[thread_idx + s] cuda.syncthreads() s //= 2 @@ -1148,10 +1147,7 @@ def _predict_kernel( threshold = tree_thresholds[node] bin_value = binned[feature, sample_idx] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[sample_idx] = tree_values[node] @@ -1235,10 +1231,7 @@ def _predict_with_missing_kernel( # Phase 14.2: Check for missing value if bin_value == 255: # MISSING_BIN - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif bin_value <= threshold: node = tree_left[node] else: @@ -1279,24 +1272,15 @@ def _predict_with_categorical_kernel( # Check for missing value first if bin_value == 255: # MISSING_BIN - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif 
is_categorical_split[node]: # Categorical split: use bitmask bitset = cat_bitsets[node] - if (int64(1) << bin_value) & bitset: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if int64(1) << bin_value & bitset else tree_right[node] else: # Numeric split: use threshold threshold = tree_thresholds[node] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[sample_idx] = tree_values[node] @@ -1949,7 +1933,6 @@ def _find_split_batch_kernel( feature_idx = cuda.blockIdx.x config_idx = cuda.blockIdx.y thread_idx = cuda.threadIdx.x - block_size = cuda.blockDim.x n_features = hist_grad.shape[1] n_configs = hist_grad.shape[0] @@ -2723,11 +2706,10 @@ def _find_level_splits_kernel( # Tree reduction to find global best s = block_size // 2 while s > 0: - if thread_idx < s: - if shared_gains[thread_idx + s] > shared_gains[thread_idx]: - shared_gains[thread_idx] = shared_gains[thread_idx + s] - shared_bins[thread_idx] = shared_bins[thread_idx + s] - shared_features[thread_idx] = shared_features[thread_idx + s] + if thread_idx < s and shared_gains[thread_idx + s] > shared_gains[thread_idx]: + shared_gains[thread_idx] = shared_gains[thread_idx + s] + shared_bins[thread_idx] = shared_bins[thread_idx + s] + shared_features[thread_idx] = shared_features[thread_idx + s] cuda.syncthreads() s //= 2 @@ -3218,10 +3200,9 @@ def _find_symmetric_split_kernel( # Parallel reduction to find max gain within this feature stride = 128 while stride > 0: - if tid < stride: - if shared_gains[tid + stride] > shared_gains[tid]: - shared_gains[tid] = shared_gains[tid + stride] - shared_thresholds[tid] = shared_thresholds[tid + stride] + if tid < stride and shared_gains[tid + stride] > shared_gains[tid]: + shared_gains[tid] = shared_gains[tid + stride] + shared_thresholds[tid] = shared_thresholds[tid + stride] cuda.syncthreads() stride //= 2 @@ -3418,7 
+3399,7 @@ def build_tree_symmetric_gpu_native( # Convert params to float32 reg_lambda_f32 = np.float32(reg_lambda) min_child_weight_f32 = np.float32(min_child_weight) - min_gain_f32 = np.float32(min_gain) + np.float32(min_gain) # Initialize GPU arrays (using module-level kernel to avoid JIT overhead) init_blocks = max(sample_blocks, leaf_blocks, 1) diff --git a/src/openboost/_batch.py b/src/openboost/_batch.py index a77eccb..1678d67 100644 --- a/src/openboost/_batch.py +++ b/src/openboost/_batch.py @@ -6,8 +6,9 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np diff --git a/src/openboost/_boosting.py b/src/openboost/_boosting.py index 3644352..49c8ac9 100644 --- a/src/openboost/_boosting.py +++ b/src/openboost/_boosting.py @@ -4,10 +4,10 @@ from numba import cuda from ._array import _quantile_bin -from ._tree import Tree from ._histogram import build_histograms -from ._split import find_best_splits, compute_leaf_values -from ._kernels import update_sample_nodes_kernel, predict_kernel +from ._kernels import predict_kernel, update_sample_nodes_kernel +from ._split import compute_leaf_values, find_best_splits +from ._tree import Tree class GradientBoosting: diff --git a/src/openboost/_callbacks.py b/src/openboost/_callbacks.py index e752ac6..77f6adb 100644 --- a/src/openboost/_callbacks.py +++ b/src/openboost/_callbacks.py @@ -27,10 +27,8 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any -import numpy as np - if TYPE_CHECKING: - from numpy.typing import NDArray + pass @dataclass @@ -56,7 +54,7 @@ class TrainingState: extra: dict = field(default_factory=dict) -class Callback(ABC): +class Callback(ABC): # noqa: B024 """Base class for training callbacks. Subclass this to create custom callbacks for training hooks. 
@@ -77,22 +75,22 @@ class Callback(ABC): >>> plt.plot(tracker.grad_norms) """ - def on_train_begin(self, state: TrainingState) -> None: + def on_train_begin(self, state: TrainingState) -> None: # noqa: B027 """Called at the start of training. - + Args: state: Current training state. """ pass - - def on_round_begin(self, state: TrainingState) -> None: + + def on_round_begin(self, state: TrainingState) -> None: # noqa: B027 """Called at the start of each boosting round. - + Args: state: Current training state. """ pass - + def on_round_end(self, state: TrainingState) -> bool: """Called at the end of each boosting round. @@ -104,9 +102,9 @@ def on_round_end(self, state: TrainingState) -> bool: """ return True - def on_train_end(self, state: TrainingState) -> None: + def on_train_end(self, state: TrainingState) -> None: # noqa: B027 """Called at the end of training. - + Args: state: Current training state. """ diff --git a/src/openboost/_core/__init__.py b/src/openboost/_core/__init__.py index 23197a9..4faee93 100644 --- a/src/openboost/_core/__init__.py +++ b/src/openboost/_core/__init__.py @@ -6,50 +6,47 @@ - fit_tree: main entry point for building trees """ +from ._growth import ( + GrowthConfig, + GrowthStrategy, + LeafValues, + LeafWiseGrowth, + LevelWiseGrowth, + ScalarLeaves, + SymmetricGrowth, + TreeStructure, + VectorLeaves, + get_growth_strategy, +) +from ._histogram import build_histogram +from ._predict import predict_ensemble from ._primitives import ( NodeHistogram, NodeSplit, build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, compute_leaf_values, - init_sample_node_ids, - get_nodes_at_depth, + find_node_splits, get_children, + get_nodes_at_depth, get_parent, + init_sample_node_ids, + partition_samples, + subtract_histogram, ) - -from ._growth import ( - GrowthConfig, - GrowthStrategy, - TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, - get_growth_strategy, - LeafValues, - ScalarLeaves, - 
VectorLeaves, -) - +from ._split import SplitInfo, compute_leaf_value, find_best_split from ._tree import ( - fit_tree, - fit_trees_batch, + SymmetricTree, Tree, TreeNode, - SymmetricTree, + fit_tree, + fit_tree_gpu_native, fit_tree_symmetric, fit_tree_symmetric_gpu_native, - fit_tree_gpu_native, - predict_tree, + fit_trees_batch, predict_symmetric_tree, + predict_tree, ) -from ._histogram import build_histogram, subtract_histogram as hist_subtract -from ._split import find_best_split, compute_leaf_value, SplitInfo -from ._predict import predict_ensemble - __all__ = [ # Primitives "NodeHistogram", diff --git a/src/openboost/_core/_growth.py b/src/openboost/_core/_growth.py index b45cf67..d126d89 100644 --- a/src/openboost/_core/_growth.py +++ b/src/openboost/_core/_growth.py @@ -20,19 +20,18 @@ import numpy as np -from .._backends import is_cuda from .._array import MISSING_BIN +from .._backends import is_cuda from ._primitives import ( NodeHistogram, NodeSplit, build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, compute_leaf_values, - init_sample_node_ids, + find_node_splits, get_nodes_at_depth, - get_children, + init_sample_node_ids, + partition_samples, + subtract_histogram, ) if TYPE_CHECKING: @@ -94,7 +93,7 @@ def values(self) -> NDArray: return self._values @classmethod - def zeros(cls, n_nodes: int) -> "ScalarLeaves": + def zeros(cls, n_nodes: int) -> ScalarLeaves: """Create zero-initialized scalar leaves.""" return cls(_values=np.zeros(n_nodes, dtype=np.float32)) @@ -127,7 +126,7 @@ def values(self) -> NDArray: return self._values @classmethod - def zeros(cls, n_nodes: int, n_outputs: int) -> "VectorLeaves": + def zeros(cls, n_nodes: int, n_outputs: int) -> VectorLeaves: """Create zero-initialized vector leaves.""" return cls( _values=np.zeros((n_nodes, n_outputs), dtype=np.float32), @@ -277,10 +276,7 @@ def __call__(self, X) -> NDArray: """ # Handle BinnedArray from .._array import BinnedArray - if isinstance(X, 
BinnedArray): - binned = X.data - else: - binned = X + binned = X.data if isinstance(X, BinnedArray) else X return self.predict(binned) def _predict_standard(self, binned: NDArray) -> NDArray: @@ -324,10 +320,7 @@ def _predict_standard_cpu(self, binned: NDArray) -> NDArray: # Check bitmask membership: bit[bin_value] == 1 means go left bitset = self.cat_bitsets[node] goes_left = (bitset >> bin_value) & 1 - if goes_left: - node = self.left_children[node] - else: - node = self.right_children[node] + node = self.left_children[node] if goes_left else self.right_children[node] # Standard ordinal split elif bin_value <= threshold: node = self.left_children[node] diff --git a/src/openboost/_core/_histogram.py b/src/openboost/_core/_histogram.py index 6e28719..135abf3 100644 --- a/src/openboost/_core/_histogram.py +++ b/src/openboost/_core/_histogram.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from numpy.typing import NDArray + from .._array import BinnedArray diff --git a/src/openboost/_core/_predict.py b/src/openboost/_core/_predict.py index 9e3d8b0..3f8f577 100644 --- a/src/openboost/_core/_predict.py +++ b/src/openboost/_core/_predict.py @@ -6,6 +6,7 @@ from __future__ import annotations +import contextlib from typing import TYPE_CHECKING import numpy as np @@ -14,9 +15,10 @@ from .._backends import is_cuda if TYPE_CHECKING: - from ._tree import Tree from numpy.typing import NDArray + from ._tree import Tree + def predict_ensemble( trees: list[Tree], @@ -203,10 +205,7 @@ def kernel(X_binned, node_features, node_thresholds, node_left, node_right, while node_features[node] >= 0: # Not a leaf feat = node_features[node] val = X_binned[feat, idx] # Feature-major layout - if val <= node_thresholds[node]: - node = node_left[node] - else: - node = node_right[node] + node = node_left[node] if val <= node_thresholds[node] else node_right[node] # Add leaf value to prediction pred[idx] += learning_rate * node_values[node] @@ -217,8 +216,6 @@ def kernel(X_binned, node_features, 
node_thresholds, node_left, node_right, # Initialize kernel at module load if CUDA available if is_cuda(): - try: + with contextlib.suppress(Exception): _predict_tree_add_kernel = _get_predict_tree_add_kernel() - except Exception: - pass diff --git a/src/openboost/_core/_primitives.py b/src/openboost/_core/_primitives.py index db7e8af..32fc1bf 100644 --- a/src/openboost/_core/_primitives.py +++ b/src/openboost/_core/_primitives.py @@ -25,8 +25,8 @@ import numpy as np -from .._backends import is_cuda from .._array import MISSING_BIN +from .._backends import is_cuda from ._split import SplitInfo if TYPE_CHECKING: @@ -174,12 +174,6 @@ def _build_node_histograms_gpu( Uses the optimized shared memory histogram kernel from Phase 6.3. """ - from numba import cuda - import math - from .._backends._cuda import ( - _build_histogram_shared_kernel, - _zero_level_histograms_kernel, - ) n_features, n_samples = binned.shape @@ -216,8 +210,10 @@ def _build_node_histograms_gpu_contiguous( n_nodes: int, ) -> dict[int, NodeHistogram]: """GPU histogram building for contiguous node range.""" - from numba import cuda import math + + from numba import cuda + from .._backends._cuda import ( _build_histogram_shared_kernel, _zero_level_histograms_kernel, @@ -267,7 +263,7 @@ def _build_node_histograms_gpu_contiguous( sample_node_ids_cpu = sample_node_ids.copy_to_host() result = {} - for i, node_id in enumerate(range(level_start, level_end)): + for _i, node_id in enumerate(range(level_start, level_end)): node_hist = histograms_cpu[node_id] hist_grad = node_hist[:, :, 0] # (n_features, 256) hist_hess = node_hist[:, :, 1] @@ -297,6 +293,7 @@ def _build_node_histograms_gpu_sparse( ) -> dict[int, NodeHistogram]: """GPU histogram building for non-contiguous nodes (leaf-wise).""" from numba import cuda + from .._backends._cuda import build_histogram_cuda, gather_cuda # For sparse node sets, build each node separately @@ -426,7 +423,11 @@ def find_node_splits( >>> for node_id, split in 
splits.items(): ... print(f"Node {node_id}: split on feature {split.split.feature}") """ - from ._split import find_best_split, find_best_split_with_missing, find_best_split_with_categorical + from ._split import ( + find_best_split, + find_best_split_with_categorical, + find_best_split_with_missing, + ) result = {} @@ -596,13 +597,14 @@ def _partition_samples_gpu( sample_node_ids, splits: dict[int, NodeSplit], missing_go_left: NDArray | None = None, -) -> "DeviceNDArray": +) -> DeviceNDArray: """GPU implementation of sample partitioning. Phase 14: Handles missing values (bin 255) using learned direction. """ - from numba import cuda import math + + from numba import cuda n_samples = sample_node_ids.shape[0] @@ -660,7 +662,7 @@ def _init_partition_kernel_with_missing(): if _partition_kernel_with_missing is not None: return - from numba import cuda, int32, uint8 + from numba import cuda, int32 @cuda.jit def kernel(binned, old_node_ids, new_node_ids, diff --git a/src/openboost/_core/_split.py b/src/openboost/_core/_split.py index 2b0bff6..029e0cd 100644 --- a/src/openboost/_core/_split.py +++ b/src/openboost/_core/_split.py @@ -11,7 +11,6 @@ import numpy as np from .._backends import is_cuda -from .._array import MISSING_BIN if TYPE_CHECKING: from numpy.typing import NDArray @@ -184,7 +183,6 @@ def find_best_split_with_missing( total_hess = float(_sum_histogram(hist_hess)) # Check if any feature has missing values - n_features = hist_grad.shape[0] any_missing = has_missing is not None and np.any(has_missing) # If no missing values, use standard split finding diff --git a/src/openboost/_core/_tree.py b/src/openboost/_core/_tree.py index 8588062..1c2f607 100644 --- a/src/openboost/_core/_tree.py +++ b/src/openboost/_core/_tree.py @@ -17,15 +17,12 @@ GrowthConfig, GrowthStrategy, TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, get_growth_strategy, ) # Legacy imports for backward compatibility with internal code from
._histogram import build_histogram, subtract_histogram -from ._split import SplitInfo, compute_leaf_value, find_best_split +from ._split import compute_leaf_value, find_best_split if TYPE_CHECKING: from numpy.typing import NDArray @@ -74,11 +71,11 @@ class Tree: _right: NDArray | None = field(default=None, repr=False) # GPU arrays for fast GPU training (Phase 5.1) - _features_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _thresholds_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _values_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _left_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _right_gpu: "DeviceNDArray | None" = field(default=None, repr=False) + _features_gpu: DeviceNDArray | None = field(default=None, repr=False) + _thresholds_gpu: DeviceNDArray | None = field(default=None, repr=False) + _values_gpu: DeviceNDArray | None = field(default=None, repr=False) + _left_gpu: DeviceNDArray | None = field(default=None, repr=False) + _right_gpu: DeviceNDArray | None = field(default=None, repr=False) @property def on_gpu(self) -> bool: @@ -395,10 +392,7 @@ def fit_tree( subsample_mask = None # Get growth strategy - if isinstance(growth, str): - strategy = get_growth_strategy(growth) - else: - strategy = growth + strategy = get_growth_strategy(growth) if isinstance(growth, str) else growth # Build config config = GrowthConfig( @@ -835,7 +829,7 @@ def fit_trees_batch( >>> >>> # all_trees[0] contains trees for first config, etc. 
""" - from .._models._batch import ConfigBatch, BatchTrainingState + from .._models._batch import BatchTrainingState, ConfigBatch if not isinstance(configs, ConfigBatch): raise TypeError(f"configs must be ConfigBatch, got {type(configs)}") @@ -854,7 +848,6 @@ def fit_trees_batch( hess = as_numba_array(hess) n_configs = configs.n_configs - n_rounds = configs.n_rounds # Initialize training state state = BatchTrainingState.create(n_configs, n_samples) @@ -892,7 +885,7 @@ def _fit_trees_batch_cpu( for round_idx in range(n_rounds): # Compute gradients from current predictions # Note: User provides initial grad/hess, subsequent rounds recompute - if round_idx > 0: + if round_idx > 0: # noqa: SIM108 # Recompute MSE gradients from current predictions. # Initial grad = 2*(0 - y) = -2y, so for MSE: # grad_new = 2*(pred - y) = 2*pred + initial_grad @@ -934,13 +927,7 @@ def _fit_trees_batch_cuda( ) -> list[list[Tree]]: """CUDA batch training using fused kernels.""" from numba import cuda - from .._backends._cuda import ( - build_histogram_batch_cuda, - find_best_split_batch_cuda, - compute_split_masks_batch_cuda, - reduce_sum_cuda, - to_device, - ) + n_configs = configs.n_configs n_rounds = configs.n_rounds @@ -1222,10 +1209,7 @@ def predict_symmetric_tree(tree: SymmetricTree, X: BinnedArray | NDArray) -> NDA Prediction is just bit operations - very fast! """ - if isinstance(X, BinnedArray): - binned = X.data - else: - binned = X + binned = X.data if isinstance(X, BinnedArray) else X use_gpu = is_cuda() and hasattr(binned, '__cuda_array_interface__') diff --git a/src/openboost/_distributed/__init__.py b/src/openboost/_distributed/__init__.py index 38f1555..0901ef6 100644 --- a/src/openboost/_distributed/__init__.py +++ b/src/openboost/_distributed/__init__.py @@ -4,9 +4,9 @@ Phase 18: Adds multi-GPU support via Ray actors. 
""" -from typing import Protocol, Any +from typing import Any, Protocol + from numpy.typing import NDArray -import numpy as np class DistributedContext(Protocol): @@ -29,13 +29,12 @@ def partition_data(self, X: NDArray, y: NDArray) -> tuple[NDArray, NDArray]: # Phase 18: Multi-GPU support from ._multigpu import ( - GPUWorkerBase, GPUWorker, + GPUWorkerBase, MultiGPUContext, fit_tree_multigpu, ) - __all__ = [ "DistributedContext", # Phase 18: Multi-GPU diff --git a/src/openboost/_distributed/_multigpu.py b/src/openboost/_distributed/_multigpu.py index bbe3dc0..d8ed219 100644 --- a/src/openboost/_distributed/_multigpu.py +++ b/src/openboost/_distributed/_multigpu.py @@ -22,7 +22,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, List, Optional +from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray @@ -59,7 +59,7 @@ def __init__( X_shard: NDArray, y_shard: NDArray, n_bins: int, - bin_edges: Optional[NDArray] = None, + bin_edges: NDArray | None = None, ): """Initialize worker with data shard on assigned GPU. @@ -164,7 +164,7 @@ def build_histogram( self, grad: NDArray, hess: NDArray, - node_ids: Optional[List[int]] = None, + node_ids: list[int] | None = None, ) -> tuple[NDArray, NDArray]: """Build local histogram for this shard. 
@@ -180,7 +180,6 @@ def build_histogram( or dict mapping node_id to histogram """ from .._core._histogram import build_histogram - from .._array import as_numba_array # Get binned data binned = self.X_binned.data @@ -240,7 +239,7 @@ def get_n_samples(self) -> int: """Get number of samples in this shard.""" return self.n_samples - def get_bin_edges(self) -> Optional[NDArray]: + def get_bin_edges(self) -> NDArray | None: """Get bin edges used by this worker (for consistent binning).""" if hasattr(self.X_binned, 'bin_edges'): return self.X_binned.bin_edges @@ -248,10 +247,7 @@ def get_bin_edges(self) -> Optional[NDArray]: # Create Ray remote version if Ray is available -if ray is not None: - GPUWorker = ray.remote(num_gpus=1)(GPUWorkerBase) -else: - GPUWorker = GPUWorkerBase +GPUWorker = ray.remote(num_gpus=1)(GPUWorkerBase) if ray is not None else GPUWorkerBase # ============================================================================= @@ -291,11 +287,11 @@ class MultiGPUContext: """ n_gpus: int = None - devices: List[int] = None - workers: List[Any] = None + devices: list[int] = None + workers: list[Any] = None n_features: int = None n_samples: int = None - shard_sizes: List[int] = None + shard_sizes: list[int] = None bin_edges: NDArray = None def __post_init__(self): @@ -334,7 +330,7 @@ def setup( X: NDArray, y: NDArray, n_bins: int = 256, - bin_edges: Optional[NDArray] = None, + bin_edges: NDArray | None = None, ): """Shard data and create GPU workers. @@ -377,7 +373,7 @@ def setup( # Create workers self.workers = [] - for gpu_id, shard_indices in zip(self.devices, indices): + for gpu_id, shard_indices in zip(self.devices, indices, strict=False): X_shard = X[shard_indices] y_shard = y[shard_indices] @@ -393,7 +389,7 @@ def setup( def compute_all_gradients( self, loss_fn: LossFunction, - ) -> List[tuple[NDArray, NDArray]]: + ) -> list[tuple[NDArray, NDArray]]: """Compute gradients on all workers in parallel. 
Args: @@ -410,8 +406,8 @@ def compute_all_gradients( def build_all_histograms( self, - grads_hess: List[tuple[NDArray, NDArray]], - ) -> List[tuple[NDArray, NDArray]]: + grads_hess: list[tuple[NDArray, NDArray]], + ) -> list[tuple[NDArray, NDArray]]: """Build local histograms on all workers in parallel. Args: @@ -422,13 +418,13 @@ def build_all_histograms( """ hist_refs = [ worker.build_histogram.remote(grad, hess) - for worker, (grad, hess) in zip(self.workers, grads_hess) + for worker, (grad, hess) in zip(self.workers, grads_hess, strict=False) ] return ray.get(hist_refs) def aggregate_histograms( self, - local_histograms: List[tuple[NDArray, NDArray]], + local_histograms: list[tuple[NDArray, NDArray]], ) -> tuple[NDArray, NDArray]: """Sum histograms from all workers (AllReduce). @@ -492,7 +488,7 @@ def shutdown(self): def fit_tree_multigpu( ctx: MultiGPUContext, - grads_hess: List[tuple[NDArray, NDArray]], + grads_hess: list[tuple[NDArray, NDArray]], *, max_depth: int = 6, min_child_weight: float = 1.0, @@ -522,9 +518,6 @@ def fit_tree_multigpu( Returns: Fitted TreeStructure """ - from .._core._tree import fit_tree - from .._array import BinnedArray - import openboost as ob # Build local histograms on each GPU local_histograms = ctx.build_all_histograms(grads_hess) @@ -541,7 +534,7 @@ def fit_tree_multigpu( total_hess = np.zeros(ctx.n_samples, dtype=np.float32) offset = 0 - for (grad, hess), size in zip(grads_hess, ctx.shard_sizes): + for (grad, hess), size in zip(grads_hess, ctx.shard_sizes, strict=False): total_grad[offset:offset + size] = grad total_hess[offset:offset + size] = hess offset += size @@ -549,7 +542,7 @@ def fit_tree_multigpu( # Create a dummy BinnedArray for tree fitting # In practice, we'd want to do distributed tree building # For now, collect data to driver and fit there - all_preds = ray.get([w.get_predictions.remote() for w in ctx.workers]) + ray.get([w.get_predictions.remote() for w in ctx.workers]) # Use the global histogram for tree 
building # This is where we'd integrate with fit_tree_from_histogram @@ -558,11 +551,10 @@ # Get binned data from first worker for structure # NOTE: This is a simplification - full implementation would do # distributed tree building with sample partitioning - first_worker_bin_edges = ray.get(ctx.workers[0].get_bin_edges.remote()) + ray.get(ctx.workers[0].get_bin_edges.remote()) # Build tree using growth strategy with global histogram - from .._core._growth import LevelWiseGrowth, GrowthConfig - from .._core._primitives import NodeHistogram + from .._core._growth import GrowthConfig config = GrowthConfig( max_depth=max_depth, @@ -604,9 +596,8 @@ def _build_tree_from_global_histogram( TreeStructure """ from .._core._growth import TreeStructure - from .._core._split import find_best_split, compute_leaf_value + from .._core._split import compute_leaf_value, find_best_split - n_bins = hist_grad.shape[1] max_nodes = 2**(config.max_depth + 1) - 1 # Initialize tree arrays @@ -695,7 +686,7 @@ def _build_tree_from_global_histogram( left_hist_grad = np.zeros_like(h_grad) left_hist_hess = np.zeros_like(h_hess) - for f in range(n_features): + for _f in range(n_features): # For the split feature, partition bins at the threshold # For all other features, we need the full histogram # conditioned on left/right. With only histogram information @@ -716,8 +707,6 @@ # Left child: bins <= threshold for the split feature left_sf_grad = float(np.sum(h_grad[sf, :t + 1])) left_sf_hess = float(np.sum(h_hess[sf, :t + 1])) - right_sf_grad = s_grad - left_sf_grad - right_sf_hess = s_hess - left_sf_hess # For each feature, split histogram bins proportionally based on # the split feature's left/right ratio.
diff --git a/src/openboost/_distributed/_ray.py b/src/openboost/_distributed/_ray.py index a2c7034..d06473a 100644 --- a/src/openboost/_distributed/_ray.py +++ b/src/openboost/_distributed/_ray.py @@ -3,7 +3,8 @@ Phase 12: Implements distributed training using Ray for multi-GPU/multi-node. """ -from typing import Any, List, Dict +from typing import Any + import numpy as np from numpy.typing import NDArray @@ -42,7 +43,7 @@ def __init__(self, X_shard: NDArray, y_shard: NDArray, n_bins: int, self.pred = np.zeros(self.n_samples, dtype=np.float32) def compute_histograms(self, grad: NDArray, hess: NDArray, - node_ids: List[int]) -> Dict[int, Any]: + node_ids: list[int]) -> dict[int, Any]: """Compute local histograms for this shard.""" histograms = build_node_histograms( self.X_binned.data if hasattr(self.X_binned, 'data') else self.X_binned, @@ -120,10 +121,10 @@ def setup(self, X: NDArray, y: NDArray, n_bins: int): self.workers = [ RayWorker.remote(s, ys, n_bins, bin_edges=global_bin_edges) - for s, ys in zip(shards, y_shards) + for s, ys in zip(shards, y_shards, strict=False) ] - def allreduce_histograms(self, local_hists_refs: List[Any]) -> Dict[int, Any]: + def allreduce_histograms(self, local_hists_refs: list[Any]) -> dict[int, Any]: """Sum histograms from all workers.""" local_hists = ray.get(local_hists_refs) diff --git a/src/openboost/_distributed/_tree.py b/src/openboost/_distributed/_tree.py index 3acb8af..bdc2562 100644 --- a/src/openboost/_distributed/_tree.py +++ b/src/openboost/_distributed/_tree.py @@ -3,7 +3,8 @@ Phase 12: Implements distributed tree building using histogram aggregation. 
""" -from typing import Any, List, Optional +from typing import Any + import numpy as np try: @@ -11,20 +12,15 @@ except ImportError: ray = None -from openboost._core._growth import TreeStructure, GrowthConfig -from openboost._core._primitives import ( - find_node_splits, - compute_leaf_values, - NodeHistogram, - NodeSplit -) +from openboost._core._growth import TreeStructure +from openboost._core._primitives import find_node_splits def fit_tree_distributed( ctx: Any, # DistributedContext - workers: List[Any], - grad_refs: List[Any], # Ray object refs - hess_refs: List[Any], + workers: list[Any], + grad_refs: list[Any], # Ray object refs + hess_refs: list[Any], *, max_depth: int = 6, min_child_weight: float = 1.0, @@ -42,7 +38,7 @@ def fit_tree_distributed( ) # 1. Initialize - sample_node_ids_refs = [w.init_node_ids.remote() for w in workers] + [w.init_node_ids.remote() for w in workers] n_features = get_worker_n_features(workers[0]) @@ -67,7 +63,7 @@ def fit_tree_distributed( # 3. Compute local histograms local_hists_refs = [ w.compute_histograms.remote(g, h, active_nodes) - for w, g, h in zip(workers, grad_refs, hess_refs) + for w, g, h in zip(workers, grad_refs, hess_refs, strict=False) ] # 4. Aggregate histograms @@ -103,14 +99,13 @@ def fit_tree_distributed( # 8. 
Compute leaf values leaf_nodes = [] for i in range(max_nodes): - if left_children[i] == -1: - if i == 0 or features[(i-1)//2] >= 0: - leaf_nodes.append(i) + if left_children[i] == -1 and (i == 0 or features[(i-1)//2] >= 0): + leaf_nodes.append(i) if leaf_nodes: local_hists_refs = [ w.compute_histograms.remote(g, h, leaf_nodes) - for w, g, h in zip(workers, grad_refs, hess_refs) + for w, g, h in zip(workers, grad_refs, hess_refs, strict=False) ] leaf_histograms = ctx.allreduce_histograms(local_hists_refs) diff --git a/src/openboost/_distributions.py b/src/openboost/_distributions.py index 4405bec..e24bb86 100644 --- a/src/openboost/_distributions.py +++ b/src/openboost/_distributions.py @@ -21,16 +21,12 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Tuple import numpy as np from numpy.typing import NDArray -from ._backends import is_cuda - - # Type alias for gradient/hessian tuple -GradHess = Tuple[NDArray, NDArray] +GradHess = tuple[NDArray, NDArray] @dataclass @@ -42,7 +38,7 @@ class DistributionOutput: distribution: The Distribution instance used """ params: dict[str, NDArray] - distribution: "Distribution" + distribution: Distribution def mean(self) -> NDArray: """Expected value E[Y|X].""" @@ -60,7 +56,7 @@ def quantile(self, q: float) -> NDArray: """q-th quantile (0 < q < 1).""" return self.distribution.quantile(self.params, q) - def interval(self, alpha: float = 0.1) -> Tuple[NDArray, NDArray]: + def interval(self, alpha: float = 0.1) -> tuple[NDArray, NDArray]: """(1-alpha) prediction interval. 
Args: @@ -1272,7 +1268,7 @@ def init_params(self, y: NDArray) -> dict[str, float]: var_y = float(np.var(y_clip)) + 1e-6 # Estimate r from method of moments - if var_y > mu_init: + if var_y > mu_init: # noqa: SIM108 r_init = mu_init ** 2 / (var_y - mu_init) else: r_init = 10.0 # Default if not overdispersed @@ -1570,7 +1566,6 @@ def _numerical_gradient( """ results = {} eps = self._eps - n = len(y) # Compute center NLL once nll_center = self._nll_fn(y, params) @@ -1617,7 +1612,7 @@ def _jax_gradient( # Define loss for a single sample def single_nll(param_values, y_single): - params_dict = {name: jnp.array([val]) for name, val in zip(self._param_names, param_values)} + params_dict = {name: jnp.array([val]) for name, val in zip(self._param_names, param_values, strict=False)} return self._nll_fn(jnp.array([y_single]), params_dict)[0] # Create grad and hessian functions once (cached pattern) @@ -1662,7 +1657,7 @@ def single_nll(param_values, y_single): results[name][0][i] = float(g[j]) results[name][1][i] = max(float(h[j, j]), 1e-6) except Exception: - for j, name in enumerate(self._param_names): + for _j, name in enumerate(self._param_names): results[name][0][i] = 0.0 results[name][1][i] = 1.0 diff --git a/src/openboost/_histogram.py b/src/openboost/_histogram.py index ca4c99b..6cb5daf 100644 --- a/src/openboost/_histogram.py +++ b/src/openboost/_histogram.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from ._kernels import histogram_kernel, HIST_BLOCK_SIZE, MAX_BINS +from ._kernels import HIST_BLOCK_SIZE, MAX_BINS, histogram_kernel def build_histograms( diff --git a/src/openboost/_importance.py b/src/openboost/_importance.py index 47bdd9e..39c4e68 100644 --- a/src/openboost/_importance.py +++ b/src/openboost/_importance.py @@ -21,9 +21,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable - import warnings +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable import numpy as np
@@ -237,7 +236,7 @@ def get_feature_importance_dict( feature_names = [f"feature_{i}" for i in range(len(importances))] # Create dict and sort by importance - importance_dict = dict(zip(feature_names, importances)) + importance_dict = dict(zip(feature_names, importances, strict=False)) sorted_dict = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)) # Limit to top N if requested @@ -274,8 +273,8 @@ def plot_feature_importances( """ try: import matplotlib.pyplot as plt - except ImportError: - raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib") + except ImportError as err: + raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib") from err importances = compute_feature_importances(model, importance_type, normalize=True) diff --git a/src/openboost/_kernels.py b/src/openboost/_kernels.py index 0e3c648..887cc0b 100644 --- a/src/openboost/_kernels.py +++ b/src/openboost/_kernels.py @@ -118,11 +118,10 @@ def find_best_split_kernel( # Tree reduction to find best s = cuda.blockDim.x // 2 while s > 0: - if tid < s: - if shared_gain[tid + s] > shared_gain[tid]: - shared_gain[tid] = shared_gain[tid + s] - shared_feature[tid] = shared_feature[tid + s] - shared_bin[tid] = shared_bin[tid + s] + if tid < s and shared_gain[tid + s] > shared_gain[tid]: + shared_gain[tid] = shared_gain[tid + s] + shared_feature[tid] = shared_feature[tid + s] + shared_bin[tid] = shared_bin[tid + s] cuda.syncthreads() s //= 2 @@ -185,10 +184,7 @@ def predict_kernel( node = 0 while not tree_is_leaf[node]: feature = tree_features[node] - if X_binned[feature, idx] <= tree_bins[node]: - node = 2 * node + 1 - else: - node = 2 * node + 2 + node = 2 * node + 1 if X_binned[feature, idx] <= tree_bins[node] else 2 * node + 2 predictions[idx] += learning_rate * tree_values[node] diff --git a/src/openboost/_loss.py b/src/openboost/_loss.py index 790ce59..e649442 100644 --- a/src/openboost/_loss.py +++ 
b/src/openboost/_loss.py @@ -7,7 +7,8 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Callable +from collections.abc import Callable +from typing import TYPE_CHECKING import numpy as np @@ -288,8 +289,9 @@ def _logloss_gradient_gpu(pred, y): def _get_logloss_kernel(): """Lazily compile LogLoss gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -669,8 +671,9 @@ def _poisson_gradient_gpu(pred, y): def _get_poisson_kernel(): """Lazily compile Poisson gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -759,8 +762,9 @@ def _gamma_gradient_gpu(pred, y): def _get_gamma_kernel(): """Lazily compile Gamma gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -863,8 +867,9 @@ def _tweedie_gradient_gpu(pred, y, rho: float = 1.5): def _get_tweedie_kernel(): """Lazily compile Tweedie gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n, rho): @@ -876,7 +881,7 @@ def kernel(pred, y, grad, hess, n, rho): elif p < -20: p = -20.0 - mu = math.exp(p) + math.exp(p) # mu^(2-rho) and mu^(1-rho) via exp mu_2_rho = math.exp(p * (2.0 - rho)) @@ -955,10 +960,7 @@ def _softmax_gradient_gpu(pred, y, n_classes: int): else: pred_cpu = np.asarray(pred, dtype=np.float32) - if hasattr(y, 'copy_to_host'): - y_cpu = y.copy_to_host() - else: - y_cpu = np.asarray(y) + y_cpu = y.copy_to_host() if hasattr(y, 'copy_to_host') else np.asarray(y) return _softmax_gradient_cpu(pred_cpu, y_cpu, n_classes) diff --git a/src/openboost/_models/__init__.py b/src/openboost/_models/__init__.py index 2618145..b145847 100644 --- a/src/openboost/_models/__init__.py +++ b/src/openboost/_models/__init__.py @@ -8,42 +8,42 @@ Phase 16: Renamed NGBoost -> NaturalBoost for 
clarity. """ +from ._batch import BatchTrainingState, ConfigBatch from ._boosting import GradientBoosting, MultiClassGradientBoosting from ._dart import DART -from ._gam import OpenBoostGAM -from ._batch import ConfigBatch, BatchTrainingState -from ._sklearn import ( - OpenBoostRegressor, - OpenBoostClassifier, - OpenBoostDistributionalRegressor, - OpenBoostLinearLeafRegressor, -) # Phase 15/16: Distributional GBDT and NaturalBoost from ._distributional import ( DistributionalGBDT, # Primary names (Phase 16) NaturalBoost, - NaturalBoostNormal, - NaturalBoostLogNormal, NaturalBoostGamma, + NaturalBoostLogNormal, + NaturalBoostNegBin, + NaturalBoostNormal, NaturalBoostPoisson, NaturalBoostStudentT, NaturalBoostTweedie, - NaturalBoostNegBin, # Backward compatibility aliases NGBoost, - NGBoostNormal, - NGBoostLogNormal, NGBoostGamma, + NGBoostLogNormal, + NGBoostNegBin, + NGBoostNormal, NGBoostPoisson, NGBoostStudentT, NGBoostTweedie, - NGBoostNegBin, ) +from ._gam import OpenBoostGAM # Phase 15: Linear Leaf GBDT from ._linear_leaf import LinearLeafGBDT, LinearLeafTree +from ._sklearn import ( + OpenBoostClassifier, + OpenBoostDistributionalRegressor, + OpenBoostLinearLeafRegressor, + OpenBoostRegressor, +) __all__ = [ # Standard GBDT diff --git a/src/openboost/_models/_batch.py b/src/openboost/_models/_batch.py index 5417c85..604f06c 100644 --- a/src/openboost/_models/_batch.py +++ b/src/openboost/_models/_batch.py @@ -6,8 +6,9 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np diff --git a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index a9a1e7a..1f263f4 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -15,34 +15,29 @@ from __future__ import annotations import os +from collections.abc import Callable from dataclasses import dataclass, 
field -from pathlib import Path -from typing import TYPE_CHECKING, Callable, Literal +from typing import TYPE_CHECKING, Literal import numpy as np from .._array import BinnedArray, array from .._backends import is_cuda -from .._loss import get_loss_function, compute_loss_value, LossFunction -from .._core._tree import fit_tree -from .._core._growth import TreeStructure from .._callbacks import Callback, CallbackManager, TrainingState +from .._core._growth import TreeStructure +from .._core._tree import fit_tree +from .._loss import LossFunction, compute_loss_value, get_loss_function +from .._persistence import PersistenceMixin from .._sampling import ( - SamplingStrategy, goss_sample, - random_sample, - apply_sampling, - MiniBatchIterator, - accumulate_histograms_minibatch, ) -from .._persistence import PersistenceMixin from .._validation import ( - validate_X, - validate_y, - validate_sample_weight, validate_eval_set, validate_hyperparameters, validate_predict_input, + validate_sample_weight, + validate_X, + validate_y, ) try: @@ -310,9 +305,8 @@ def _fit_distributed(self, y: NDArray, n_samples: int): ctx.setup(X_data, y, self.n_bins) - import ray - for i in range(self.n_trees): + for _i in range(self.n_trees): # Compute gradients on each worker grad_hess_refs = [ w.compute_gradients.options(num_returns=2).remote(self._loss_fn) @@ -385,7 +379,6 @@ def _fit_multigpu( ctx.setup(X_data, y, n_bins=self.n_bins) try: - import ray # Training loop for i in range(self.n_trees): @@ -408,8 +401,6 @@ def _fit_multigpu( # For proper tree building, we need the full binned data # Use a simplified approach: fit tree on driver with full histogram info - from .._core._growth import TreeStructure, GrowthConfig - from .._core._split import find_best_split, compute_leaf_value tree = self._build_tree_from_histogram( global_hist_grad, @@ -457,7 +448,7 @@ def _build_tree_from_histogram( Uses recursive histogram-based tree building similar to LightGBM. 
""" from .._core._growth import TreeStructure - from .._core._split import find_best_split, compute_leaf_value + from .._core._split import compute_leaf_value, find_best_split max_nodes = 2**(self.max_depth + 1) - 1 features = np.full(max_nodes, -1, dtype=np.int32) @@ -800,7 +791,6 @@ def _fit_cpu( # Determine sampling strategy use_goss = self.subsample_strategy == 'goss' use_random_sampling = self.subsample_strategy == 'random' and self.subsample < 1.0 - use_minibatch = self.batch_size is not None and self.batch_size < n_samples # Train trees for i in range(self.n_trees): @@ -1080,7 +1070,7 @@ class MultiClassGradientBoosting(PersistenceMixin): X_binned_: BinnedArray | None = field(default=None, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "MultiClassGradientBoosting": + def fit(self, X: NDArray, y: NDArray) -> MultiClassGradientBoosting: """Fit the multi-class gradient boosting model. Args: diff --git a/src/openboost/_models/_dart.py b/src/openboost/_models/_dart.py index eb8d33e..27741c7 100644 --- a/src/openboost/_models/_dart.py +++ b/src/openboost/_models/_dart.py @@ -20,10 +20,9 @@ import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda -from .._core._growth import TreeStructure, GrowthConfig -from .._loss import get_loss_function, LossFunction +from .._core._growth import TreeStructure from .._core._tree import fit_tree +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -86,7 +85,7 @@ class DART(PersistenceMixin): _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) _rng: np.random.Generator | None = field(default=None, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "DART": + def fit(self, X: NDArray, y: NDArray) -> DART: """Fit the DART model. 
Args: @@ -125,7 +124,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DART": pred = np.full(n_samples, self.base_score_, dtype=np.float32) # Train trees - for i in range(self.n_trees): + for _i in range(self.n_trees): # Decide whether to apply dropout this round apply_dropout = ( len(self.trees_) > 0 and @@ -176,7 +175,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DART": base = getattr(self, 'base_score_', np.float32(0.0)) pred = np.full(n_samples_tmp, base, dtype=np.float32) excluded_set = set(dropped_indices) - for t_i, (t, w) in enumerate(zip(self.trees_, self.tree_weights_)): + for t_i, (t, w) in enumerate(zip(self.trees_, self.tree_weights_, strict=False)): t_pred = t(self.X_binned_) if hasattr(t_pred, 'copy_to_host'): t_pred = t_pred.copy_to_host() @@ -231,7 +230,7 @@ def _predict_without_trees( excluded_set = set(excluded_indices) - for i, (tree, weight) in enumerate(zip(self.trees_, self.tree_weights_)): + for i, (tree, weight) in enumerate(zip(self.trees_, self.tree_weights_, strict=False)): if i in excluded_set: continue tree_pred = tree(X) @@ -247,7 +246,7 @@ def _predict_internal(self, X: BinnedArray) -> NDArray: base = getattr(self, 'base_score_', np.float32(0.0)) pred = np.full(n_samples, base, dtype=np.float32) - for tree, weight in zip(self.trees_, self.tree_weights_): + for tree, weight in zip(self.trees_, self.tree_weights_, strict=False): tree_pred = tree(X) if hasattr(tree_pred, 'copy_to_host'): tree_pred = tree_pred.copy_to_host() diff --git a/src/openboost/_models/_distributional.py b/src/openboost/_models/_distributional.py index 8ab7e7b..087329f 100644 --- a/src/openboost/_models/_distributional.py +++ b/src/openboost/_models/_distributional.py @@ -36,20 +36,18 @@ from __future__ import annotations from dataclasses import dataclass, field -from pathlib import Path from typing import TYPE_CHECKING, Literal import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda +from .._core._growth import TreeStructure 
+from .._core._tree import fit_tree from .._distributions import ( Distribution, DistributionOutput, get_distribution, ) -from .._core._tree import fit_tree -from .._core._growth import TreeStructure from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -118,7 +116,7 @@ class DistributionalGBDT(PersistenceMixin): _base_scores: dict[str, float] = field(default_factory=dict, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "DistributionalGBDT": + def fit(self, X: NDArray, y: NDArray) -> DistributionalGBDT: """Fit the distributional gradient boosting model. Args: @@ -159,7 +157,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DistributionalGBDT": params[param_name] = self.distribution_.link(param_name, raw_preds[param_name]) # Training loop - for round_idx in range(self.n_trees): + for _round_idx in range(self.n_trees): # Update params from raw predictions (apply link functions) for param_name in self.distribution_.param_names: params[param_name] = self.distribution_.link( diff --git a/src/openboost/_models/_gam.py b/src/openboost/_models/_gam.py index 8cf56c5..a55c90a 100644 --- a/src/openboost/_models/_gam.py +++ b/src/openboost/_models/_gam.py @@ -19,7 +19,7 @@ from .._array import BinnedArray, array from .._backends import is_cuda -from .._loss import get_loss_function, LossFunction +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -67,7 +67,7 @@ class OpenBoostGAM(PersistenceMixin): X_binned_: BinnedArray | None = field(default=None, init=False, repr=False) _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "OpenBoostGAM": + def fit(self, X: NDArray, y: NDArray) -> OpenBoostGAM: """Fit the GAM model. 
Args: @@ -107,6 +107,7 @@ def fit(self, X: NDArray, y: NDArray) -> "OpenBoostGAM": def _fit_gpu(self, y: NDArray): """GPU training path - all features in parallel.""" from numba import cuda + from .._backends._cuda import build_histogram_cuda n_features = self.X_binned_.n_features @@ -287,8 +288,8 @@ def plot_shape_function(self, feature_idx: int, feature_name: str | None = None) """ try: import matplotlib.pyplot as plt - except ImportError: - raise ImportError("matplotlib required for plotting. Install with: pip install matplotlib") + except ImportError as err: + raise ImportError("matplotlib required for plotting. Install with: pip install matplotlib") from err if self.shape_values_ is None: raise RuntimeError("Model not fitted.") diff --git a/src/openboost/_models/_linear_leaf.py b/src/openboost/_models/_linear_leaf.py index 96c5b9e..5fabdfb 100644 --- a/src/openboost/_models/_linear_leaf.py +++ b/src/openboost/_models/_linear_leaf.py @@ -32,10 +32,9 @@ import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda -from .._loss import get_loss_function, LossFunction -from .._core._tree import fit_tree from .._core._growth import TreeStructure +from .._core._tree import fit_tree +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -89,7 +88,7 @@ def predict(self, X: NDArray) -> NDArray: node_id = int(leaf_node_ids[sample_idx]) # Look up leaf index using integer node index as key. 
- if node_id in self.leaf_ids: + if node_id in self.leaf_ids: # noqa: SIM108 leaf_idx = self.leaf_ids[node_id] else: # Fallback: use the constant term from first leaf @@ -119,10 +118,7 @@ def _get_leaf_node_indices(self, X: NDArray) -> NDArray: # Get binned data for tree traversal from .._array import BinnedArray as BA - if isinstance(X_binned, BA): - binned = X_binned.data - else: - binned = X_binned + binned = X_binned.data if isinstance(X_binned, BA) else X_binned if hasattr(binned, 'copy_to_host'): binned = binned.copy_to_host() @@ -206,7 +202,7 @@ class LinearLeafGBDT(PersistenceMixin): _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "LinearLeafGBDT": + def fit(self, X: NDArray, y: NDArray) -> LinearLeafGBDT: """Fit the linear leaf GBDT model. Args: @@ -255,7 +251,7 @@ def fit(self, X: NDArray, y: NDArray) -> "LinearLeafGBDT": self.base_score_ = np.float32(np.mean(y)) pred = np.full(n_samples, self.base_score_, dtype=np.float32) - for round_idx in range(self.n_trees): + for _round_idx in range(self.n_trees): # Compute gradients grad, hess = self._loss_fn(pred, y) grad = np.asarray(grad, dtype=np.float32) diff --git a/src/openboost/_models/_sklearn.py b/src/openboost/_models/_sklearn.py index 4d5c155..24318c3 100644 --- a/src/openboost/_models/_sklearn.py +++ b/src/openboost/_models/_sklearn.py @@ -39,9 +39,9 @@ import numpy as np try: - from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin - from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.preprocessing import LabelEncoder + from sklearn.utils.validation import check_array, check_is_fitted, check_X_y SKLEARN_AVAILABLE = True except ImportError: SKLEARN_AVAILABLE = False @@ -53,9 +53,9 @@ class RegressorMixin: class ClassifierMixin: pass -from 
._boosting import GradientBoosting, MultiClassGradientBoosting -from .._callbacks import EarlyStopping, Logger, Callback +from .._callbacks import EarlyStopping from .._importance import compute_feature_importances +from ._boosting import GradientBoosting, MultiClassGradientBoosting if TYPE_CHECKING: from numpy.typing import NDArray @@ -204,7 +204,7 @@ def fit( y: NDArray, sample_weight: NDArray | None = None, eval_set: list[tuple[NDArray, NDArray]] | None = None, - ) -> "OpenBoostRegressor": + ) -> OpenBoostRegressor: """Fit the gradient boosting regressor. Parameters @@ -435,7 +435,7 @@ def fit( y: NDArray, sample_weight: NDArray | None = None, eval_set: list[tuple[NDArray, NDArray]] | None = None, - ) -> "OpenBoostClassifier": + ) -> OpenBoostClassifier: """Fit the gradient boosting classifier. Parameters @@ -698,7 +698,7 @@ def fit( X: NDArray, y: NDArray, **kwargs, - ) -> "OpenBoostDistributionalRegressor": + ) -> OpenBoostDistributionalRegressor: """Fit the distributional regressor. Parameters @@ -719,7 +719,7 @@ def fit( self.n_features_in_ = X.shape[1] # Import here to avoid circular imports - from ._distributional import NGBoost, DistributionalGBDT + from ._distributional import DistributionalGBDT, NGBoost ModelClass = NGBoost if self.use_natural_gradient else DistributionalGBDT @@ -961,7 +961,7 @@ def fit( X: NDArray, y: NDArray, **kwargs, - ) -> "OpenBoostLinearLeafRegressor": + ) -> OpenBoostLinearLeafRegressor: """Fit the linear leaf regressor. 
Parameters diff --git a/src/openboost/_persistence.py b/src/openboost/_persistence.py index 948a85d..1342e91 100644 --- a/src/openboost/_persistence.py +++ b/src/openboost/_persistence.py @@ -144,7 +144,7 @@ def _dict_to_tree(data: dict[str, Any]) -> TreeStructure: Returns: TreeStructure instance """ - from ._core._growth import TreeStructure, ScalarLeaves, VectorLeaves + from ._core._growth import ScalarLeaves, TreeStructure, VectorLeaves # Handle leaf values based on type values_type = data.get("values_type", "array") @@ -211,12 +211,12 @@ def _get_persist_attrs(self) -> list[str]: attrs = list(self.__dataclass_fields__.keys()) # Also include fitted attributes (sklearn convention: trailing _) # and other instance attributes not in dataclass fields - for k in vars(self).keys(): + for k in vars(self): if k not in attrs and not k.startswith("_"): attrs.append(k) return attrs # Fallback: all non-private attributes - return [k for k in vars(self).keys() if not k.startswith("_")] + return [k for k in vars(self) if not k.startswith("_")] def _to_state_dict(self) -> dict[str, Any]: """Convert model to a serializable state dictionary. @@ -285,7 +285,7 @@ def _from_state_dict(self, state: dict[str, Any]) -> None: import warnings _CURRENT_SERIALIZATION_VERSION = 1 - saved_version = state.get("_serialization_version", None) + saved_version = state.get("_serialization_version") if saved_version is None: warnings.warn( "Loading a model saved without a serialization version number. 
" @@ -341,8 +341,9 @@ def _from_state_dict(self, state: dict[str, Any]) -> None: # Restore bin edges for transform if "_bin_edges" in state: - from ._array import BinnedArray import numpy as np + + from ._array import BinnedArray # Create a minimal BinnedArray with just bin edges for transform n_features = state.get("_n_features", len(state["_bin_edges"])) @@ -425,6 +426,7 @@ def load(cls: type[T], path: str | Path) -> T: >>> predictions = model.predict(X_test) """ import warnings + import joblib warnings.warn( diff --git a/src/openboost/_predict.py b/src/openboost/_predict.py index cec7310..7a29db6 100644 --- a/src/openboost/_predict.py +++ b/src/openboost/_predict.py @@ -15,9 +15,10 @@ from ._backends import is_cuda if TYPE_CHECKING: - from ._tree import Tree from numpy.typing import NDArray + from ._tree import Tree + def predict_ensemble( trees: list[Tree], @@ -170,10 +171,7 @@ def kernel(X_binned, node_features, node_thresholds, node_left, node_right, while node_features[node] >= 0: # Not a leaf feat = node_features[node] val = X_binned[feat, idx] # Feature-major layout - if val <= node_thresholds[node]: - node = node_left[node] - else: - node = node_right[node] + node = node_left[node] if val <= node_thresholds[node] else node_right[node] # Add leaf value to prediction pred[idx] += learning_rate * node_values[node] diff --git a/src/openboost/_profiler.py b/src/openboost/_profiler.py index b3e429f..24fbb14 100644 --- a/src/openboost/_profiler.py +++ b/src/openboost/_profiler.py @@ -30,7 +30,6 @@ from ._callbacks import Callback, TrainingState - # ============================================================================= # Phase timer # ============================================================================= @@ -233,8 +232,9 @@ def _get_timer(self, name: str) -> PhaseTimer: def _wrap_primitives(self) -> None: import sys - import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + import openboost._core._primitives 
as prims_mod # Wrap the 4 core primitives for func_name in _PRIMITIVES_TO_WRAP: @@ -261,15 +261,16 @@ def wrapper(*args, **kwargs): for mod_name in _FIT_TREE_MODULES: mod = sys.modules.get(mod_name) if mod and hasattr(mod, "fit_tree"): - original_ft = getattr(mod, "fit_tree") + original_ft = mod.fit_tree self._originals[("fit_tree", mod_name)] = original_ft wrapped_ft = make_wrapper(original_ft, fit_tree_timer) - setattr(mod, "fit_tree", wrapped_ft) + mod.fit_tree = wrapped_ft def _unwrap_primitives(self) -> None: import sys - import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + import openboost._core._primitives as prims_mod for key, original in self._originals.items(): kind, name = key @@ -280,7 +281,7 @@ def _unwrap_primitives(self) -> None: elif kind == "fit_tree": mod = sys.modules.get(name) if mod: - setattr(mod, "fit_tree", original) + mod.fit_tree = original self._originals.clear() # ----- callback hooks ----- diff --git a/src/openboost/_sampling.py b/src/openboost/_sampling.py index fb72747..11be689 100644 --- a/src/openboost/_sampling.py +++ b/src/openboost/_sampling.py @@ -11,9 +11,10 @@ from __future__ import annotations -from dataclasses import dataclass, field +from collections.abc import Callable +from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING import numpy as np @@ -167,10 +168,7 @@ def goss_sample( n_samples = len(grad) # Handle multi-dimensional gradients (e.g., distributional GBDT) - if grad.ndim > 1: - abs_grad = np.sum(np.abs(grad), axis=1) - else: - abs_grad = np.abs(grad) + abs_grad = np.sum(np.abs(grad), axis=1) if grad.ndim > 1 else np.abs(grad) # Number of samples to keep from each group n_top = int(n_samples * top_rate) @@ -523,10 +521,7 @@ def create_memmap_binned( binned = ob_array(X, n_bins=n_bins, device='cpu') # Get the binned data - if hasattr(binned.data, 'copy_to_host'): - data = 
binned.data.copy_to_host() - else: - data = binned.data + data = binned.data.copy_to_host() if hasattr(binned.data, 'copy_to_host') else binned.data # Create memory-mapped file mmap = np.memmap(path, dtype=np.uint8, mode='w+', shape=data.shape) diff --git a/src/openboost/_split.py b/src/openboost/_split.py index 25265a9..f4e456a 100644 --- a/src/openboost/_split.py +++ b/src/openboost/_split.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from ._kernels import find_best_split_kernel, HIST_BLOCK_SIZE +from ._kernels import HIST_BLOCK_SIZE, find_best_split_kernel def find_best_splits( diff --git a/src/openboost/_training.py b/src/openboost/_training.py index 7074935..30cda7f 100644 --- a/src/openboost/_training.py +++ b/src/openboost/_training.py @@ -22,8 +22,9 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Callable +from collections.abc import Callable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any import numpy as np diff --git a/src/openboost/_utils.py b/src/openboost/_utils.py index 538be7f..23f6d11 100644 --- a/src/openboost/_utils.py +++ b/src/openboost/_utils.py @@ -29,7 +29,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -44,13 +44,13 @@ def _check_sklearn(): """Check if sklearn is available.""" try: - import sklearn + import sklearn # noqa: F401 return True - except ImportError: + except ImportError as err: raise ImportError( "scikit-learn is required for evaluation metrics. 
" "Install with: pip install scikit-learn" - ) + ) from err def roc_auc_score( @@ -474,7 +474,7 @@ def crps_empirical( if samples.ndim == 1: samples = samples.reshape(-1, 1) - n_samples = samples.shape[0] + samples.shape[0] n_mc = samples.shape[1] # E|X - y| term @@ -924,7 +924,6 @@ def suggest_params( n_unique = len(unique_y) # Detect task type from y if classification - is_binary = n_unique == 2 is_multiclass = n_unique > 2 and n_unique <= 50 and task == 'classification' is_imbalanced = False if task == 'classification' and n_unique <= 50: @@ -1030,13 +1029,13 @@ def cross_val_predict( >>> meta_model.fit(oof_pred.reshape(-1, 1), y) """ try: - from sklearn.model_selection import KFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import KFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict. " "Install with: pip install scikit-learn" - ) + ) from err X = np.asarray(X) y = np.asarray(y) @@ -1105,13 +1104,13 @@ def cross_val_predict_proba( AttributeError: If model doesn't have predict_proba method. """ try: - from sklearn.model_selection import StratifiedKFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import StratifiedKFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict_proba. " "Install with: pip install scikit-learn" - ) + ) from err if not hasattr(model, 'predict_proba'): raise AttributeError( @@ -1185,13 +1184,13 @@ def cross_val_predict_interval( AttributeError: If model doesn't have predict_interval method. """ try: - from sklearn.model_selection import KFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import KFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict_interval. 
" "Install with: pip install scikit-learn" - ) + ) from err if not hasattr(model, 'predict_interval'): raise AttributeError( diff --git a/src/openboost/_validation.py b/src/openboost/_validation.py index a842551..48c9c84 100644 --- a/src/openboost/_validation.py +++ b/src/openboost/_validation.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from numpy.typing import NDArray - from ._array import BinnedArray + class ValidationError(ValueError): @@ -191,7 +191,7 @@ def validate_y( # Check for infinity if np.any(np.isinf(y)): raise ValueError( - f"y contains infinite values. Replace with finite values." + "y contains infinite values. Replace with finite values." ) # Task-specific validation @@ -213,12 +213,11 @@ def validate_y( elif task == "multiclass": unique_values = np.unique(y) - if not np.issubdtype(y.dtype, np.integer): - if not np.allclose(y, y.astype(int)): - raise ValueError( - f"Multi-class classification expects integer class labels, " - f"got non-integer values. Convert y to integers." - ) + if not np.issubdtype(y.dtype, np.integer) and not np.allclose(y, y.astype(int)): + raise ValueError( + "Multi-class classification expects integer class labels, " + "got non-integer values. Convert y to integers." 
+ ) if np.min(y) != 0: warnings.warn( f"Multi-class labels should start from 0, " From 844ff4b492a0cd1e5bf9d3037ff6ad9ac94f803e Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:32:53 -0700 Subject: [PATCH 3/6] Remove .claude directory from tracking Co-Authored-By: Claude Opus 4.6 --- .claude/worktrees/silly-kirch | 1 - 1 file changed, 1 deletion(-) delete mode 160000 .claude/worktrees/silly-kirch diff --git a/.claude/worktrees/silly-kirch b/.claude/worktrees/silly-kirch deleted file mode 160000 index 7450b78..0000000 --- a/.claude/worktrees/silly-kirch +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7450b7841f2af64569a9cc573946c42b0fcae851 From 14231e3417f46f485f5d40d00dd87009976273cc Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:33:15 -0700 Subject: [PATCH 4/6] Add .claude/ to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 96b61f7..d5f513f 100644 --- a/.gitignore +++ b/.gitignore @@ -214,6 +214,7 @@ __marimo__/ repos/ # Project-specific +.claude/ .cursor/ agent_space/ logs/ From bda4781adf6d61dc0990e0729c59eacfd4787751 Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:54:23 -0700 Subject: [PATCH 5/6] Guard GPU-native builder against missing/categorical data and convert Tree to TreeStructure Addresses PR review comments: - Skip fit_tree_gpu_native when BinnedArray has missing values or categorical features, since the GPU-native builder doesn't support them - Convert legacy Tree to TreeStructure after GPU-native building so that feature importance, persistence, and sklearn wrappers work correctly - Do fused prediction on the legacy Tree before converting (keeps perf benefit) - Restructure prediction update to avoid double-append to trees_ Co-Authored-By: Claude Opus 4.6 --- src/openboost/_models/_boosting.py | 67 ++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git 
a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index 1f263f4..cd49a9c 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -700,15 +700,29 @@ def _fit_gpu( else: # Use GPU-native tree builder when no features require the # growth-strategy path (reg_alpha, colsample, subsample, etc.) + # Also skip GPU-native when data has missing values or + # categorical features, which it doesn't support. + has_missing = ( + hasattr(self.X_binned_, 'has_missing') + and len(self.X_binned_.has_missing) > 0 + and np.any(self.X_binned_.has_missing) + ) + has_categorical = ( + hasattr(self.X_binned_, 'is_categorical') + and len(self.X_binned_.is_categorical) > 0 + and np.any(self.X_binned_.is_categorical) + ) use_gpu_native = ( is_cuda() and self.reg_alpha == 0.0 and self.colsample_bytree >= 1.0 and self.subsample >= 1.0 + and not has_missing + and not has_categorical ) if use_gpu_native: from .._core._tree import fit_tree_gpu_native - tree = fit_tree_gpu_native( + legacy_tree = fit_tree_gpu_native( self.X_binned_, grad_gpu, hess_gpu, @@ -717,6 +731,25 @@ def _fit_gpu( reg_lambda=self.reg_lambda, min_gain=self.gamma, ) + # Use fused prediction before converting to TreeStructure + from .._core._predict import predict_tree_add_gpu + predict_tree_add_gpu( + legacy_tree, self.X_binned_, pred_gpu, self.learning_rate + ) + # Convert legacy Tree to TreeStructure for compatibility + # with feature importance, persistence, and sklearn wrappers + features, thresholds, values, left, right = legacy_tree.to_arrays() + tree = TreeStructure( + features=features, + thresholds=thresholds, + left_children=left, + right_children=right, + values=values, + n_nodes=len(features), + depth=legacy_tree.depth, + n_features=legacy_tree.n_features, + ) + self.trees_.append(tree) else: tree = fit_tree( self.X_binned_, @@ -730,26 +763,18 @@ def _fit_gpu( subsample=self.subsample, colsample_bytree=self.colsample_bytree, ) - - # Update predictions on 
GPU - from .._core._tree import Tree - if isinstance(tree, Tree) and tree.on_gpu: - # Fused traversal + add: single kernel, no intermediate array - from .._core._predict import predict_tree_add_gpu - predict_tree_add_gpu(tree, self.X_binned_, pred_gpu, self.learning_rate) - else: - tree_pred = tree(self.X_binned_) - if hasattr(tree_pred, '__cuda_array_interface__'): - from .._core._predict import _add_inplace_cuda - _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) - else: - if hasattr(tree_pred, 'copy_to_host'): - tree_pred = tree_pred.copy_to_host() - pred_cpu = pred_gpu.copy_to_host() - pred_cpu += self.learning_rate * tree_pred - cuda.to_device(pred_cpu, to=pred_gpu) - - self.trees_.append(tree) + # Update predictions on GPU + tree_pred = tree(self.X_binned_) + if hasattr(tree_pred, '__cuda_array_interface__'): + from .._core._predict import _add_inplace_cuda + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + else: + if hasattr(tree_pred, 'copy_to_host'): + tree_pred = tree_pred.copy_to_host() + pred_cpu = pred_gpu.copy_to_host() + pred_cpu += self.learning_rate * tree_pred + cuda.to_device(pred_cpu, to=pred_gpu) + self.trees_.append(tree) # Only compute loss and copy to CPU when callbacks need it if cb_manager.callbacks: From 9e8be0ca0c62785a4f0b94ffe6f337e4ab302edf Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 20:00:00 -0700 Subject: [PATCH 6/6] Update CLAUDE.md with profiling, parallel tests, and GPU-native constraints - Add profiling commands and OPENBOOST_PROFILE env var - Document pytest-xdist parallel execution and conftest.py - Update ruff ignore list (E402, F821) - Document GPU-native builder limitations (no missing/categorical) - Add _profiler.py to architecture section Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index afc6f10..be2fcd7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,13 +14,19 @@ uv sync # 
Install/sync dependencies uv sync --extra cuda # With GPU support uv sync --extra dev # With dev tools (test + bench + sklearn + ruff) -# Testing -uv run pytest tests/ -v --tb=short # All tests (CPU) +# Testing (parallelized with pytest-xdist, -n auto is in addopts) +uv run pytest tests/ -v --tb=short # All tests (CPU, parallel) uv run pytest tests/test_core.py -v # Single test file uv run pytest tests/test_core.py::test_name -v # Single test +uv run pytest tests/ -n 0 # Force serial (debugging) OPENBOOST_BACKEND=cuda uv run pytest tests/ # GPU tests OPENBOOST_BACKEND=cpu uv run pytest tests/ # Force CPU +# Profiling +uv run python benchmarks/profile_loop.py # Profile training (50K samples default) +uv run python benchmarks/profile_loop.py --summarize # Machine-readable bottleneck summary +OPENBOOST_PROFILE=1 uv run python script.py # Profile any training run via env var + # Linting uv run ruff check src/openboost/ # Lint uv run ruff check src/openboost/ --fix # Autofix @@ -63,6 +69,9 @@ DART, LinearLeaf growth strategies - **`_distributional.py`** — `NaturalBoost`: distributional GBDT (natural gradient boosting) - **`_dart.py`**, **`_linear_leaf.py`**, **`_gam.py`** — Specialized model variants +### Profiling (`_profiler.py`) +`ProfilingCallback` instruments training by wrapping core primitives (`build_node_histograms`, `find_node_splits`, `partition_samples`, `compute_leaf_values`, `fit_tree`) with timers. Outputs JSON reports to `logs/` with per-phase breakdown, bottleneck identification, and run-over-run comparison. CLI runner: `benchmarks/profile_loop.py`. + ### Loss Functions (`_loss.py`) 50+ loss implementations. Each returns `(gradient, hessian)`. Custom losses are callables with signature `fn(pred, y) -> (grad, hess)`. @@ -71,10 +80,13 @@ DART, LinearLeaf growth strategies ## Key Conventions -- **Python 3.10+** target. Ruff rules: E, F, I, UP, B, SIM (line length 100, E501 ignored). +- **Python 3.10+** target. 
Ruff rules: E, F, I, UP, B, SIM (line length 100; E501, E402, F821 ignored). - **uv only** for package management — never `pip install` or `conda`. - All Numba-jitted functions use `@njit` or `@cuda.jit`. CPU kernels are in `_backends/_cpu.py`, CUDA in `_backends/_cuda.py`. - Test environment variable `OPENBOOST_BACKEND=cpu` forces CPU backend in CI. +- Tests use `pytest-xdist` (`-n auto --dist loadfile`) for parallel execution. Shared fixtures are in `tests/conftest.py` (session-scoped datasets, function-scoped gradients). +- **GPU-native builder** (`fit_tree_gpu_native`) does not support missing values or categorical features. The training loop in `_boosting.py` auto-falls back to `fit_tree()` when the data has NaN or categorical columns. +- **Profiling**: `ProfilingCallback` wraps core primitives with timers. Enable via callback or `OPENBOOST_PROFILE=1` env var. Reports go to `logs/` as JSON. ## Working Style