From 37f8edeb31a53520e3b8691da1210616250017ea Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:23:03 -0700 Subject: [PATCH 1/6] Add GPU optimizations, profiling tooling, parallel tests, and benchmark restructuring - Wire fit_tree_gpu_native into training loop for ~7x GPU speedup - Add histogram subtraction in LevelWise growth (halves histogram work) - Cache GPU arrays in TreeStructure to avoid repeated to_device() calls - Skip loss computation when no callbacks registered - Add ProfilingCallback with structured JSON reports for improvement loops - Add pytest-xdist for parallel test execution (1.4x faster test suite) - Restructure benchmarks: replace scattered scripts with focused modules - Add shared test fixtures in conftest.py and new test files - Update CI workflow with fast/full/performance test stages Co-Authored-By: Claude Opus 4.6 --- .github/workflows/unit-tests.yml | 61 +- CLAUDE.md | 52 ++ benchmarks/check_performance.py | 177 ++++++ benchmarks/check_scaling.py | 119 ++++ benchmarks/{run.py => compare_cpu.py} | 10 +- benchmarks/compare_gpu.py | 551 ++++++++++++++++ benchmarks/ebm_benchmark.py | 502 --------------- benchmarks/ngboost_benchmark.py | 315 ---------- benchmarks/performance_report.py | 436 ------------- .../{modal_bench.py => profile_kernels.py} | 2 +- benchmarks/profile_loop.py | 100 +++ ...ml_integration.py => validate_datasets.py} | 14 +- benchmarks/xgboost_benchmark.py | 594 ------------------ pyproject.toml | 13 +- src/openboost/__init__.py | 2 + src/openboost/_core/_growth.py | 153 ++++- src/openboost/_models/_boosting.py | 113 +++- src/openboost/_profiler.py | 495 +++++++++++++++ tests/conftest.py | 188 ++++++ tests/test_binning_correctness.py | 170 +++++ tests/test_callbacks.py | 193 ++++++ tests/test_core.py | 225 +++---- tests/test_gam.py | 192 ++++++ tests/test_kernel_correctness.py | 475 ++++++++++++++ tests/test_linear_leaf.py | 180 ++++++ tests/test_loss_correctness.py | 298 +++++++++ tests/test_numerical_agreement.py 
| 268 ++++++++ uv.lock | 34 +- 28 files changed, 3875 insertions(+), 2057 deletions(-) create mode 100644 benchmarks/check_performance.py create mode 100644 benchmarks/check_scaling.py rename benchmarks/{run.py => compare_cpu.py} (98%) create mode 100644 benchmarks/compare_gpu.py delete mode 100644 benchmarks/ebm_benchmark.py delete mode 100644 benchmarks/ngboost_benchmark.py delete mode 100644 benchmarks/performance_report.py rename benchmarks/{modal_bench.py => profile_kernels.py} (99%) create mode 100644 benchmarks/profile_loop.py rename benchmarks/{openml_integration.py => validate_datasets.py} (98%) delete mode 100644 benchmarks/xgboost_benchmark.py create mode 100644 src/openboost/_profiler.py create mode 100644 tests/conftest.py create mode 100644 tests/test_binning_correctness.py create mode 100644 tests/test_callbacks.py create mode 100644 tests/test_gam.py create mode 100644 tests/test_kernel_correctness.py create mode 100644 tests/test_linear_leaf.py create mode 100644 tests/test_loss_correctness.py create mode 100644 tests/test_numerical_agreement.py diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index b7bb19c..62965f8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,4 +1,4 @@ -name: Unit Tests +name: Tests on: push: @@ -7,12 +7,13 @@ on: branches: [main] jobs: - test: + # Fast tests: <3 min, runs on every PR + fast-tests: runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.12"] steps: - uses: actions/checkout@v4 @@ -30,9 +31,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - # Install package with test and sklearn dependencies pip install -e ".[test,sklearn]" - # Also install xgboost for integration tests pip install "xgboost>=2.0" - name: Lint with ruff @@ -40,9 +39,59 @@ jobs: pip install "ruff>=0.4" ruff check src/openboost/ - - name: Run tests 
(CPU backend) + - name: Run fast tests (CPU backend) + env: + OPENBOOST_BACKEND: "cpu" + run: | + pytest tests/ -v --tb=short -m "not slow and not benchmark" + + # Full tests: includes slow tests, runs after fast tests pass + full-tests: + runs-on: ubuntu-latest + needs: fast-tests + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[test,sklearn]" + pip install "xgboost>=2.0" + + - name: Run all tests (CPU backend) env: OPENBOOST_BACKEND: "cpu" run: | pytest tests/ -v --tb=short + # Performance regression check: main branch only + performance-check: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + needs: full-tests + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[test,sklearn]" + pip install "xgboost>=2.0" + + - name: Performance regression check + env: + OPENBOOST_BACKEND: "cpu" + run: | + python benchmarks/check_performance.py diff --git a/CLAUDE.md b/CLAUDE.md index fba7f6e..afc6f10 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,3 +75,55 @@ DART, LinearLeaf growth strategies - **uv only** for package management — never `pip install` or `conda`. - All Numba-jitted functions use `@njit` or `@cuda.jit`. CPU kernels are in `_backends/_cpu.py`, CUDA in `_backends/_cuda.py`. - Test environment variable `OPENBOOST_BACKEND=cpu` forces CPU backend in CI. + +## Working Style + +### 1. Plan Mode Default +- Enter plan mode for ANY non-trivial task (3+ steps or architectural decisions) +- If something goes sideways, STOP and re-plan immediately -- don't keep pushing +- Use plan mode for verification steps, not just building +- Write detailed specs upfront to reduce ambiguity + +### 2. 
Subagent Strategy +- Use subagents liberally to keep main context window clean +- Offload research, exploration, and parallel analysis to subagents +- For complex problems, throw more compute at it via subagents +- One task per subagent for focused execution + +### 3. Self-Improvement Loop +- After ANY correction from the user: update `tasks/lessons.md` with the pattern +- Write rules for yourself that prevent the same mistake +- Ruthlessly iterate on these lessons until mistake rate drops +- Review lessons at session start for relevant project + +### 4. Verification Before Done +- Never mark a task complete without proving it works +- Diff behavior between main and your changes when relevant +- Ask yourself: "Would a staff engineer approve this?" +- Run tests, check logs, demonstrate correctness + +### 5. Demand Elegance (Balanced) +- For non-trivial changes: pause and ask "is there a more elegant way?" +- If a fix feels hacky: "Knowing everything I know now, implement the elegant solution" +- Skip this for simple, obvious fixes -- don't over-engineer +- Challenge your own work before presenting it + +### 6. Autonomous Bug Fixing +- When given a bug report: just fix it. Don't ask for hand-holding +- Point at logs, errors, failing tests -- then resolve them +- Zero context switching required from the user +- Go fix failing CI tests without being told how + +## Task Management + +1. **Plan First**: Write plan to `tasks/todo.md` with checkable items +2. **Verify Plan**: Check in before starting implementation +3. **Track Progress**: Mark items complete as you go +4. **Explain Changes**: High-level summary at each step +5. **Document Results**: Add review section to `tasks/todo.md` +6. **Capture Lessons**: Update `tasks/lessons.md` after corrections + +## Core Principles + +- **Simplicity First**: Make every change as simple as possible. Impact minimal code. +- **No Laziness**: Find root causes. No temporary fixes. Senior developer standards. 
diff --git a/benchmarks/check_performance.py b/benchmarks/check_performance.py new file mode 100644 index 0000000..bc8aa5c --- /dev/null +++ b/benchmarks/check_performance.py @@ -0,0 +1,177 @@ +"""Performance regression check for CI. + +Runs a fixed, small benchmark and compares against stored baselines. +Fails if any metric degrades by more than 20%. + +Usage: + uv run python benchmarks/check_performance.py + uv run python benchmarks/check_performance.py --update-baselines +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +import tracemalloc +from pathlib import Path + +import numpy as np + +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT / "src")) + +BASELINE_FILE = Path(__file__).parent / "results" / "performance_baselines.json" + +# Regression threshold: fail if metric exceeds baseline by this factor +REGRESSION_THRESHOLD = 1.20 # 20% + + +def _generate_data(n_samples=5000, n_features=10, seed=42): + """Generate fixed synthetic dataset.""" + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + + rng.randn(n_samples).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +def run_fixed_benchmark(): + """Run fixed benchmark and return results dict.""" + import openboost as ob + + X, y = _generate_data() + n_trees = 100 + max_depth = 6 + + # Measure fit time (median of 3 trials) + fit_times = [] + for _ in range(3): + model = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=0.1 + ) + t0 = time.perf_counter() + model.fit(X, y) + fit_times.append(time.perf_counter() - t0) + + # Measure predict time + predict_times = [] + for _ in range(3): + t0 = time.perf_counter() + model.predict(X) + predict_times.append(time.perf_counter() - t0) + + # Measure peak memory + tracemalloc.start() + model2 = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, 
learning_rate=0.1 + ) + model2.fit(X, y) + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + peak_mb = peak / 1024 / 1024 + + # Measure accuracy + pred = model.predict(X) + mse = float(np.mean((pred - y) ** 2)) + r2 = float(1 - np.sum((pred - y) ** 2) / np.sum((y - np.mean(y)) ** 2)) + + return { + "fit_time_median": float(sorted(fit_times)[1]), + "predict_time_median": float(sorted(predict_times)[1]), + "peak_memory_mb": float(peak_mb), + "mse": mse, + "r2": r2, + "n_samples": len(X), + "n_features": X.shape[1], + "n_trees": n_trees, + "max_depth": max_depth, + } + + +def save_baselines(results): + """Save results as new baselines.""" + BASELINE_FILE.parent.mkdir(parents=True, exist_ok=True) + with open(BASELINE_FILE, "w") as f: + json.dump(results, f, indent=2) + print(f"Baselines saved to {BASELINE_FILE}") + + +def load_baselines(): + """Load stored baselines.""" + with open(BASELINE_FILE) as f: + return json.load(f) + + +def check_regression(results, baselines): + """Compare results against baselines. 
Returns list of regressions.""" + regressions = [] + + # Time metrics: fail if current > baseline * threshold + for metric in ["fit_time_median", "predict_time_median"]: + if results[metric] > baselines[metric] * REGRESSION_THRESHOLD: + regressions.append( + f" {metric}: {results[metric]:.4f}s > " + f"{baselines[metric]:.4f}s * {REGRESSION_THRESHOLD} = " + f"{baselines[metric] * REGRESSION_THRESHOLD:.4f}s" + ) + + # Memory: fail if current > baseline * threshold + if results["peak_memory_mb"] > baselines["peak_memory_mb"] * REGRESSION_THRESHOLD: + regressions.append( + f" peak_memory_mb: {results['peak_memory_mb']:.2f}MB > " + f"{baselines['peak_memory_mb']:.2f}MB * {REGRESSION_THRESHOLD}" + ) + + # Accuracy: fail if MSE increases (model got worse) + if results["mse"] > baselines["mse"] * REGRESSION_THRESHOLD: + regressions.append( + f" mse: {results['mse']:.6f} > " + f"{baselines['mse']:.6f} * {REGRESSION_THRESHOLD}" + ) + + return regressions + + +def main(): + parser = argparse.ArgumentParser(description="Performance regression check") + parser.add_argument( + "--update-baselines", action="store_true", + help="Update baselines with current results" + ) + args = parser.parse_args() + + print("Running fixed benchmark...") + results = run_fixed_benchmark() + + print(f" fit_time: {results['fit_time_median']:.4f}s") + print(f" predict_time: {results['predict_time_median']:.4f}s") + print(f" peak_memory: {results['peak_memory_mb']:.2f}MB") + print(f" mse: {results['mse']:.6f}") + print(f" r2: {results['r2']:.4f}") + + if args.update_baselines: + save_baselines(results) + return + + if not BASELINE_FILE.exists(): + print(f"\nNo baselines found at {BASELINE_FILE}") + print("Run with --update-baselines to create them.") + save_baselines(results) + return + + baselines = load_baselines() + regressions = check_regression(results, baselines) + + if regressions: + print(f"\nPerformance regression detected ({REGRESSION_THRESHOLD:.0%} threshold):") + for r in regressions: + 
print(r) + sys.exit(1) + else: + print("\nNo performance regressions detected.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/check_scaling.py b/benchmarks/check_scaling.py new file mode 100644 index 0000000..47633c6 --- /dev/null +++ b/benchmarks/check_scaling.py @@ -0,0 +1,119 @@ +"""Scaling analysis for OpenBoost. + +Measures how training time scales with n_samples and n_features. +Computes scaling exponents to verify sub-quadratic behavior. + +Usage: + uv run python benchmarks/check_scaling.py + uv run python benchmarks/check_scaling.py --quick +""" + +from __future__ import annotations + +import argparse +import sys +import time +from pathlib import Path + +import numpy as np + +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT / "src")) + +import openboost as ob # noqa: E402 + + +def run_scaling_analysis(quick=False): + """Run scaling analysis across n_samples and n_features.""" + + if quick: + sample_grid = [1_000, 5_000, 10_000] + feature_grid = [10, 50] + n_trees = 20 + else: + sample_grid = [1_000, 5_000, 10_000, 50_000, 100_000] + feature_grid = [10, 50, 100] + n_trees = 50 + + max_depth = 6 + learning_rate = 0.1 + + results = [] + + print(f"{'n_samples':>10} {'n_features':>10} {'fit_time':>10} {'pred_time':>10}") + print("-" * 45) + + for n_features in feature_grid: + for n_samples in sample_grid: + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] + rng.randn(n_samples).astype(np.float32) * 0.1).astype(np.float32) + + # Fit timing (single run for large data, 3 runs for small) + trials = 3 if n_samples <= 10_000 else 1 + fit_times = [] + for _ in range(trials): + m = ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=learning_rate + ) + t0 = time.perf_counter() + m.fit(X, y) + fit_times.append(time.perf_counter() - t0) + + # Predict timing + pred_times = [] + for _ in range(3): + t0 = time.perf_counter() + 
m.predict(X) + pred_times.append(time.perf_counter() - t0) + + fit_time = sorted(fit_times)[len(fit_times) // 2] + pred_time = sorted(pred_times)[1] + + results.append({ + "n_samples": n_samples, + "n_features": n_features, + "fit_time": fit_time, + "pred_time": pred_time, + }) + + print(f"{n_samples:>10} {n_features:>10} {fit_time:>10.4f}s {pred_time:>10.4f}s") + + # Compute scaling exponents + print("\n" + "=" * 50) + print("Scaling Exponents (log(time) = alpha * log(n_samples) + beta)") + print(" alpha ≈ 1.0: linear scaling (optimal)") + print(" alpha ≈ 1.5: O(n^1.5) (acceptable)") + print(" alpha ≈ 2.0: quadratic (bad)") + print("=" * 50) + + for n_features in feature_grid: + subset = [r for r in results if r["n_features"] == n_features] + if len(subset) < 3: + continue + + log_n = np.log([r["n_samples"] for r in subset]) + log_t = np.log([r["fit_time"] for r in subset]) + + # Linear regression: log_t = alpha * log_n + beta + alpha, beta = np.polyfit(log_n, log_t, 1) + + print(f"\n n_features={n_features}: alpha = {alpha:.2f}") + if alpha < 1.3: + print(" -> Near-linear scaling") + elif alpha < 1.7: + print(" -> Slightly super-linear") + else: + print(" -> WARNING: scaling appears quadratic or worse") + + +def main(): + parser = argparse.ArgumentParser(description="OpenBoost Scaling Analysis") + parser.add_argument("--quick", action="store_true", help="Quick mode with fewer data points") + args = parser.parse_args() + + run_scaling_analysis(quick=args.quick) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run.py b/benchmarks/compare_cpu.py similarity index 98% rename from benchmarks/run.py rename to benchmarks/compare_cpu.py index d7aa1ab..5da011f 100644 --- a/benchmarks/run.py +++ b/benchmarks/compare_cpu.py @@ -4,11 +4,11 @@ dependencies beyond the project's [bench] extras. 
Usage: - uv run python benchmarks/run.py - uv run python benchmarks/run.py --quick - uv run python benchmarks/run.py --task regression - uv run python benchmarks/run.py --trials 5 - uv run python benchmarks/run.py --n-samples 100000 + uv run python benchmarks/compare_cpu.py + uv run python benchmarks/compare_cpu.py --quick + uv run python benchmarks/compare_cpu.py --task regression + uv run python benchmarks/compare_cpu.py --trials 5 + uv run python benchmarks/compare_cpu.py --n-samples 100000 Options: --task Run a specific task: regression, binary, multiclass, all (default: all) diff --git a/benchmarks/compare_gpu.py b/benchmarks/compare_gpu.py new file mode 100644 index 0000000..2433df1 --- /dev/null +++ b/benchmarks/compare_gpu.py @@ -0,0 +1,551 @@ +"""GPU benchmarks: OpenBoost vs competitors on Modal A100. + +Three comparisons in one file: + 1. GradientBoosting vs XGBoost — regression, binary, multiclass, poisson + 2. NaturalBoost vs NGBoost — distributional GBDT (uncertainty) + 3. OpenBoostGAM vs InterpretML — interpretable models (GAM) + +Usage: + # Run everything on Modal A100 + uv run modal run benchmarks/compare_gpu.py + + # Run a single comparison + uv run modal run benchmarks/compare_gpu.py --bench xgboost + uv run modal run benchmarks/compare_gpu.py --bench ngboost + uv run modal run benchmarks/compare_gpu.py --bench ebm + + # Run locally (CPU, smaller data) + uv run python benchmarks/compare_gpu.py --local + uv run python benchmarks/compare_gpu.py --local --bench ngboost +""" + +from __future__ import annotations + +import json +import time +from pathlib import Path + +PROJECT_ROOT = Path(__file__).parent.parent + +try: + import modal + + app = modal.App("openboost-gpu-bench") + + image = ( + modal.Image.from_registry( + "nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12" + ) + .pip_install( + "numpy>=1.24", + "numba>=0.60", + "scikit-learn>=1.0", + "xgboost>=2.0", + "ngboost>=0.5", + "interpret>=0.6", + ) + .add_local_dir( + str(PROJECT_ROOT / 
"src" / "openboost"), + remote_path="/root/openboost", + ) + ) +except ImportError: + modal = None + app = None + image = None + + +# ============================================================================= +# Data generators +# ============================================================================= + + +def _generate_regression(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + y = ( + np.sin(X[:, 0] * 2) + + 0.5 * X[:, 1] ** 2 + + 0.3 * X[:, 2] * X[:, 3] + + 0.1 * rng.randn(n_samples) + ).astype(np.float32) + return X, y + + +def _generate_binary(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (rng.rand(n_samples) < 1 / (1 + np.exp(-logits))).astype(np.float32) + return X, y + + +def _generate_multiclass(n_samples, n_features=20, n_classes=5, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + scores = np.zeros((n_samples, n_classes)) + for k in range(n_classes): + scores[:, k] = X[:, k % n_features] + 0.5 * X[:, (k + 1) % n_features] + y = np.argmax(scores + 0.5 * rng.randn(n_samples, n_classes), axis=1) + return X, y.astype(np.int32) + + +def _generate_poisson(n_samples, n_features=20, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + X = rng.randn(n_samples, n_features).astype(np.float32) + log_mu = 1.0 + 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] + mu = np.exp(np.clip(log_mu, -5, 5)) + y = rng.poisson(mu).astype(np.float32) + return X, y + + +def _train_test_split(X, y, test_size=0.2, seed=42): + import numpy as np + + rng = np.random.RandomState(seed) + n = len(y) + idx = rng.permutation(n) + split = int(n * (1 - test_size)) + return X[idx[:split]], X[idx[split:]], y[idx[:split]], 
y[idx[split:]] + + +# ============================================================================= +# Timing helper +# ============================================================================= + + +def _time_fit(create_model_fn, X, y, n_trials=3, sync_gpu=False): + """Time model.fit() over n_trials, return (median_time, last_fitted_model).""" + times = [] + model = None + for _ in range(n_trials): + model = create_model_fn() + t0 = time.perf_counter() + model.fit(X, y) + if sync_gpu: + from numba import cuda + cuda.synchronize() + times.append(time.perf_counter() - t0) + times.sort() + return times[len(times) // 2], model + + +# ============================================================================= +# Benchmark 1: GradientBoosting vs XGBoost +# ============================================================================= + + +def bench_xgboost(n_samples=50_000, n_trees=100, max_depth=6, use_gpu=False): + """Compare OpenBoost vs XGBoost across tasks.""" + import numpy as np + from sklearn.metrics import accuracy_score, r2_score + + import openboost as ob + + sync = use_gpu + + # Warmup JIT — two iterations to ensure all CUDA kernels are compiled + X_w, y_w = _generate_regression(500) + for _ in range(2): + ob.GradientBoosting(n_trees=3, max_depth=max_depth).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + xgb_device = "cuda" if use_gpu else "cpu" + + results = {} + + tasks = [ + ("regression", _generate_regression, "mse", "R²"), + ("binary", _generate_binary, "logloss", "AUC"), + ("multiclass", _generate_multiclass, "softmax", "Accuracy"), + ("poisson", _generate_poisson, "poisson", "Deviance"), + ] + + for task_name, gen_fn, ob_loss, metric_label in tasks: + X, y = gen_fn(n_samples) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # --- OpenBoost --- + if task_name == "multiclass": + n_classes = len(np.unique(y)) + ob_time, ob_model = _time_fit( + lambda nc=n_classes: ob.MultiClassGradientBoosting( + 
n_classes=nc, n_trees=n_trees, max_depth=max_depth, + learning_rate=0.1, + ), + X_train, y_train, sync_gpu=sync, + ) + ob_pred = np.argmax(ob_model.predict_proba(X_test), axis=1) + else: + ob_time, ob_model = _time_fit( + lambda loss=ob_loss: ob.GradientBoosting( + n_trees=n_trees, max_depth=max_depth, learning_rate=0.1, + loss=loss, + ), + X_train, y_train, sync_gpu=sync, + ) + ob_pred = ob_model.predict(X_test) + + # --- XGBoost --- + import xgboost as xgb + + if task_name == "regression": + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBRegressor( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + elif task_name == "poisson": + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBRegressor( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, objective="count:poisson", + verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + else: + xgb_time, xgb_model = _time_fit( + lambda: xgb.XGBClassifier( + n_estimators=n_trees, max_depth=max_depth, learning_rate=0.1, + tree_method="hist", device=xgb_device, verbosity=0, + ), + X_train, y_train, + ) + xgb_pred = xgb_model.predict(X_test) + + # --- Metrics --- + if task_name == "regression": + ob_metric = r2_score(y_test, ob_pred) + xgb_metric = r2_score(y_test, xgb_pred) + elif task_name == "poisson": + ob_exp = np.exp(ob_pred) + ob_metric = float(np.mean(ob_exp - y_test * np.log(np.maximum(ob_exp, 1e-8)))) + xgb_metric = float(np.mean(xgb_pred - y_test * np.log(np.maximum(xgb_pred, 1e-8)))) + elif task_name == "binary": + ob_labels = (ob_pred > 0).astype(float) if np.any(ob_pred < 0) else ob_pred + ob_metric = accuracy_score(y_test, ob_labels) + xgb_metric = accuracy_score(y_test, xgb_pred) + else: + ob_metric = accuracy_score(y_test, ob_pred) + xgb_metric = accuracy_score(y_test, xgb_pred) + + speedup = 
xgb_time / ob_time + results[task_name] = { + "ob_time": ob_time, "xgb_time": xgb_time, "speedup": speedup, + "ob_metric": float(ob_metric), "xgb_metric": float(xgb_metric), + "metric_label": metric_label, + } + + # Print results + print(f"\n{'='*70}") + print(f" OpenBoost vs XGBoost | {n_samples:,} samples, {n_trees} trees, depth {max_depth}") + print(f" Device: {'GPU' if use_gpu else 'CPU'}") + print(f"{'='*70}") + print(f" {'Task':<14} {'OB (s)':<10} {'XGB (s)':<10} {'Speedup':<10} {'OB metric':<12} {'XGB metric':<12}") + print(f" {'─'*66}") + for task_name, r in results.items(): + faster = "OB" if r["speedup"] > 1 else "XGB" + print( + f" {task_name:<14} {r['ob_time']:<10.3f} {r['xgb_time']:<10.3f} " + f"{r['speedup']:.2f}x {faster:<4} {r['ob_metric']:<12.4f} {r['xgb_metric']:<12.4f}" + ) + + return results + + +# ============================================================================= +# Benchmark 2: NaturalBoost vs NGBoost +# ============================================================================= + + +def bench_ngboost(n_samples=10_000, n_trees=100, use_gpu=False): + """Compare NaturalBoost vs NGBoost (distributional GBDT).""" + import numpy as np + from sklearn.datasets import fetch_california_housing + + import openboost as ob + + sync = use_gpu + + # Warmup + X_w, y_w = _generate_regression(500) + ob.NaturalBoostNormal(n_trees=3, max_depth=3, learning_rate=0.1).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + results = {} + + # --- Synthetic data --- + for n in [n_samples]: + X, y = _generate_regression(n) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # NGBoost + from ngboost import NGBRegressor + from ngboost.distns import Normal + + ngb = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) + t0 = time.perf_counter() + ngb.fit(X_train, y_train) + ngb_time = time.perf_counter() - t0 + + ngb_dist = ngb.pred_dist(X_test) + ngb_nll = float(-ngb_dist.logpdf(y_test).mean()) + 
ngb_lower = ngb_dist.ppf(0.05) + ngb_upper = ngb_dist.ppf(0.95) + ngb_coverage = float(np.mean((y_test >= ngb_lower) & (y_test <= ngb_upper))) + + # NaturalBoost + nb = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) + t0 = time.perf_counter() + nb.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + nb_time = time.perf_counter() - t0 + + nb_nll = float(nb.nll(X_test, y_test)) + nb_lower, nb_upper = nb.predict_interval(X_test, alpha=0.1) + nb_coverage = float(np.mean((y_test >= nb_lower) & (y_test <= nb_upper))) + + results[f"synthetic_{n}"] = { + "ngb_time": ngb_time, "nb_time": nb_time, + "speedup": ngb_time / nb_time, + "ngb_nll": ngb_nll, "nb_nll": nb_nll, + "ngb_coverage": ngb_coverage, "nb_coverage": nb_coverage, + } + + # --- California Housing --- + data = fetch_california_housing() + X = data.data.astype(np.float32) + y = data.target.astype(np.float32) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + ngb = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) + t0 = time.perf_counter() + ngb.fit(X_train, y_train) + ngb_time = time.perf_counter() - t0 + ngb_nll = float(-ngb.pred_dist(X_test).logpdf(y_test).mean()) + + nb = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) + t0 = time.perf_counter() + nb.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + nb_time = time.perf_counter() - t0 + nb_nll = float(nb.nll(X_test, y_test)) + + results["california_housing"] = { + "ngb_time": ngb_time, "nb_time": nb_time, + "speedup": ngb_time / nb_time, + "ngb_nll": ngb_nll, "nb_nll": nb_nll, + } + + # Print + print(f"\n{'='*70}") + print(f" NaturalBoost vs NGBoost | {n_trees} trees, Normal distribution") + print(f"{'='*70}") + print(f" {'Dataset':<22} {'NGBoost (s)':<14} {'NatBoost (s)':<14} {'Speedup':<10} {'NGB NLL':<10} {'NB NLL':<10}") + print(f" {'─'*78}") + for name, r in results.items(): + faster = "NB" if r["speedup"] 
> 1 else "NGB" + print( + f" {name:<22} {r['ngb_time']:<14.2f} {r['nb_time']:<14.2f} " + f"{r['speedup']:.2f}x {faster:<4} {r['ngb_nll']:<10.4f} {r['nb_nll']:<10.4f}" + ) + + return results + + +# ============================================================================= +# Benchmark 3: OpenBoostGAM vs InterpretML EBM +# ============================================================================= + + +def bench_ebm(n_samples=50_000, n_rounds=200, use_gpu=False): + """Compare OpenBoostGAM vs InterpretML EBM.""" + from sklearn.metrics import r2_score + + from openboost import OpenBoostGAM + + sync = use_gpu + + # Warmup + X_w, y_w = _generate_regression(500) + OpenBoostGAM(n_rounds=10).fit(X_w, y_w) + if sync: + from numba import cuda + cuda.synchronize() + + results = {} + + for n in [n_samples]: + X, y = _generate_regression(n) + X_train, X_test, y_train, y_test = _train_test_split(X, y) + + # OpenBoostGAM + gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) + t0 = time.perf_counter() + gam.fit(X_train, y_train) + if sync: + from numba import cuda + cuda.synchronize() + gam_time = time.perf_counter() - t0 + gam_r2 = float(r2_score(y_test, gam.predict(X_test))) + + # InterpretML EBM + from interpret.glassbox import ExplainableBoostingRegressor + + ebm = ExplainableBoostingRegressor( + max_rounds=n_rounds, learning_rate=0.05, + outer_bags=1, inner_bags=0, interactions=0, n_jobs=-1, + ) + t0 = time.perf_counter() + ebm.fit(X_train, y_train) + ebm_time = time.perf_counter() - t0 + ebm_r2 = float(r2_score(y_test, ebm.predict(X_test))) + + results[f"synthetic_{n}"] = { + "gam_time": gam_time, "ebm_time": ebm_time, + "speedup": ebm_time / gam_time, + "gam_r2": gam_r2, "ebm_r2": ebm_r2, + } + + # Print + print(f"\n{'='*70}") + print(f" OpenBoostGAM vs InterpretML EBM | {n_rounds} rounds") + print(f"{'='*70}") + print(f" {'Dataset':<22} {'GAM (s)':<12} {'EBM (s)':<12} {'Speedup':<10} {'GAM R²':<10} {'EBM R²':<10}") + print(f" {'─'*74}") + for name, r in 
results.items(): + print( + f" {name:<22} {r['gam_time']:<12.2f} {r['ebm_time']:<12.2f} " + f"{r['speedup']:.1f}x {r['gam_r2']:<10.4f} {r['ebm_r2']:<10.4f}" + ) + + return results + + +# ============================================================================= +# Run all benchmarks +# ============================================================================= + + +def run_all(use_gpu=False, bench=None, n_samples=None): + """Run selected or all benchmarks.""" + import sys + if use_gpu: + sys.path.insert(0, "/root") + + import openboost as ob + + if use_gpu: + ob.set_backend("cuda") + + print(f"OpenBoost backend: {ob.get_backend()}") + if use_gpu: + from numba import cuda + gpu_name = cuda.get_current_device().name + if isinstance(gpu_name, bytes): + gpu_name = gpu_name.decode() + print(f"GPU: {gpu_name}") + + all_results = {} + benches = [bench] if bench else ["xgboost", "ngboost", "ebm"] + + if "xgboost" in benches: + n = n_samples or (50_000 if use_gpu else 20_000) + all_results["xgboost"] = bench_xgboost(n_samples=n, use_gpu=use_gpu) + + if "ngboost" in benches: + try: + import ngboost # noqa: F401 + n = n_samples or (10_000 if not use_gpu else 50_000) + all_results["ngboost"] = bench_ngboost(n_samples=n, use_gpu=use_gpu) + except ImportError: + print("\n ngboost not installed, skipping. Install: pip install ngboost") + + if "ebm" in benches: + try: + import interpret # noqa: F401 + n = n_samples or (50_000 if use_gpu else 10_000) + all_results["ebm"] = bench_ebm(n_samples=n, use_gpu=use_gpu) + except ImportError: + print("\n interpret not installed, skipping. 
Install: pip install interpret") + + return all_results + + +# ============================================================================= +# Modal entry points +# ============================================================================= + +if modal is not None and app is not None: + + @app.function(gpu="A100", image=image, timeout=3600) + def _run_on_gpu(bench=None, n_samples=None): + return run_all(use_gpu=True, bench=bench, n_samples=n_samples) + + @app.local_entrypoint() + def main(bench: str = None, n_samples: int = None): + """Run benchmarks on Modal A100.""" + if bench: + print(f"Running '{bench}' benchmark on Modal A100...") + else: + print("Running all benchmarks on Modal A100...") + + results = _run_on_gpu.remote(bench=bench, n_samples=n_samples) + + # Save results + results_dir = PROJECT_ROOT / "benchmarks" / "results" + results_dir.mkdir(exist_ok=True) + timestamp = time.strftime("%Y%m%d_%H%M%S") + out_file = results_dir / f"gpu_benchmark_{timestamp}.json" + with open(out_file, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {out_file}") + + +# ============================================================================= +# Local execution +# ============================================================================= + +if __name__ == "__main__": + import argparse + import sys + + parser = argparse.ArgumentParser(description="OpenBoost GPU Benchmarks") + parser.add_argument("--local", action="store_true", help="Run locally on CPU") + parser.add_argument( + "--bench", choices=["xgboost", "ngboost", "ebm"], + help="Run a single benchmark (default: all)", + ) + parser.add_argument("--n-samples", type=int, help="Override dataset size") + args = parser.parse_args() + + if args.local: + sys.path.insert(0, str(PROJECT_ROOT / "src")) + run_all(use_gpu=False, bench=args.bench, n_samples=args.n_samples) + else: + print("Usage:") + print(" Modal: uv run modal run benchmarks/compare_gpu.py") + print(" Modal: uv run modal run 
benchmarks/compare_gpu.py --bench xgboost") + print(" Local: uv run python benchmarks/compare_gpu.py --local") + print(" Local: uv run python benchmarks/compare_gpu.py --local --bench ngboost") diff --git a/benchmarks/ebm_benchmark.py b/benchmarks/ebm_benchmark.py deleted file mode 100644 index 29c31d0..0000000 --- a/benchmarks/ebm_benchmark.py +++ /dev/null @@ -1,502 +0,0 @@ -"""Benchmark: OpenBoost GPU-GAM vs InterpretML EBM. - -Run locally: - cd openboost - uv run python benchmarks/ebm_benchmark.py --local - -Run on Modal (cloud A100): - cd openboost - uv run modal run benchmarks/ebm_benchmark.py -""" - -from __future__ import annotations - -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-ebm-bench") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "cupy-cuda12x>=13.0", - "scikit-learn>=1.0", - "interpret>=0.6", - "xgboost>=2.0", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -def generate_data(n_samples: int, n_features: int, task: str = "regression"): - """Generate synthetic data for benchmarking.""" - import numpy as np - - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - - if task == "regression": - # True model: additive (perfect for GAM) - y = ( - np.sin(X[:, 0] * 2) + # Non-linear effect - 0.5 * X[:, 1] + # Linear effect - np.where(X[:, 2] > 0, 0.3, -0.3) + # Step function - 0.1 * np.random.randn(n_samples) # Noise - ).astype(np.float32) - else: - # Classification - logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] - probs = 1 / (1 + np.exp(-logits)) - y = (np.random.rand(n_samples) < probs).astype(np.float32) - - return X, y - - -def benchmark_gam_vs_ebm( - n_samples: int = 100_000, - n_features: int = 20, - 
n_rounds: int = 500, -): - """Compare OpenBoost GPU-GAM vs InterpretML EBM. - - Args: - n_samples: Number of training samples - n_features: Number of features - n_rounds: Number of boosting rounds (for fair comparison) - - Returns: - Benchmark results dict - """ - import sys - sys.path.insert(0, "/root") - - import numpy as np - import time - from sklearn.metrics import mean_squared_error, r2_score - from sklearn.model_selection import train_test_split - - print("=" * 60) - print("OpenBoost GPU-GAM vs InterpretML EBM Benchmark") - print("=" * 60) - - # Check GPU - try: - from numba import cuda - print(f"GPU: {cuda.get_current_device().name}") - except Exception as e: - print(f"GPU not available: {e}") - - # Generate data - print(f"\nGenerating data: {n_samples:,} samples × {n_features} features") - X, y = generate_data(n_samples, n_features, task="regression") - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - results = { - "n_samples": n_samples, - "n_features": n_features, - "n_rounds": n_rounds, - } - - # ========================================================================= - # 1. OpenBoost GPU-GAM - # ========================================================================= - print("\n" + "-" * 40) - print("1. 
OpenBoost GPU-GAM") - print("-" * 40) - - try: - import openboost as ob - from openboost import OpenBoostGAM - - print(f"Backend: {ob.get_backend()}") - - gam = OpenBoostGAM( - n_rounds=n_rounds, - learning_rate=0.05, - reg_lambda=1.0, - loss='mse', - ) - - # Warmup (JIT compilation) - gam_warmup = OpenBoostGAM(n_rounds=10, learning_rate=0.1) - gam_warmup.fit(X_train[:1000], y_train[:1000]) - cuda.synchronize() - - # Benchmark training - start = time.perf_counter() - gam.fit(X_train, y_train) - cuda.synchronize() - ob_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_ob = gam.predict(X_test) - cuda.synchronize() - ob_pred_time = time.perf_counter() - start - - ob_mse = mean_squared_error(y_test, y_pred_ob) - ob_r2 = r2_score(y_test, y_pred_ob) - - print(f"Train time: {ob_train_time:.3f}s") - print(f"Predict time: {ob_pred_time*1000:.2f}ms") - print(f"MSE: {ob_mse:.6f}") - print(f"R²: {ob_r2:.4f}") - - results["openboost_gam"] = { - "train_time_s": ob_train_time, - "predict_time_ms": ob_pred_time * 1000, - "mse": ob_mse, - "r2": ob_r2, - } - - except Exception as e: - print(f"OpenBoost GAM failed: {e}") - import traceback - traceback.print_exc() - results["openboost_gam"] = {"error": str(e)} - - # ========================================================================= - # 2. InterpretML EBM - # ========================================================================= - print("\n" + "-" * 40) - print("2. 
InterpretML EBM") - print("-" * 40) - - try: - from interpret.glassbox import ExplainableBoostingRegressor - - # EBM with comparable settings - # Note: EBM's "outer_bags" and "inner_bags" add bagging overhead - # For fair comparison, we disable some features - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - min_samples_leaf=2, - max_bins=256, - outer_bags=1, # Disable bagging for speed comparison - inner_bags=0, - interactions=0, # No pairwise interactions (pure GAM) - n_jobs=-1, # Use all CPU cores - ) - - # Benchmark training - start = time.perf_counter() - ebm.fit(X_train, y_train) - ebm_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_ebm = ebm.predict(X_test) - ebm_pred_time = time.perf_counter() - start - - ebm_mse = mean_squared_error(y_test, y_pred_ebm) - ebm_r2 = r2_score(y_test, y_pred_ebm) - - print(f"Train time: {ebm_train_time:.3f}s") - print(f"Predict time: {ebm_pred_time*1000:.2f}ms") - print(f"MSE: {ebm_mse:.6f}") - print(f"R²: {ebm_r2:.4f}") - - results["interpretml_ebm"] = { - "train_time_s": ebm_train_time, - "predict_time_ms": ebm_pred_time * 1000, - "mse": ebm_mse, - "r2": ebm_r2, - } - - except Exception as e: - print(f"InterpretML EBM failed: {e}") - import traceback - traceback.print_exc() - results["interpretml_ebm"] = {"error": str(e)} - - # ========================================================================= - # 3. XGBoost (baseline, non-interpretable) - # ========================================================================= - print("\n" + "-" * 40) - print("3. 
XGBoost (baseline)") - print("-" * 40) - - try: - import xgboost as xgb - - xgb_model = xgb.XGBRegressor( - n_estimators=n_rounds, - learning_rate=0.05, - max_depth=6, - tree_method="hist", - device="cuda", - n_jobs=-1, - ) - - # Benchmark training - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - cuda.synchronize() - xgb_train_time = time.perf_counter() - start - - # Benchmark inference - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - cuda.synchronize() - xgb_pred_time = time.perf_counter() - start - - xgb_mse = mean_squared_error(y_test, y_pred_xgb) - xgb_r2 = r2_score(y_test, y_pred_xgb) - - print(f"Train time: {xgb_train_time:.3f}s") - print(f"Predict time: {xgb_pred_time*1000:.2f}ms") - print(f"MSE: {xgb_mse:.6f}") - print(f"R²: {xgb_r2:.4f}") - - results["xgboost"] = { - "train_time_s": xgb_train_time, - "predict_time_ms": xgb_pred_time * 1000, - "mse": xgb_mse, - "r2": xgb_r2, - } - - except Exception as e: - print(f"XGBoost failed: {e}") - results["xgboost"] = {"error": str(e)} - - # ========================================================================= - # Summary - # ========================================================================= - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - - def safe_get(d, key, default="N/A"): - if "error" in d: - return default - return d.get(key, default) - - print(f"\n{'Model':<25} {'Train (s)':<12} {'Predict (ms)':<14} {'R²':<10}") - print("-" * 60) - - for name, key in [ - ("OpenBoost GPU-GAM", "openboost_gam"), - ("InterpretML EBM", "interpretml_ebm"), - ("XGBoost (GPU)", "xgboost"), - ]: - d = results.get(key, {}) - train = safe_get(d, "train_time_s") - pred = safe_get(d, "predict_time_ms") - r2 = safe_get(d, "r2") - - train_str = f"{train:.3f}" if isinstance(train, float) else train - pred_str = f"{pred:.2f}" if isinstance(pred, float) else pred - r2_str = f"{r2:.4f}" if isinstance(r2, float) else r2 - - print(f"{name:<25} {train_str:<12} {pred_str:<14} 
{r2_str:<10}") - - # Speedup calculation - if ("openboost_gam" in results and "interpretml_ebm" in results and - "error" not in results["openboost_gam"] and "error" not in results["interpretml_ebm"]): - speedup = results["interpretml_ebm"]["train_time_s"] / results["openboost_gam"]["train_time_s"] - print(f"\nOpenBoost GPU-GAM is {speedup:.1f}x faster than InterpretML EBM") - results["speedup_vs_ebm"] = speedup - - return results - - -def benchmark_scaling(max_samples: int = 1_000_000): - """Benchmark how both scale with data size.""" - import sys - sys.path.insert(0, "/root") - - import numpy as np - import time - from numba import cuda - - print("=" * 60) - print("Scaling Benchmark: OpenBoost GPU-GAM vs InterpretML EBM") - print("=" * 60) - print(f"GPU: {cuda.get_current_device().name}") - - import openboost as ob - from openboost import OpenBoostGAM - from interpret.glassbox import ExplainableBoostingRegressor - - n_features = 20 - n_rounds = 200 - - # Warmup - X_warm, y_warm = generate_data(1000, n_features) - OpenBoostGAM(n_rounds=10).fit(X_warm, y_warm) - cuda.synchronize() - - results = [] - - for n_samples in [10_000, 50_000, 100_000, 500_000, max_samples]: - if n_samples > max_samples: - break - - print(f"\n--- {n_samples:,} samples ---") - - X, y = generate_data(n_samples, n_features) - row = {"n_samples": n_samples} - - # OpenBoost GPU-GAM - try: - gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X, y) - cuda.synchronize() - row["openboost_time"] = time.perf_counter() - start - print(f" OpenBoost GPU-GAM: {row['openboost_time']:.2f}s") - except Exception as e: - print(f" OpenBoost failed: {e}") - row["openboost_time"] = None - - # InterpretML EBM (only for smaller sizes due to time) - if n_samples <= 100_000: - try: - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X, 
y) - row["ebm_time"] = time.perf_counter() - start - print(f" InterpretML EBM: {row['ebm_time']:.2f}s") - except Exception as e: - print(f" EBM failed: {e}") - row["ebm_time"] = None - else: - print(f" InterpretML EBM: (skipped - too slow)") - row["ebm_time"] = None - - if row.get("openboost_time") and row.get("ebm_time"): - row["speedup"] = row["ebm_time"] / row["openboost_time"] - print(f" Speedup: {row['speedup']:.1f}x") - - results.append(row) - - print("\n" + "=" * 60) - print("SCALING SUMMARY") - print("=" * 60) - print(f"\n{'Samples':<12} {'OpenBoost (s)':<15} {'EBM (s)':<12} {'Speedup':<10}") - print("-" * 50) - for r in results: - ob_str = f"{r['openboost_time']:.2f}" if r.get('openboost_time') else "N/A" - ebm_str = f"{r['ebm_time']:.2f}" if r.get('ebm_time') else "N/A" - sp_str = f"{r['speedup']:.1f}x" if r.get('speedup') else "N/A" - print(f"{r['n_samples']:<12,} {ob_str:<15} {ebm_str:<12} {sp_str:<10}") - - return results - - -if modal is not None and app is not None: - _benchmark_gam_vs_ebm_modal = app.function(gpu="A100", image=image, timeout=1800)(benchmark_gam_vs_ebm) - _benchmark_scaling_modal = app.function(gpu="A100", image=image, timeout=3600)(benchmark_scaling) - - @app.local_entrypoint() - def main(): - """Run benchmarks on Modal.""" - print("Running GAM vs EBM benchmark on Modal A100...") - - results = _benchmark_gam_vs_ebm_modal.remote( - n_samples=100_000, - n_features=20, - n_rounds=500, - ) - - print("\n\nFinal Results:") - print(results) - - -# For local execution without Modal -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally...") - - import numpy as np - import time - from sklearn.metrics import r2_score - from sklearn.model_selection import train_test_split - - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - n_samples = 50_000 - n_features = 20 - n_rounds = 200 - - X, y = generate_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split( 
- X, y, test_size=0.2, random_state=42 - ) - - print(f"Data: {n_samples:,} samples x {n_features} features") - - # OpenBoost GPU-GAM - try: - import openboost as ob - from openboost import OpenBoostGAM - - print(f"\nOpenBoost backend: {ob.get_backend()}") - - # Warmup JIT - OpenBoostGAM(n_rounds=10, learning_rate=0.1).fit( - X_train[:1000], y_train[:1000] - ) - - gam = OpenBoostGAM(n_rounds=n_rounds, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X_train, y_train) - train_time = time.perf_counter() - start - - y_pred = gam.predict(X_test) - print(f"OpenBoost GPU-GAM: {train_time:.2f}s, R²={r2_score(y_test, y_pred):.4f}") - except Exception as e: - print(f"OpenBoost failed: {e}") - import traceback - traceback.print_exc() - - # InterpretML EBM - try: - from interpret.glassbox import ExplainableBoostingRegressor - - ebm = ExplainableBoostingRegressor( - max_rounds=n_rounds, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X_train, y_train) - train_time = time.perf_counter() - start - - y_pred = ebm.predict(X_test) - print(f"InterpretML EBM: {train_time:.2f}s, R²={r2_score(y_test, y_pred):.4f}") - except ImportError: - print("InterpretML not installed. Run: pip install interpret") - except Exception as e: - print(f"EBM failed: {e}") - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/ebm_benchmark.py") - print(" Local: uv run python benchmarks/ebm_benchmark.py --local") - diff --git a/benchmarks/ngboost_benchmark.py b/benchmarks/ngboost_benchmark.py deleted file mode 100644 index e8b827b..0000000 --- a/benchmarks/ngboost_benchmark.py +++ /dev/null @@ -1,315 +0,0 @@ -"""Benchmark: OpenBoost NaturalBoost vs Official NGBoost. - -Compare: -1. Training speed -2. Prediction speed -3. NLL (negative log-likelihood) - prediction quality -4. 
Calibration of prediction intervals - -Usage: - uv run python benchmarks/ngboost_benchmark.py -""" - -from __future__ import annotations - -import sys -import time -from pathlib import Path - -import numpy as np -from sklearn.datasets import fetch_california_housing, make_regression -from sklearn.model_selection import train_test_split - -PROJECT_ROOT = Path(__file__).parent.parent -sys.path.insert(0, str(PROJECT_ROOT / "src")) - - -def benchmark_synthetic(n_samples: int = 10000, n_features: int = 20, n_trees: int = 100): - """Benchmark on synthetic data.""" - print(f"\n{'='*70}") - print(f"SYNTHETIC DATA: {n_samples:,} samples, {n_features} features, {n_trees} trees") - print('='*70) - - # Generate data - X, y = make_regression(n_samples=n_samples, n_features=n_features, noise=10, random_state=42) - X = X.astype(np.float32) - y = y.astype(np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - results = {} - - # --- Official NGBoost --- - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - print("\n[Official NGBoost]") - model_official = NGBRegressor( - Dist=Normal, - n_estimators=n_trees, - learning_rate=0.1, - verbose=False, - ) - - t0 = time.perf_counter() - model_official.fit(X_train, y_train) - train_time_official = time.perf_counter() - t0 - - t0 = time.perf_counter() - pred_official = model_official.predict(X_test) - pred_time_official = time.perf_counter() - t0 - - # Get distribution params for NLL - dist_official = model_official.pred_dist(X_test) - nll_official = -dist_official.logpdf(y_test).mean() - - # Prediction intervals - lower_official = dist_official.ppf(0.05) - upper_official = dist_official.ppf(0.95) - coverage_official = np.mean((y_test >= lower_official) & (y_test <= upper_official)) - - rmse_official = np.sqrt(np.mean((pred_official - y_test)**2)) - - results['official'] = { - 'train_time': train_time_official, - 'pred_time': pred_time_official, - 'nll': 
nll_official, - 'rmse': rmse_official, - 'coverage_90': coverage_official, - } - - print(f" Train time: {train_time_official:.2f}s") - print(f" Pred time: {pred_time_official*1000:.1f}ms") - print(f" NLL: {nll_official:.4f}") - print(f" RMSE: {rmse_official:.4f}") - print(f" 90% coverage: {coverage_official:.1%}") - - except Exception as e: - print(f" Error: {e}") - results['official'] = None - - # --- OpenBoost NaturalBoost --- - try: - import openboost as ob - - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - print("\n[OpenBoost NaturalBoost]") - model_openboost = ob.NaturalBoostNormal( - n_trees=n_trees, - learning_rate=0.1, - max_depth=3, - ) - - t0 = time.perf_counter() - model_openboost.fit(X_train, y_train) - train_time_openboost = time.perf_counter() - t0 - - t0 = time.perf_counter() - pred_openboost = model_openboost.predict(X_test) - pred_time_openboost = time.perf_counter() - t0 - - # NLL - nll_openboost = model_openboost.score(X_test, y_test) - if hasattr(nll_openboost, 'mean'): - nll_openboost = nll_openboost.mean() - - # Prediction intervals - lower_openboost, upper_openboost = model_openboost.predict_interval(X_test, alpha=0.1) - coverage_openboost = np.mean((y_test >= lower_openboost) & (y_test <= upper_openboost)) - - rmse_openboost = np.sqrt(np.mean((pred_openboost - y_test)**2)) - - results['openboost'] = { - 'train_time': train_time_openboost, - 'pred_time': pred_time_openboost, - 'nll': nll_openboost, - 'rmse': rmse_openboost, - 'coverage_90': coverage_openboost, - } - - print(f" Train time: {train_time_openboost:.2f}s") - print(f" Pred time: {pred_time_openboost*1000:.1f}ms") - print(f" NLL: {nll_openboost:.4f}") - print(f" RMSE: {rmse_openboost:.4f}") - print(f" 90% coverage: {coverage_openboost:.1%}") - - except Exception as e: - print(f" Error: {e}") - import traceback - traceback.print_exc() - results['openboost'] = None - - # --- Comparison --- - if 
results.get('official') and results.get('openboost'): - print("\n[Comparison]") - speedup = results['official']['train_time'] / results['openboost']['train_time'] - print(f" Training speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(NGBoost faster)'}") - - pred_speedup = results['official']['pred_time'] / results['openboost']['pred_time'] - print(f" Prediction speedup: {pred_speedup:.2f}x") - - nll_diff = results['openboost']['nll'] - results['official']['nll'] - print(f" NLL difference: {nll_diff:+.4f} {'(OpenBoost better)' if nll_diff < 0 else '(NGBoost better)'}") - - return results - - -def benchmark_california_housing(n_trees: int = 100): - """Benchmark on California Housing dataset.""" - print(f"\n{'='*70}") - print(f"CALIFORNIA HOUSING DATASET: {n_trees} trees") - print('='*70) - - # Load data - data = fetch_california_housing() - X, y = data.data.astype(np.float32), data.target.astype(np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - print(f"Train: {len(X_train):,} samples, Test: {len(X_test):,} samples") - - results = {} - - # --- Official NGBoost --- - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - print("\n[Official NGBoost]") - model_official = NGBRegressor( - Dist=Normal, - n_estimators=n_trees, - learning_rate=0.1, - verbose=False, - ) - - t0 = time.perf_counter() - model_official.fit(X_train, y_train) - train_time = time.perf_counter() - t0 - - pred = model_official.predict(X_test) - dist = model_official.pred_dist(X_test) - nll = -dist.logpdf(y_test).mean() - rmse = np.sqrt(np.mean((pred - y_test)**2)) - - results['official'] = {'train_time': train_time, 'nll': nll, 'rmse': rmse} - print(f" Train time: {train_time:.2f}s | NLL: {nll:.4f} | RMSE: {rmse:.4f}") - - except Exception as e: - print(f" Error: {e}") - - # --- OpenBoost NaturalBoost --- - try: - import openboost as ob - - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, 
learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - print("\n[OpenBoost NaturalBoost]") - model = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) - - t0 = time.perf_counter() - model.fit(X_train, y_train) - train_time = time.perf_counter() - t0 - - pred = model.predict(X_test) - nll = model.score(X_test, y_test) - if hasattr(nll, 'mean'): - nll = nll.mean() - rmse = np.sqrt(np.mean((pred - y_test)**2)) - - results['openboost'] = {'train_time': train_time, 'nll': nll, 'rmse': rmse} - print(f" Train time: {train_time:.2f}s | NLL: {nll:.4f} | RMSE: {rmse:.4f}") - - except Exception as e: - print(f" Error: {e}") - import traceback - traceback.print_exc() - - # --- Comparison --- - if results.get('official') and results.get('openboost'): - print("\n[Comparison]") - speedup = results['official']['train_time'] / results['openboost']['train_time'] - print(f" Training speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(NGBoost faster)'}") - - return results - - -def benchmark_scaling(): - """Benchmark training time scaling with data size.""" - print(f"\n{'='*70}") - print("SCALING BENCHMARK") - print('='*70) - - sizes = [1000, 5000, 10000, 20000] - n_trees = 50 - - print(f"\n{'Size':<10} {'NGBoost':<12} {'OpenBoost':<12} {'Speedup':<10}") - print("-" * 44) - - # Warmup JIT on small data - try: - import openboost as ob - warmup_X, warmup_y = make_regression(n_samples=500, n_features=10, noise=10, random_state=0) - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - warmup_X.astype(np.float32), warmup_y.astype(np.float32) - ) - except Exception: - pass - - for n in sizes: - X, y = make_regression(n_samples=n, n_features=10, noise=10, random_state=42) - X = X.astype(np.float32) - y = y.astype(np.float32) - - # Official NGBoost - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - model = NGBRegressor(Dist=Normal, n_estimators=n_trees, learning_rate=0.1, verbose=False) 
- t0 = time.perf_counter() - model.fit(X, y) - time_official = time.perf_counter() - t0 - except Exception: - time_official = float('nan') - - # OpenBoost - try: - import openboost as ob - - model = ob.NaturalBoostNormal(n_trees=n_trees, learning_rate=0.1, max_depth=3) - t0 = time.perf_counter() - model.fit(X, y) - time_openboost = time.perf_counter() - t0 - except Exception: - time_openboost = float('nan') - - speedup = time_official / time_openboost if time_openboost > 0 else 0 - print(f"{n:<10} {time_official:<12.2f}s {time_openboost:<12.2f}s {speedup:<10.2f}x") - - -if __name__ == '__main__': - print("="*70) - print("OpenBoost NaturalBoost vs Official NGBoost Benchmark") - print("="*70) - - # Quick benchmark - benchmark_synthetic(n_samples=5000, n_features=10, n_trees=50) - - # Real dataset - benchmark_california_housing(n_trees=50) - - # Scaling - benchmark_scaling() - - print("\n" + "="*70) - print("BENCHMARK COMPLETE") - print("="*70) diff --git a/benchmarks/performance_report.py b/benchmarks/performance_report.py deleted file mode 100644 index 9c09165..0000000 --- a/benchmarks/performance_report.py +++ /dev/null @@ -1,436 +0,0 @@ -"""Generate comprehensive performance report for OpenBoost. - -GPU Performance Validation — compares: -1. NaturalBoost vs NGBoost (distributional GBDT) -2. 
OpenBoostGAM vs InterpretML EBM (interpretable models) - -Run on Modal: - uv run modal run benchmarks/performance_report.py - -Run locally (if you have GPU): - uv run python benchmarks/performance_report.py --local -""" - -from __future__ import annotations - -import json -import time -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-perf-report") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "scikit-learn>=1.0", - "ngboost>=0.5", - "interpret>=0.6", - "xgboost>=2.0", - "tabulate>=0.9", - "scipy>=1.10", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -def generate_data(n_samples: int, n_features: int, noise: float = 10.0, seed: int = 42): - """Generate synthetic regression data.""" - import numpy as np - - np.random.seed(seed) - X = np.random.randn(n_samples, n_features).astype(np.float32) - - # True model: linear + non-linear effects - y = ( - 2.0 * X[:, 0] + - np.sin(X[:, 1] * 2) + - 0.5 * X[:, 2] ** 2 + - noise * np.random.randn(n_samples) - ).astype(np.float32) - - return X, y - - -def run_performance_report(): - """Run all performance benchmarks.""" - import sys - sys.path.insert(0, "/root") - - import numpy as np - from sklearn.datasets import make_regression, fetch_california_housing - from sklearn.model_selection import train_test_split - from numba import cuda - - gpu_name = cuda.get_current_device().name - if isinstance(gpu_name, bytes): - gpu_name = gpu_name.decode() - - results = { - "gpu_device": gpu_name, - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "benchmarks": {} - } - - # ========================================================================= - # Benchmark 1: NaturalBoost vs NGBoost - # 
========================================================================= - print("=" * 70) - print("BENCHMARK 1: NaturalBoost vs NGBoost") - print("=" * 70) - - import openboost as ob - from ngboost import NGBRegressor - from ngboost.distns import Normal - - ob.set_backend("cuda") - print(f"OpenBoost backend: {ob.get_backend()}") - - ngboost_results = [] - - for n_samples in [250_000, 500_000, 1_000_000]: - print(f"\n{n_samples:,} samples:") - - X, y = generate_data(n_samples, n_features=20, noise=10.0) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - - # NGBoost - ngb = NGBRegressor( - Dist=Normal, - n_estimators=100, - learning_rate=0.1, - verbose=False - ) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time = time.perf_counter() - start - - # NGBoost predictions and metrics - ngb_pred = ngb.predict(X_test) - ngb_dist = ngb.pred_dist(X_test) - ngb_nll = float(-ngb_dist.logpdf(y_test).mean()) - ngb_lower = ngb_dist.ppf(0.05) - ngb_upper = ngb_dist.ppf(0.95) - ngb_coverage = float(np.mean((y_test >= ngb_lower) & (y_test <= ngb_upper))) - - # NaturalBoost - warmup first - nb_warmup = ob.NaturalBoostNormal(n_trees=2, max_depth=3) - nb_warmup.fit(X_train[:500], y_train[:500]) - cuda.synchronize() - - # NaturalBoost - nb = ob.NaturalBoostNormal(n_trees=100, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - cuda.synchronize() - nb_time = time.perf_counter() - start - - # NaturalBoost predictions and metrics - nb_pred = nb.predict(X_test) - nb_nll = float(nb.nll(X_test, y_test)) - nb_lower, nb_upper = nb.predict_interval(X_test, alpha=0.1) - nb_coverage = float(np.mean((y_test >= nb_lower) & (y_test <= nb_upper))) - - speedup = ngb_time / nb_time - - print(f" NGBoost: {ngb_time:.2f}s (NLL: {ngb_nll:.4f}, Coverage: {ngb_coverage:.1%})") - print(f" NaturalBoost: {nb_time:.2f}s (NLL: {nb_nll:.4f}, Coverage: {nb_coverage:.1%})") - print(f" Speedup: {speedup:.2f}x") - 
- ngboost_results.append({ - "samples": n_samples, - "ngboost_time": ngb_time, - "ngboost_nll": ngb_nll, - "ngboost_coverage": ngb_coverage, - "naturalboost_time": nb_time, - "naturalboost_nll": nb_nll, - "naturalboost_coverage": nb_coverage, - "speedup": speedup, - }) - - results["benchmarks"]["ngboost"] = ngboost_results - - # California Housing benchmark - print(f"\nCalifornia Housing Dataset:") - data = fetch_california_housing() - X, y = data.data.astype(np.float32), data.target.astype(np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - print(f" Samples: {len(X_train):,} train, {len(X_test):,} test") - - # NGBoost on California Housing - ngb = NGBRegressor(Dist=Normal, n_estimators=100, learning_rate=0.1, verbose=False) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time_cal = time.perf_counter() - start - - # NaturalBoost on California Housing - nb = ob.NaturalBoostNormal(n_trees=100, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - cuda.synchronize() - nb_time_cal = time.perf_counter() - start - - speedup_cal = ngb_time_cal / nb_time_cal - print(f" NGBoost: {ngb_time_cal:.2f}s") - print(f" NaturalBoost: {nb_time_cal:.2f}s") - print(f" Speedup: {speedup_cal:.2f}x") - - results["benchmarks"]["ngboost_california"] = { - "samples": len(X_train), - "ngboost_time": ngb_time_cal, - "naturalboost_time": nb_time_cal, - "speedup": speedup_cal, - } - - # ========================================================================= - # Benchmark 2: OpenBoostGAM vs InterpretML EBM - # ========================================================================= - print("\n" + "=" * 70) - print("BENCHMARK 2: OpenBoostGAM vs InterpretML EBM") - print("=" * 70) - - from interpret.glassbox import ExplainableBoostingRegressor - from openboost import OpenBoostGAM - from sklearn.metrics import r2_score - - ebm_results = [] - - for n_samples in [500_000, 1_000_000, 
2_000_000]: - print(f"\n{n_samples:,} samples:") - - X, y = generate_data(n_samples, n_features=20, noise=0.1) - - # InterpretML EBM - ebm = ExplainableBoostingRegressor( - max_rounds=200, - learning_rate=0.05, - outer_bags=1, - inner_bags=0, - interactions=0, - n_jobs=-1, - ) - start = time.perf_counter() - ebm.fit(X, y) - ebm_time = time.perf_counter() - start - ebm_r2 = r2_score(y, ebm.predict(X)) - - # OpenBoostGAM - warmup - gam_warmup = OpenBoostGAM(n_rounds=10) - gam_warmup.fit(X[:1000], y[:1000]) - cuda.synchronize() - - # OpenBoostGAM - gam = OpenBoostGAM(n_rounds=200, learning_rate=0.05) - start = time.perf_counter() - gam.fit(X, y) - cuda.synchronize() - gam_time = time.perf_counter() - start - gam_r2 = r2_score(y, gam.predict(X)) - - speedup = ebm_time / gam_time - - print(f" EBM: {ebm_time:.2f}s (R²: {ebm_r2:.4f})") - print(f" OpenBoostGAM: {gam_time:.2f}s (R²: {gam_r2:.4f})") - print(f" Speedup: {speedup:.1f}x") - - ebm_results.append({ - "samples": n_samples, - "ebm_time": ebm_time, - "ebm_r2": ebm_r2, - "openboostgam_time": gam_time, - "openboostgam_r2": gam_r2, - "speedup": speedup, - }) - - results["benchmarks"]["ebm"] = ebm_results - - # ========================================================================= - # Summary - # ========================================================================= - print("\n" + "=" * 70) - print("PERFORMANCE SUMMARY") - print("=" * 70) - - print("\nNaturalBoost vs NGBoost (100 trees, Normal distribution):") - print(f"{'Samples':<14} {'NGBoost (s)':<14} {'NaturalBoost (s)':<18} {'Speedup':<10}") - print("-" * 56) - for r in ngboost_results: - print(f"{r['samples']:<14,} {r['ngboost_time']:<14.2f} {r['naturalboost_time']:<18.2f} {r['speedup']:<10.2f}x") - - print("\nOpenBoostGAM vs InterpretML EBM (200 rounds, 20 features):") - print(f"{'Samples':<14} {'EBM (s)':<12} {'OpenBoostGAM (s)':<18} {'Speedup':<10}") - print("-" * 54) - for r in ebm_results: - print(f"{r['samples']:<14,} {r['ebm_time']:<12.2f} 
{r['openboostgam_time']:<18.2f} {r['speedup']:<10.1f}x") - - # Acceptance criteria check - print("\n" + "=" * 70) - print("ACCEPTANCE CRITERIA") - print("=" * 70) - - # Check NaturalBoost >1.3x faster - nb_faster = all(r['speedup'] > 1.0 for r in ngboost_results) - nb_speedup_1m = next(r['speedup'] for r in ngboost_results if r['samples'] == 1_000_000) - print(f"[{'✓' if nb_faster else '✗'}] NaturalBoost faster than NGBoost at all sizes") - print(f"[{'✓' if nb_speedup_1m > 1.3 else '✗'}] NaturalBoost >1.3x faster at 1M samples (actual: {nb_speedup_1m:.2f}x)") - - # Check OpenBoostGAM >10x faster at 2M - gam_speedup_2m = next(r['speedup'] for r in ebm_results if r['samples'] == 2_000_000) - print(f"[{'✓' if gam_speedup_2m > 10 else '✗'}] OpenBoostGAM >10x faster at 2M samples (actual: {gam_speedup_2m:.1f}x)") - - # Check comparable accuracy - gam_r2_comparable = all(abs(r['openboostgam_r2'] - r['ebm_r2']) < 0.1 for r in ebm_results) - print(f"[{'✓' if gam_r2_comparable else '✗'}] OpenBoostGAM comparable R² to EBM") - - results["acceptance_criteria"] = { - "naturalboost_faster_all": nb_faster, - "naturalboost_speedup_1m": nb_speedup_1m, - "openboostgam_speedup_2m": gam_speedup_2m, - "r2_comparable": gam_r2_comparable, - } - - return results - - -class BytesEncoder(json.JSONEncoder): - """Custom JSON encoder that handles bytes objects.""" - def default(self, obj): - if isinstance(obj, bytes): - return obj.decode('utf-8', errors='replace') - return super().default(obj) - - -def convert_bytes_in_dict(obj): - """Recursively convert bytes to strings in a dictionary.""" - if isinstance(obj, bytes): - return obj.decode('utf-8', errors='replace') - elif isinstance(obj, dict): - return {k: convert_bytes_in_dict(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [convert_bytes_in_dict(item) for item in obj] - return obj - - -if modal is not None and app is not None: - _run_performance_report_modal = app.function( - gpu="A100", image=image, timeout=14400 - 
)(run_performance_report) - - @app.local_entrypoint() - def main(): - """Run performance report.""" - print("Running OpenBoost Performance Report on Modal A100...") - print("This may take 10-20 minutes.\n") - - results = _run_performance_report_modal.remote() - - results = convert_bytes_in_dict(results) - - results_dir = PROJECT_ROOT / "benchmarks" / "results" - results_dir.mkdir(exist_ok=True) - - timestamp = time.strftime("%Y%m%d_%H%M%S") - results_file = results_dir / f"performance_report_{timestamp}.json" - - with open(results_file, "w") as f: - json.dump(results, f, indent=2, cls=BytesEncoder) - - print(f"\nResults saved to: {results_file}") - - print("\n" + "=" * 70) - print("README MARKDOWN (copy-paste ready):") - print("=" * 70) - - print(""" -### NaturalBoost vs NGBoost - -| Samples | NGBoost | NaturalBoost (GPU) | Speedup | -|---------|---------|-------------------|---------|""") - for r in results["benchmarks"]["ngboost"]: - print(f"| {r['samples']:,} | {r['ngboost_time']:.1f}s | {r['naturalboost_time']:.1f}s | {r['speedup']:.1f}x |") - - print(""" -*Benchmark: Normal distribution, 100 trees, 20 features, A100 GPU* - -### OpenBoostGAM vs InterpretML EBM - -| Samples | EBM (CPU) | OpenBoostGAM (GPU) | Speedup | -|---------|-----------|-------------------|---------|""") - for r in results["benchmarks"]["ebm"]: - print(f"| {r['samples']:,} | {r['ebm_time']:.0f}s | {r['openboostgam_time']:.1f}s | {r['speedup']:.0f}x |") - - print(""" -*Benchmark: 200 rounds, 20 features, pure GAM (no interactions), A100 GPU* -""") - - -# For local execution without Modal -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally (requires GPU)...") - - import numpy as np - from sklearn.model_selection import train_test_split - - # Add openboost to path - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - import openboost as ob - print(f"OpenBoost backend: {ob.get_backend()}") - - # Quick benchmark - print("\nQuick 
NaturalBoost vs NGBoost benchmark (5K samples):") - X, y = generate_data(5000, 20, noise=10.0) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - try: - from ngboost import NGBRegressor - from ngboost.distns import Normal - - ngb = NGBRegressor(Dist=Normal, n_estimators=50, learning_rate=0.1, verbose=False) - start = time.perf_counter() - ngb.fit(X_train, y_train) - ngb_time = time.perf_counter() - start - print(f" NGBoost: {ngb_time:.2f}s") - except ImportError: - print(" NGBoost not installed. Run: pip install ngboost") - ngb_time = None - - try: - # Warmup JIT - ob.NaturalBoostNormal(n_trees=3, learning_rate=0.1, max_depth=3).fit( - X_train[:500], y_train[:500] - ) - - nb = ob.NaturalBoostNormal(n_trees=50, learning_rate=0.1, max_depth=3) - start = time.perf_counter() - nb.fit(X_train, y_train) - nb_time = time.perf_counter() - start - print(f" NaturalBoost: {nb_time:.2f}s") - - if ngb_time: - print(f" Speedup: {ngb_time / nb_time:.2f}x") - except Exception as e: - print(f" NaturalBoost failed: {e}") - - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/performance_report.py") - print(" Local: uv run python benchmarks/performance_report.py --local") diff --git a/benchmarks/modal_bench.py b/benchmarks/profile_kernels.py similarity index 99% rename from benchmarks/modal_bench.py rename to benchmarks/profile_kernels.py index c9971c1..3ed5bf7 100644 --- a/benchmarks/modal_bench.py +++ b/benchmarks/profile_kernels.py @@ -2,7 +2,7 @@ Run from Mac with: cd openboost - uv run modal run benchmarks/modal_bench.py + uv run modal run benchmarks/profile_kernels.py This will execute benchmarks on a cloud A100 GPU. """ diff --git a/benchmarks/profile_loop.py b/benchmarks/profile_loop.py new file mode 100644 index 0000000..2a17e6f --- /dev/null +++ b/benchmarks/profile_loop.py @@ -0,0 +1,100 @@ +"""Profile OpenBoost training and identify bottlenecks. + +Part of the self-recursive improvement loop: + 1. 
Run this script to profile and identify the top bottleneck + 2. Optimize the target code + 3. Re-run to verify improvement and compare with previous run + +Usage: + uv run python benchmarks/profile_loop.py + uv run python benchmarks/profile_loop.py --n-samples 200000 --n-features 50 + uv run python benchmarks/profile_loop.py --n-trees 200 --max-depth 8 + uv run python benchmarks/profile_loop.py --summarize + uv run python benchmarks/profile_loop.py --growth leafwise +""" + +from __future__ import annotations + +import argparse +import sys +import time + +import numpy as np + + +def main(): + parser = argparse.ArgumentParser(description="Profile OpenBoost training loop") + parser.add_argument("--n-samples", type=int, default=50_000) + parser.add_argument("--n-features", type=int, default=20) + parser.add_argument("--n-trees", type=int, default=100) + parser.add_argument("--max-depth", type=int, default=6) + parser.add_argument("--learning-rate", type=float, default=0.1) + parser.add_argument("--loss", type=str, default="mse") + parser.add_argument("--output-dir", type=str, default="logs/") + parser.add_argument("--summarize", action="store_true", + help="Print machine-readable summary for improvement loops") + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + # Generate synthetic dataset + rng = np.random.RandomState(args.seed) + X = rng.randn(args.n_samples, args.n_features).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + + rng.randn(args.n_samples).astype(np.float32) * 0.1).astype(np.float32) + + import openboost as ob + from openboost._profiler import ProfilingCallback, print_profile_summary + + profiler = ProfilingCallback(output_dir=args.output_dir) + model = ob.GradientBoosting( + n_trees=args.n_trees, + max_depth=args.max_depth, + learning_rate=args.learning_rate, + loss=args.loss, + ) + + print(f"Profiling: {args.n_samples:,} samples, {args.n_features} features, " + f"{args.n_trees} trees, 
depth={args.max_depth}") + + # Warmup JIT (first fit compiles Numba kernels) + warmup_model = ob.GradientBoosting(n_trees=2, max_depth=args.max_depth, loss=args.loss) + warmup_model.fit(X[:1000], y[:1000]) + + wall_start = time.perf_counter() + model.fit(X, y, callbacks=[profiler]) + wall_time = time.perf_counter() - wall_start + + print(f"Wall time: {wall_time:.2f}s") + print(f"Report: {profiler.report_path}") + + if args.summarize: + print() + report = profiler.report + report["_path"] = str(profiler.report_path) + print_profile_summary(report) + else: + # Print compact phase table + report = profiler.report + print(f"\n{'Phase':<20} {'Time (s)':>10} {'%':>8} {'Calls':>8}") + print("-" * 50) + for phase, data in report["phases"].items(): + calls = str(data["calls"]) if data["calls"] is not None else "-" + print(f"{phase:<20} {data['total_s']:>10.3f} {data['pct']:>7.1f}% {calls:>8}") + print("-" * 50) + print(f"{'TOTAL':<20} {report['total_time_s']:>10.3f}") + + if report.get("bottlenecks"): + print(f"\nTop bottleneck: {report['bottlenecks'][0]['phase']} " + f"({report['bottlenecks'][0]['pct']}%)") + print(f" Target: {report['bottlenecks'][0]['target']}") + print(f" Recommendation: {report['bottlenecks'][0]['recommendation']}") + + if report.get("comparison"): + comp = report["comparison"] + delta = comp["delta_total_pct"] + sign = "+" if delta > 0 else "" + print(f"\nvs previous run: {sign}{delta}% total time") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/openml_integration.py b/benchmarks/validate_datasets.py similarity index 98% rename from benchmarks/openml_integration.py rename to benchmarks/validate_datasets.py index 5edd495..0f48c15 100644 --- a/benchmarks/openml_integration.py +++ b/benchmarks/validate_datasets.py @@ -3,19 +3,19 @@ Pre-release validation of OpenBoost vs XGBoost performance on real-world datasets. 
Run on Modal (GPU): - uv run modal run benchmarks/openml_integration.py + uv run modal run benchmarks/validate_datasets.py Run locally (small datasets only): - uv run python benchmarks/openml_integration.py --local + uv run python benchmarks/validate_datasets.py --local Run specific datasets: - uv run modal run benchmarks/openml_integration.py --datasets cpu_act higgs + uv run modal run benchmarks/validate_datasets.py --datasets cpu_act higgs Run specific configs: - uv run modal run benchmarks/openml_integration.py --configs baseline deep_tree + uv run modal run benchmarks/validate_datasets.py --configs baseline deep_tree Run extended suite: - uv run modal run benchmarks/openml_integration.py --extended + uv run modal run benchmarks/validate_datasets.py --extended """ from __future__ import annotations @@ -1269,8 +1269,8 @@ def main( print(f"\nResults saved to: {results_file}") else: print("Usage:") - print(" Modal: uv run modal run benchmarks/openml_integration.py") - print(" Local: uv run python benchmarks/openml_integration.py --local") + print(" Modal: uv run modal run benchmarks/validate_datasets.py") + print(" Local: uv run python benchmarks/validate_datasets.py --local") print("") print("Options:") print(" --datasets cpu_act higgs Run specific datasets") diff --git a/benchmarks/xgboost_benchmark.py b/benchmarks/xgboost_benchmark.py deleted file mode 100644 index cf16644..0000000 --- a/benchmarks/xgboost_benchmark.py +++ /dev/null @@ -1,594 +0,0 @@ -"""Benchmark: OpenBoost vs XGBoost on Multiple Tasks. 
- -Run locally: - uv run python benchmarks/xgboost_benchmark.py --local - -Run on Modal (cloud A100): - uv run modal run benchmarks/xgboost_benchmark.py -""" - -from __future__ import annotations - -from pathlib import Path - -PROJECT_ROOT = Path(__file__).parent.parent - -try: - import modal - - app = modal.App("openboost-xgboost-bench") - - image = ( - modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.12") - .pip_install( - "numpy>=1.24", - "numba>=0.60", - "scikit-learn>=1.0", - "xgboost>=2.0", - ) - .add_local_dir( - str(PROJECT_ROOT / "src" / "openboost"), - remote_path="/root/openboost", - ) - ) -except ImportError: - modal = None - app = None - image = None - - -# ============================================================================= -# Data Generators -# ============================================================================= - -def generate_regression_data(n_samples: int, n_features: int, noise: float = 0.1): - """Generate regression data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Non-linear target with interactions - y = ( - np.sin(X[:, 0] * 2) + - 0.5 * X[:, 1] ** 2 + - 0.3 * X[:, 2] * X[:, 3] + - noise * np.random.randn(n_samples) - ).astype(np.float32) - return X, y - - -def generate_binary_data(n_samples: int, n_features: int): - """Generate binary classification data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + 0.2 * X[:, 3] - probs = 1 / (1 + np.exp(-logits)) - y = (np.random.rand(n_samples) < probs).astype(np.float32) - return X, y - - -def generate_multiclass_data(n_samples: int, n_features: int, n_classes: int = 5): - """Generate multi-class classification data.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Create class boundaries based on first few 
features - scores = np.zeros((n_samples, n_classes)) - for k in range(n_classes): - scores[:, k] = X[:, k % n_features] + 0.5 * X[:, (k + 1) % n_features] - y = np.argmax(scores + 0.5 * np.random.randn(n_samples, n_classes), axis=1) - return X, y.astype(np.int32) - - -def generate_quantile_data(n_samples: int, n_features: int): - """Generate heteroscedastic data for quantile regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Heteroscedastic noise: variance depends on X[:, 0] - noise_std = 0.5 + np.abs(X[:, 0]) - y = (X[:, 0] + 0.5 * X[:, 1] + noise_std * np.random.randn(n_samples)).astype(np.float32) - return X, y - - -def generate_poisson_data(n_samples: int, n_features: int): - """Generate count data for Poisson regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Log-linear model - log_mu = 1.0 + 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] - mu = np.exp(np.clip(log_mu, -5, 5)) - y = np.random.poisson(mu).astype(np.float32) - return X, y - - -def generate_gamma_data(n_samples: int, n_features: int): - """Generate positive continuous data for Gamma regression.""" - import numpy as np - np.random.seed(42) - X = np.random.randn(n_samples, n_features).astype(np.float32) - # Log-linear model for mean - log_mu = 2.0 + 0.3 * X[:, 0] + 0.2 * X[:, 1] - mu = np.exp(np.clip(log_mu, -3, 5)) - # Gamma with shape=2 - shape = 2.0 - scale = mu / shape - y = np.random.gamma(shape, scale).astype(np.float32) - return X, y - - -# ============================================================================= -# Benchmark Functions -# ============================================================================= - -def benchmark_regression(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark regression task.""" - import numpy as np - import time - from sklearn.metrics import mean_squared_error, r2_score 
- - results = {} - - # OpenBoost - import openboost as ob - model = ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='mse', - ) - - # Warmup - ob.GradientBoosting(n_trees=5, max_depth=3).fit(X_train[:1000], y_train[:1000]) - if use_gpu: - from numba import cuda - cuda.synchronize() - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'r2': r2_score(y_test, y_pred), - 'rmse': np.sqrt(mean_squared_error(y_test, y_pred)), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBRegressor( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'r2': r2_score(y_test, y_pred_xgb), - 'rmse': np.sqrt(mean_squared_error(y_test, y_pred_xgb)), - } - - return results - - -def benchmark_binary(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark binary classification task.""" - import numpy as np - import time - from sklearn.metrics import roc_auc_score, accuracy_score - - results = {} - - # OpenBoost - import openboost as ob - - # Warmup JIT - ob.GradientBoosting(n_trees=5, max_depth=3, loss='logloss').fit( - X_train[:1000], y_train[:1000] - ) - if use_gpu: - from numba import cuda - cuda.synchronize() - - model = 
ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='logloss', - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_raw = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - # Convert to probabilities - y_pred_prob = 1 / (1 + np.exp(-y_pred_raw)) - y_pred = (y_pred_prob > 0.5).astype(int) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'auc': roc_auc_score(y_test, y_pred_prob), - 'accuracy': accuracy_score(y_test, y_pred), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBClassifier( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb_prob = xgb_model.predict_proba(X_test)[:, 1] - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred_xgb = (y_pred_xgb_prob > 0.5).astype(int) - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'auc': roc_auc_score(y_test, y_pred_xgb_prob), - 'accuracy': accuracy_score(y_test, y_pred_xgb), - } - - return results - - -def benchmark_multiclass(X_train, X_test, y_train, y_test, n_classes=5, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark multi-class classification task.""" - import numpy as np - import time - from sklearn.metrics import accuracy_score, log_loss - - results = {} - - # OpenBoost MultiClass - import openboost as ob - - # Warmup JIT - ob.MultiClassGradientBoosting( - n_classes=n_classes, n_trees=5, max_depth=3 - ).fit(X_train[:1000], y_train[:1000]) - if use_gpu: - from numba import 
cuda - cuda.synchronize() - - model = ob.MultiClassGradientBoosting( - n_classes=n_classes, - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_prob = model.predict_proba(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred = np.argmax(y_pred_prob, axis=1) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'accuracy': accuracy_score(y_test, y_pred), - 'logloss': log_loss(y_test, y_pred_prob), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBClassifier( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - objective='multi:softprob', - num_class=n_classes, - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb_prob = xgb_model.predict_proba(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - y_pred_xgb = np.argmax(y_pred_xgb_prob, axis=1) - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'accuracy': accuracy_score(y_test, y_pred_xgb), - 'logloss': log_loss(y_test, y_pred_xgb_prob), - } - - return results - - -def benchmark_poisson(X_train, X_test, y_train, y_test, n_trees=100, max_depth=6, use_gpu=False): - """Benchmark Poisson regression task.""" - import numpy as np - import time - - def poisson_deviance(y_true, y_pred): - """Compute Poisson deviance.""" - y_pred = np.maximum(y_pred, 1e-8) - return 2 * np.mean(y_pred - y_true + y_true * np.log(np.maximum(y_true, 1e-8) / y_pred)) - - results = {} - - # OpenBoost - import openboost as ob - - # Warmup JIT - 
ob.GradientBoosting(n_trees=5, max_depth=3, loss='poisson').fit( - X_train[:1000], y_train[:1000] - ) - if use_gpu: - from numba import cuda - cuda.synchronize() - - model = ob.GradientBoosting( - n_trees=n_trees, - max_depth=max_depth, - learning_rate=0.1, - loss='poisson', - ) - - start = time.perf_counter() - model.fit(X_train, y_train) - if use_gpu: - from numba import cuda - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_raw = model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - # Poisson uses log link - y_pred = np.exp(y_pred_raw) - - results['openboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'deviance': poisson_deviance(y_test, y_pred), - 'mean_pred': np.mean(y_pred), - } - - # XGBoost - import xgboost as xgb - xgb_model = xgb.XGBRegressor( - n_estimators=n_trees, - max_depth=max_depth, - learning_rate=0.1, - tree_method='hist', - device='cuda' if use_gpu else 'cpu', - objective='count:poisson', - ) - - start = time.perf_counter() - xgb_model.fit(X_train, y_train) - if use_gpu: - cuda.synchronize() - train_time = time.perf_counter() - start - - start = time.perf_counter() - y_pred_xgb = xgb_model.predict(X_test) - if use_gpu: - cuda.synchronize() - pred_time = time.perf_counter() - start - - results['xgboost'] = { - 'train_time': train_time, - 'pred_time': pred_time * 1000, - 'deviance': poisson_deviance(y_test, y_pred_xgb), - 'mean_pred': np.mean(y_pred_xgb), - } - - return results - - -# ============================================================================= -# Main Benchmark Runner -# ============================================================================= - -def print_results(task_name, results, metric1_name, metric2_name): - """Print formatted results.""" - print(f"\n{'─' * 60}") - print(f"Task: {task_name}") - print(f"{'─' * 60}") - print(f"{'Model':<15} {'Train (s)':<12} {'Pred (ms)':<12} {metric1_name:<12} 
{metric2_name:<12}") - print(f"{'─' * 60}") - - metric_keys = [k for k in list(results['openboost'].keys()) if k not in ('train_time', 'pred_time')] - for name in ['openboost', 'xgboost']: - r = results[name] - m1 = r[metric_keys[0]] if metric_keys else 0.0 - m2 = r[metric_keys[1]] if len(metric_keys) > 1 else 0.0 - print(f"{name:<15} {r['train_time']:<12.3f} {r['pred_time']:<12.2f} {m1:<12.4f} {m2:<12.4f}") - - # Speedup - speedup = results['xgboost']['train_time'] / results['openboost']['train_time'] - print(f"{'─' * 60}") - print(f"Speedup: {speedup:.2f}x {'(OpenBoost faster)' if speedup > 1 else '(XGBoost faster)'}") - - -def run_all_benchmarks(n_samples=50_000, n_features=20, n_trees=100, max_depth=6, use_gpu=False): - """Run all benchmark tasks.""" - from sklearn.model_selection import train_test_split - - print("=" * 60) - print("OPENBOOST vs XGBOOST BENCHMARK") - print("=" * 60) - print(f"Config: {n_samples:,} samples, {n_features} features, {n_trees} trees, depth {max_depth}") - print(f"Device: {'GPU' if use_gpu else 'CPU'}") - - all_results = {} - - # 1. Regression - print("\n[1/5] Regression (MSE)...") - X, y = generate_regression_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_regression(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Regression (MSE)", results, "R²", "RMSE") - all_results['regression'] = results - - # 2. Binary Classification - print("\n[2/5] Binary Classification (LogLoss)...") - X, y = generate_binary_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_binary(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Binary Classification", results, "AUC", "Accuracy") - all_results['binary'] = results - - # 3. 
Multi-class Classification - print("\n[3/5] Multi-class Classification (Softmax)...") - n_classes = 5 - X, y = generate_multiclass_data(n_samples, n_features, n_classes) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_multiclass(X_train, X_test, y_train, y_test, n_classes, n_trees, max_depth, use_gpu) - print_results("Multi-class (5 classes)", results, "Accuracy", "LogLoss") - all_results['multiclass'] = results - - # 4. Poisson Regression - print("\n[4/5] Poisson Regression...") - X, y = generate_poisson_data(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - results = benchmark_poisson(X_train, X_test, y_train, y_test, n_trees, max_depth, use_gpu) - print_results("Poisson Regression", results, "Deviance", "Mean Pred") - all_results['poisson'] = results - - # Summary - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - print(f"\n{'Task':<25} {'OpenBoost (s)':<15} {'XGBoost (s)':<15} {'Speedup':<10}") - print("─" * 65) - - for task, res in all_results.items(): - ob_time = res['openboost']['train_time'] - xgb_time = res['xgboost']['train_time'] - speedup = xgb_time / ob_time - faster = "OB" if speedup > 1 else "XGB" - print(f"{task:<25} {ob_time:<15.3f} {xgb_time:<15.3f} {speedup:.2f}x ({faster})") - - return all_results - - -# ============================================================================= -# Modal Entry Points (only defined when modal is installed) -# ============================================================================= - -if modal is not None and app is not None: - - @app.function(gpu="A100", image=image, timeout=1800) - def benchmark_gpu(n_samples: int = 100_000, n_features: int = 20, n_trees: int = 100): - """Run benchmark on GPU.""" - import sys - sys.path.insert(0, "/root") - - from numba import cuda - print(f"GPU: {cuda.get_current_device().name}") - - return run_all_benchmarks( - 
n_samples=n_samples, - n_features=n_features, - n_trees=n_trees, - max_depth=6, - use_gpu=True, - ) - - @app.local_entrypoint() - def main(): - """Run benchmark on Modal.""" - print("Running OpenBoost vs XGBoost benchmark on Modal A100...") - results = benchmark_gpu.remote(n_samples=100_000, n_features=20, n_trees=100) - print("\n\nFinal Results:") - print(results) - - -# ============================================================================= -# Local Execution -# ============================================================================= - -if __name__ == "__main__": - import sys - - if len(sys.argv) > 1 and sys.argv[1] == "--local": - print("Running locally on CPU...") - - sys.path.insert(0, str(PROJECT_ROOT / "src")) - - run_all_benchmarks( - n_samples=20_000, # Smaller for CPU - n_features=20, - n_trees=50, - max_depth=6, - use_gpu=False, - ) - else: - print("Usage:") - print(" Modal: uv run modal run benchmarks/xgboost_benchmark.py") - print(" Local: uv run python benchmarks/xgboost_benchmark.py --local") diff --git a/pyproject.toml b/pyproject.toml index 9136a45..87a468a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ sklearn = [ test = [ "pytest>=7.0", "pytest-cov>=4.0", + "pytest-xdist>=3.0", ] # Benchmarking bench = [ @@ -92,7 +93,16 @@ ignore = ["E501"] # Line length handled separately [tool.pytest.ini_options] testpaths = ["tests"] -addopts = "-v --tb=short" +addopts = "-v --tb=short -n auto --dist loadfile" +markers = [ + "slow: marks tests that take >10s (deselect with '-m \"not slow\"')", + "gpu: marks tests requiring CUDA GPU", + "xgboost: marks tests requiring xgboost package", + "lightgbm: marks tests requiring lightgbm package", + "numerical: marks numerical agreement tests against reference implementations", + "parity: marks CPU/GPU parity tests", + "benchmark: marks performance benchmark tests (not run in CI)", +] [dependency-groups] dev = [ @@ -104,6 +114,7 @@ dev = [ "mypy>=1.19.1", "pytest>=7.0", "pytest-cov>=4.0", 
+ "pytest-xdist>=3.0", "ruff>=0.4", "xgboost>=2.0", ] diff --git a/src/openboost/__init__.py b/src/openboost/__init__.py index 4bbbc56..206e100 100644 --- a/src/openboost/__init__.py +++ b/src/openboost/__init__.py @@ -161,6 +161,7 @@ CallbackManager, TrainingState, ) +from ._profiler import ProfilingCallback # ============================================================================= # Feature Importance (Phase 13) @@ -352,6 +353,7 @@ def __getattr__(name: str): "HistoryCallback", "CallbackManager", "TrainingState", + "ProfilingCallback", # Feature importance (Phase 13) "compute_feature_importances", "get_feature_importance_dict", diff --git a/src/openboost/_core/_growth.py b/src/openboost/_core/_growth.py index e9cb773..b45cf67 100644 --- a/src/openboost/_core/_growth.py +++ b/src/openboost/_core/_growth.py @@ -214,6 +214,9 @@ class TreeStructure: # Phase 14.3: Categorical split support is_categorical_split: NDArray | None = None # (n_nodes,) bool - True if categorical split cat_bitsets: NDArray | None = None # (n_nodes,) uint64 - bitmask for categories going left + + # Cached GPU arrays for fast repeated prediction (avoids re-transferring) + _gpu_arrays: dict | None = field(default=None, repr=False) def get_leaf_values(self, leaf_ids: NDArray) -> NDArray: """Get leaf values for given leaf IDs. 
@@ -335,38 +338,51 @@ def _predict_standard_cpu(self, binned: NDArray) -> NDArray: # Get leaf values (works with both NDArray and LeafValues) return self.get_leaf_values(leaf_ids) - def _predict_standard_gpu(self, binned) -> NDArray: - """GPU prediction for standard trees.""" - from numba import cuda - from .._backends._cuda import predict_cuda, predict_with_categorical_cuda, to_device + def _ensure_gpu_arrays(self): + """Cache tree structure arrays on GPU to avoid repeated transfers.""" + if self._gpu_arrays is not None: + return self._gpu_arrays + from .._backends._cuda import to_device + self._gpu_arrays = { + 'features': to_device(self.features), + 'thresholds': to_device(self.thresholds.astype(np.uint8)), + 'values': to_device(self.values if isinstance(self.values, np.ndarray) else self.leaf_values_array), + 'left': to_device(self.left_children), + 'right': to_device(self.right_children), + 'missing_left': to_device(self.missing_go_left) if self.missing_go_left is not None else None, + } has_categorical = ( self.is_categorical_split is not None and self.cat_bitsets is not None and np.any(self.is_categorical_split) ) - if has_categorical: + self._gpu_arrays['is_categorical'] = to_device(self.is_categorical_split) + self._gpu_arrays['cat_bitsets'] = to_device(self.cat_bitsets) + return self._gpu_arrays + + def _predict_standard_gpu(self, binned) -> NDArray: + """GPU prediction for standard trees.""" + from .._backends._cuda import predict_cuda, predict_with_categorical_cuda + + ga = self._ensure_gpu_arrays() + + if 'is_categorical' in ga: return predict_with_categorical_cuda( binned, - to_device(self.features), - to_device(self.thresholds.astype(np.uint8)), - to_device(self.values), - to_device(self.left_children), - to_device(self.right_children), - tree_missing_left=to_device(self.missing_go_left) if self.missing_go_left is not None else None, - is_categorical_split=to_device(self.is_categorical_split), - cat_bitsets=to_device(self.cat_bitsets), + 
ga['features'], ga['thresholds'], ga['values'], + ga['left'], ga['right'], + tree_missing_left=ga['missing_left'], + is_categorical_split=ga['is_categorical'], + cat_bitsets=ga['cat_bitsets'], ) return predict_cuda( binned, - to_device(self.features), - to_device(self.thresholds.astype(np.uint8)), - to_device(self.values), - to_device(self.left_children), - to_device(self.right_children), - tree_missing_left=to_device(self.missing_go_left) if self.missing_go_left is not None else None, + ga['features'], ga['thresholds'], ga['values'], + ga['left'], ga['right'], + tree_missing_left=ga['missing_left'], ) def _predict_symmetric(self, binned: NDArray) -> NDArray: @@ -517,20 +533,34 @@ def grow( # Build level by level for depth in range(config.max_depth): nodes_at_level = get_nodes_at_depth(depth) - + # Filter to nodes that have samples active_nodes = self._get_active_nodes(sample_node_ids, nodes_at_level) if not active_nodes: break - # Build histograms for active nodes - histograms = build_node_histograms( - binned, grad, hess, sample_node_ids, active_nodes - ) + # Histogram subtraction: build only smaller children, subtract for larger + if depth > 0 and parent_histograms: + build_nodes, subtract_info = self._plan_histogram_subtraction( + active_nodes, sample_node_ids, parent_histograms + ) + # Build histograms only for the subset that needs full computation + histograms = build_node_histograms( + binned, grad, hess, sample_node_ids, build_nodes + ) if build_nodes else {} + # Derive larger children's histograms via subtraction (O(features*bins)) + for child_id, (parent_hist, sibling_id) in subtract_info.items(): + histograms[child_id] = subtract_histogram( + parent_hist, histograms[sibling_id], child_id + ) + else: + histograms = build_node_histograms( + binned, grad, hess, sample_node_ids, active_nodes + ) # Column subsampling: zero out non-selected feature histograms if col_mask is not None: - for node_id, hist in histograms.items(): + for _node_id, hist in 
histograms.items(): hist.hist_grad[~col_mask] = 0.0 hist.hist_hess[~col_mask] = 0.0 @@ -544,7 +574,7 @@ def grow( is_categorical=is_categorical, # Phase 14.3 n_categories=n_categories, # Phase 14.3 ) - + # Only update depth if at least one valid split was found if splits: actual_depth = depth + 1 @@ -558,15 +588,15 @@ def grow( missing_go_left[node_id] = node_split.missing_go_left # Phase 14 is_categorical_split[node_id] = node_split.is_categorical # Phase 14.3 cat_bitsets[node_id] = node_split.cat_bitset # Phase 14.3 - + # Partition samples (handles missing via learned direction) if splits: sample_node_ids = partition_samples( - binned, sample_node_ids, splits, + binned, sample_node_ids, splits, missing_go_left=missing_go_left # Phase 14 ) - - # Store histograms for potential subtraction (future optimization) + + # Store histograms for subtraction at next level parent_histograms = histograms # Compute leaf values for all leaf nodes @@ -598,6 +628,69 @@ def grow( cat_bitsets=cat_bitsets[:n_nodes] if any_cat else None, # Phase 14.3 ) + def _plan_histogram_subtraction( + self, + active_nodes: list[int], + sample_node_ids, + parent_histograms: dict[int, NodeHistogram], + ) -> tuple[list[int], dict[int, tuple[NodeHistogram, int]]]: + """Plan which children to build vs subtract. + + For each parent that split, build the histogram for the smaller child + and derive the larger child via subtraction: larger = parent - smaller. + This halves histogram computation on average. + + Returns: + build_nodes: Nodes whose histograms must be built from samples. 
+ subtract_info: {child_id: (parent_histogram, sibling_id_to_subtract_from)} + """ + if hasattr(sample_node_ids, 'copy_to_host'): + ids_cpu = sample_node_ids.copy_to_host() + else: + ids_cpu = np.asarray(sample_node_ids) + + # Count samples per node + node_counts: dict[int, int] = {} + for nid in active_nodes: + node_counts[nid] = int(np.sum(ids_cpu == nid)) + + build_nodes: list[int] = [] + subtract_info: dict[int, tuple[NodeHistogram, int]] = {} + + # Group children by parent + processed_parents: set[int] = set() + for nid in active_nodes: + parent_id = (nid - 1) // 2 + if parent_id in processed_parents: + continue + if parent_id not in parent_histograms: + # No parent histogram — must build from samples + build_nodes.append(nid) + continue + + # Find sibling + left_child = 2 * parent_id + 1 + right_child = 2 * parent_id + 2 + sibling = right_child if nid == left_child else left_child + + # Both children must be active for subtraction + if sibling not in node_counts: + build_nodes.append(nid) + continue + + processed_parents.add(parent_id) + parent_hist = parent_histograms[parent_id] + + # Build the smaller child, subtract for the larger + if node_counts[left_child] <= node_counts[right_child]: + build_nodes.append(left_child) + subtract_info[right_child] = (parent_hist, left_child) + else: + build_nodes.append(right_child) + subtract_info[left_child] = (parent_hist, right_child) + + return build_nodes, subtract_info + def _get_active_nodes(self, sample_node_ids, candidate_nodes: list[int]) -> list[int]: """Get nodes that have samples assigned to them.""" if hasattr(sample_node_ids, 'copy_to_host'): diff --git a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index 0ecc6c2..a9a1e7a 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -14,6 +14,7 @@ from __future__ import annotations +import os from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Callable, 
Literal @@ -227,6 +228,12 @@ def fit( """ # Clear any previous fit self.trees_ = [] + + # Auto-enable profiling via env var + if os.environ.get("OPENBOOST_PROFILE"): + from .._profiler import ProfilingCallback + _profile_dir = os.environ.get("OPENBOOST_PROFILE_DIR", "logs/") + callbacks = list(callbacks or []) + [ProfilingCallback(output_dir=_profile_dir)] # Validate inputs (Phase 20.3) X = validate_X(X, allow_nan=True, context="fit") @@ -700,45 +707,72 @@ def _fit_gpu( colsample_bytree=self.colsample_bytree, ) else: - # Standard training (with optional row/col subsampling in fit_tree) - tree = fit_tree( - self.X_binned_, - grad_gpu, - hess_gpu, - max_depth=self.max_depth, - min_child_weight=self.min_child_weight, - reg_lambda=self.reg_lambda, - reg_alpha=self.reg_alpha, - gamma=self.gamma, - subsample=self.subsample, - colsample_bytree=self.colsample_bytree, + # Use GPU-native tree builder when no features require the + # growth-strategy path (reg_alpha, colsample, subsample, etc.) + use_gpu_native = ( + is_cuda() + and self.reg_alpha == 0.0 + and self.colsample_bytree >= 1.0 + and self.subsample >= 1.0 ) + if use_gpu_native: + from .._core._tree import fit_tree_gpu_native + tree = fit_tree_gpu_native( + self.X_binned_, + grad_gpu, + hess_gpu, + max_depth=self.max_depth, + min_child_weight=self.min_child_weight, + reg_lambda=self.reg_lambda, + min_gain=self.gamma, + ) + else: + tree = fit_tree( + self.X_binned_, + grad_gpu, + hess_gpu, + max_depth=self.max_depth, + min_child_weight=self.min_child_weight, + reg_lambda=self.reg_lambda, + reg_alpha=self.reg_alpha, + gamma=self.gamma, + subsample=self.subsample, + colsample_bytree=self.colsample_bytree, + ) - # Update predictions - tree_pred = tree(self.X_binned_) - if hasattr(tree_pred, 'copy_to_host'): - tree_pred_cpu = tree_pred.copy_to_host() + # Update predictions on GPU + from .._core._tree import Tree + if isinstance(tree, Tree) and tree.on_gpu: + # Fused traversal + add: single kernel, no intermediate array + 
from .._core._predict import predict_tree_add_gpu + predict_tree_add_gpu(tree, self.X_binned_, pred_gpu, self.learning_rate) else: - tree_pred_cpu = tree_pred - - # Update GPU predictions - pred_cpu = pred_gpu.copy_to_host() - pred_cpu += self.learning_rate * tree_pred_cpu - cuda.to_device(pred_cpu, to=pred_gpu) - + tree_pred = tree(self.X_binned_) + if hasattr(tree_pred, '__cuda_array_interface__'): + from .._core._predict import _add_inplace_cuda + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + else: + if hasattr(tree_pred, 'copy_to_host'): + tree_pred = tree_pred.copy_to_host() + pred_cpu = pred_gpu.copy_to_host() + pred_cpu += self.learning_rate * tree_pred + cuda.to_device(pred_cpu, to=pred_gpu) + self.trees_.append(tree) - - # Compute losses for callbacks using actual loss function - state.train_loss = _compute_loss_value(self.loss, pred_cpu, y, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) - if eval_set: - X_val, y_val = eval_set[0] - val_pred = self.predict(X_val) - state.val_loss = _compute_loss_value(self.loss, val_pred, y_val, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) + # Only compute loss and copy to CPU when callbacks need it + if cb_manager.callbacks: + pred_cpu = pred_gpu.copy_to_host() + state.train_loss = _compute_loss_value(self.loss, pred_cpu, y, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) - # Check if callbacks want to stop - if not cb_manager.on_round_end(state): - break + if eval_set: + X_val, y_val = eval_set[0] + val_pred = self.predict(X_val) + state.val_loss = _compute_loss_value(self.loss, val_pred, y_val, quantile_alpha=self.quantile_alpha, tweedie_rho=self.tweedie_rho) + + # Check if callbacks want to stop + if not cb_manager.on_round_end(state): + break cb_manager.on_train_end(state) @@ -907,6 +941,19 @@ def predict(self, X: NDArray | BinnedArray) -> NDArray: # Accumulate tree predictions with base score base = getattr(self, 'base_score_', np.float32(0.0)) 
+ + # Use GPU accumulation when data is on GPU + if is_cuda() and hasattr(X_binned.data, '__cuda_array_interface__'): + from numba import cuda + + from .._core._predict import _add_inplace_cuda, _fill_cuda + pred_gpu = cuda.device_array(n_samples, dtype=np.float32) + _fill_cuda(pred_gpu, float(base)) + for tree in self.trees_: + tree_pred = tree(X_binned) + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + return pred_gpu.copy_to_host() + pred = np.full(n_samples, base, dtype=np.float32) for tree in self.trees_: tree_pred = tree(X_binned) diff --git a/src/openboost/_profiler.py b/src/openboost/_profiler.py new file mode 100644 index 0000000..b3e429f --- /dev/null +++ b/src/openboost/_profiler.py @@ -0,0 +1,495 @@ +"""Profiling callback for OpenBoost training. + +Instruments the training loop to produce structured JSON reports that +break down time by phase (histogram building, split finding, partitioning, +etc.). Designed for self-recursive improvement loops: profile → identify +bottleneck → optimize → re-profile → verify improvement. 
+ +Usage: + # Explicit callback + from openboost import GradientBoosting, ProfilingCallback + profiler = ProfilingCallback(output_dir="logs/") + model = GradientBoosting(n_trees=100) + model.fit(X, y, callbacks=[profiler]) + print(profiler.report_path) + + # Environment variable (zero-code-change) + OPENBOOST_PROFILE=1 uv run python train.py +""" + +from __future__ import annotations + +import json +import os +import platform +import subprocess +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from ._callbacks import Callback, TrainingState + + +# ============================================================================= +# Phase timer +# ============================================================================= + +class PhaseTimer: + """Accumulates time for a named phase across multiple calls.""" + + __slots__ = ("name", "_use_cuda", "_times", "_start") + + def __init__(self, name: str, use_cuda: bool = False): + self.name = name + self._use_cuda = use_cuda + self._times: list[float] = [] + self._start: float | None = None + + def start(self) -> None: + if self._use_cuda: + from numba import cuda + cuda.synchronize() + self._start = time.perf_counter() + + def stop(self) -> float: + if self._use_cuda: + from numba import cuda + cuda.synchronize() + elapsed = time.perf_counter() - self._start + self._times.append(elapsed) + self._start = None + return elapsed + + @property + def total(self) -> float: + return sum(self._times) + + @property + def count(self) -> int: + return len(self._times) + + @property + def mean(self) -> float: + return self.total / self.count if self._times else 0.0 + + def to_dict(self, total_time: float) -> dict: + return { + "total_s": round(self.total, 6), + "pct": round(100 * self.total / total_time, 2) if total_time > 0 else 0, + "calls": self.count, + "mean_s": round(self.mean, 6), + } + + +# ============================================================================= +# 
Bottleneck recommendations +# ============================================================================= + +PHASE_RECOMMENDATIONS: dict[str, tuple[str, str]] = { + "histogram_build": ( + "_backends/_cpu.py:build_histogram_cpu, _backends/_cuda.py:_build_histogram_shared_kernel", + "shared-memory tiling, feature batching, reducing n_bins", + ), + "split_find": ( + "_core/_primitives.py:find_node_splits, _core/_split.py:find_best_split", + "GPU parallel scan, vectorized prefix-sum split evaluation", + ), + "partition": ( + "_core/_primitives.py:partition_samples", + "radix-sort-based partitioning, sorted index schemes", + ), + "gradient_compute": ( + "_loss.py loss functions", + "fused GPU kernels, avoiding CPU-GPU copies for custom losses", + ), + "prediction_update": ( + "_models/_boosting.py prediction update loop", + "fusing tree traversal + add, batching prediction updates", + ), + "leaf_values": ( + "_core/_primitives.py:compute_leaf_values", + "GPU reduction kernel, batch leaf computation", + ), + "tree_overhead": ( + "_core/_tree.py:fit_tree, _core/_growth.py:LevelWiseGrowth.grow", + "reduce Python overhead in growth loop, minimize object allocation", + ), + "grad_pred_loss": ( + "_models/_boosting.py training loop (loss_fn, tree predict, loss eval)", + "fuse gradient+prediction, skip loss eval when no callbacks need it", + ), +} + + +# ============================================================================= +# Hardware info +# ============================================================================= + +def _collect_hardware_info() -> dict: + info: dict[str, Any] = { + "cpu": platform.processor() or platform.machine(), + "cpu_cores": os.cpu_count(), + "ram_gb": None, + "gpu": None, + "gpu_memory_gb": None, + } + if platform.system() == "Darwin": + try: + result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, text=True, timeout=5, + ) + info["ram_gb"] = round(int(result.stdout.strip()) / (1024**3), 1) + except Exception: + 
pass + elif platform.system() == "Linux": + try: + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal"): + info["ram_gb"] = round(int(line.split()[1]) / (1024**2), 1) + break + except Exception: + pass + try: + from numba import cuda + if cuda.is_available(): + dev = cuda.get_current_device() + info["gpu"] = dev.name.decode() if isinstance(dev.name, bytes) else str(dev.name) + except Exception: + pass + return info + + +def _get_git_sha() -> str | None: + try: + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=5, + ) + return result.stdout.strip() if result.returncode == 0 else None + except Exception: + return None + + +# ============================================================================= +# Profiling callback +# ============================================================================= + +_PRIMITIVES_TO_WRAP = [ + "build_node_histograms", + "find_node_splits", + "partition_samples", + "compute_leaf_values", +] + +_PHASE_NAMES = { + "build_node_histograms": "histogram_build", + "find_node_splits": "split_find", + "partition_samples": "partition", + "compute_leaf_values": "leaf_values", +} + +# Modules where fit_tree is imported and needs wrapping +_FIT_TREE_MODULES = [ + "openboost._core._tree", + "openboost._models._boosting", +] + + +class ProfilingCallback(Callback): + """Profile training phases and produce structured JSON reports. + + Wraps core primitive functions with timers during training to measure + per-phase time breakdown. Writes a JSON report to output_dir on completion. + + Args: + output_dir: Directory for profile JSON files. Created if missing. + compare_last: If True, compare with the most recent previous profile. 
+ """ + + def __init__(self, output_dir: str = "logs/", compare_last: bool = True): + self.output_dir = Path(output_dir) + self.compare_last = compare_last + + self._timers: dict[str, PhaseTimer] = {} + self._tree_timers: list[dict[str, float]] = [] + self._round_start: float = 0.0 + self._round_phase_snapshot: dict[str, float] = {} + self._train_start: float = 0.0 + self._originals: dict[str, Any] = {} + self._use_cuda: bool = False + + self.report_path: Path | None = None + self.report: dict | None = None + + def _get_timer(self, name: str) -> PhaseTimer: + if name not in self._timers: + self._timers[name] = PhaseTimer(name, use_cuda=self._use_cuda) + return self._timers[name] + + # ----- wrapping / unwrapping ----- + + def _wrap_primitives(self) -> None: + import sys + import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + + # Wrap the 4 core primitives + for func_name in _PRIMITIVES_TO_WRAP: + original = getattr(prims_mod, func_name) + self._originals[("prim", func_name)] = original + phase_name = _PHASE_NAMES[func_name] + timer = self._get_timer(phase_name) + + def make_wrapper(orig, tmr): + def wrapper(*args, **kwargs): + tmr.start() + result = orig(*args, **kwargs) + tmr.stop() + return result + return wrapper + + wrapped = make_wrapper(original, timer) + setattr(prims_mod, func_name, wrapped) + if hasattr(growth_mod, func_name): + setattr(growth_mod, func_name, wrapped) + + # Wrap fit_tree to capture total tree-building time (includes orchestration) + fit_tree_timer = self._get_timer("fit_tree") + for mod_name in _FIT_TREE_MODULES: + mod = sys.modules.get(mod_name) + if mod and hasattr(mod, "fit_tree"): + original_ft = getattr(mod, "fit_tree") + self._originals[("fit_tree", mod_name)] = original_ft + wrapped_ft = make_wrapper(original_ft, fit_tree_timer) + setattr(mod, "fit_tree", wrapped_ft) + + def _unwrap_primitives(self) -> None: + import sys + import openboost._core._primitives as prims_mod + import 
openboost._core._growth as growth_mod + + for key, original in self._originals.items(): + kind, name = key + if kind == "prim": + setattr(prims_mod, name, original) + if hasattr(growth_mod, name): + setattr(growth_mod, name, original) + elif kind == "fit_tree": + mod = sys.modules.get(name) + if mod: + setattr(mod, "fit_tree", original) + self._originals.clear() + + # ----- callback hooks ----- + + def on_train_begin(self, state: TrainingState) -> None: + from ._backends import is_cuda + self._use_cuda = is_cuda() + self._timers.clear() + self._tree_timers.clear() + self._wrap_primitives() + self._train_start = time.perf_counter() + + def on_round_begin(self, state: TrainingState) -> None: + self._round_start = time.perf_counter() + self._round_phase_snapshot = { + name: timer.total for name, timer in self._timers.items() + } + + def on_round_end(self, state: TrainingState) -> bool: + round_total = time.perf_counter() - self._round_start + tree_entry: dict[str, float] = {"round": state.round_idx, "total_s": round_total} + for name, timer in self._timers.items(): + prev = self._round_phase_snapshot.get(name, 0.0) + tree_entry[f"{name}_s"] = round(timer.total - prev, 6) + # Compute per-tree derived phases + ft = tree_entry.get("fit_tree_s", 0) + prims = sum(tree_entry.get(f"{p}_s", 0) for p in + ("histogram_build", "split_find", "partition", "leaf_values")) + tree_entry["tree_overhead_s"] = round(max(0.0, ft - prims), 6) + tree_entry["grad_pred_loss_s"] = round(max(0.0, round_total - ft), 6) + self._tree_timers.append(tree_entry) + return True + + def on_train_end(self, state: TrainingState) -> None: + total_time = time.perf_counter() - self._train_start + self._unwrap_primitives() + + # Compute derived phases from per-tree data + # round_total = gradient_compute + fit_tree + prediction_update + loss_eval + # fit_tree = primitives + orchestration_overhead + total_round_time = sum(t["total_s"] for t in self._tree_timers) + fit_tree_total = 
self._timers["fit_tree"].total if "fit_tree" in self._timers else 0 + # Time outside fit_tree but inside rounds = grad compute + pred update + loss eval + outside_tree = max(0.0, total_round_time - fit_tree_total) + # Primitives total + prims_total = sum( + self._timers[p].total for p in ("histogram_build", "split_find", "partition", "leaf_values") + if p in self._timers + ) + # Orchestration = fit_tree - primitives (Python overhead in growth strategies) + orchestration = max(0.0, fit_tree_total - prims_total) + + # Build phases dict (show the most useful breakdown) + phases = {} + for name in ("histogram_build", "split_find", "partition", "leaf_values"): + if name in self._timers: + phases[name] = self._timers[name].to_dict(total_time) + # Add fit_tree orchestration overhead + if orchestration > 0: + n_trees = self._timers["fit_tree"].count if "fit_tree" in self._timers else 0 + phases["tree_overhead"] = { + "total_s": round(orchestration, 6), + "pct": round(100 * orchestration / total_time, 2) if total_time > 0 else 0, + "calls": n_trees, + "mean_s": round(orchestration / n_trees, 6) if n_trees > 0 else 0, + } + # Add outside-tree time (gradient + prediction + loss eval) + if outside_tree > 0: + n_rounds = len(self._tree_timers) + phases["grad_pred_loss"] = { + "total_s": round(outside_tree, 6), + "pct": round(100 * outside_tree / total_time, 2) if total_time > 0 else 0, + "calls": n_rounds, + "mean_s": round(outside_tree / n_rounds, 6) if n_rounds > 0 else 0, + } + # Other: time outside the training loop entirely (setup, teardown) + accounted = fit_tree_total + outside_tree + other_time = max(0.0, total_time - accounted) + if other_time > 0.001: + phases["other"] = { + "total_s": round(other_time, 6), + "pct": round(100 * other_time / total_time, 2) if total_time > 0 else 0, + "calls": None, + "mean_s": None, + } + + # Bottlenecks: top 3 phases by pct (excluding "other") + ranked = sorted( + [(name, data) for name, data in phases.items() if name != "other"], + 
key=lambda x: x[1]["pct"], + reverse=True, + ) + bottlenecks = [] + for rank, (phase, data) in enumerate(ranked[:3], 1): + target, rec = PHASE_RECOMMENDATIONS.get(phase, ("unknown", "investigate")) + bottlenecks.append({ + "rank": rank, + "phase": phase, + "pct": data["pct"], + "target": target, + "recommendation": rec, + }) + + # Dataset / model info + model = state.model + n_trees_actual = len(getattr(model, "trees_", [])) + dataset_info = { + "n_samples": (model.X_binned_.n_samples + if getattr(model, "X_binned_", None) else None), + "n_features": getattr(model, "n_features_in_", None), + "n_trees": n_trees_actual, + "max_depth": getattr(model, "max_depth", None), + "learning_rate": getattr(model, "learning_rate", None), + "loss": str(getattr(model, "loss", None)), + "backend": "cuda" if self._use_cuda else "cpu", + } + + report = { + "version": "1.0", + "timestamp": datetime.now(timezone.utc).isoformat(), + "git_sha": _get_git_sha(), + "hardware": _collect_hardware_info(), + "dataset": dataset_info, + "total_time_s": round(total_time, 6), + "phases": phases, + "per_tree": self._tree_timers, + "bottlenecks": bottlenecks, + } + + # Comparison with previous run + self.output_dir.mkdir(parents=True, exist_ok=True) + if self.compare_last: + comparison = self._compare_with_previous(report) + if comparison: + report["comparison"] = comparison + + # Write report + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + self.report_path = self.output_dir / f"profile_{ts}.json" + with open(self.report_path, "w") as f: + json.dump(report, f, indent=2) + + self.report = report + + # ----- comparison ----- + + def _compare_with_previous(self, current: dict) -> dict | None: + existing = sorted(self.output_dir.glob("profile_*.json")) + if not existing: + return None + prev_path = existing[-1] + try: + with open(prev_path) as f: + prev = json.load(f) + except (json.JSONDecodeError, OSError): + return None + + prev_total = prev.get("total_time_s", 0) + cur_total = 
current["total_time_s"] + + comparison: dict[str, Any] = { + "previous_run": str(prev_path), + "delta_total_pct": _pct_delta(prev_total, cur_total), + "phase_deltas": {}, + } + for phase in current["phases"]: + if phase in prev.get("phases", {}): + prev_s = prev["phases"][phase]["total_s"] + cur_s = current["phases"][phase]["total_s"] + comparison["phase_deltas"][phase] = { + "previous_s": prev_s, + "current_s": cur_s, + "delta_pct": _pct_delta(prev_s, cur_s), + } + return comparison + + +def _pct_delta(old: float, new: float) -> float: + if old == 0: + return 0.0 + return round(100 * (new - old) / old, 2) + + +# ============================================================================= +# Summary printer (machine-readable for improvement loops) +# ============================================================================= + +def print_profile_summary(report: dict) -> None: + """Print a machine-readable summary of a profile report.""" + print("=== PROFILE SUMMARY ===") + print(f"TOTAL: {report['total_time_s']:.2f}s") + print(f"BACKEND: {report['dataset'].get('backend', 'unknown')}") + + if report.get("bottlenecks"): + top = report["bottlenecks"][0] + print(f"TOP BOTTLENECK: {top['phase']} ({top['pct']}%)") + print(f"TARGET: {top['target']}") + print(f"RECOMMENDATION: {top['recommendation']}") + + if report.get("comparison"): + comp = report["comparison"] + delta = comp["delta_total_pct"] + sign = "+" if delta > 0 else "" + print(f"DELTA vs PREVIOUS: {sign}{delta}% total") + for phase, pd in comp.get("phase_deltas", {}).items(): + if abs(pd["delta_pct"]) >= 5: + s = "+" if pd["delta_pct"] > 0 else "" + print(f" {phase}: {s}{pd['delta_pct']}%") + else: + print("DELTA vs PREVIOUS: (no previous run)") + + print(f"REPORT: {report.get('_path', 'N/A')}") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..13bf9a4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,188 @@ +"""Shared test fixtures for OpenBoost. 
+ +Centralizes dataset generation, pre-binned arrays, and gradient fixtures +to eliminate duplication across test files. All data fixtures use explicit +RandomState objects (not np.random.seed) to avoid cross-test contamination. +""" + +import numpy as np +import pytest + +import openboost as ob + +# ============================================================================= +# CUDA detection and auto-skip +# ============================================================================= + +try: + from numba import cuda + CUDA_AVAILABLE = cuda.is_available() +except Exception: + CUDA_AVAILABLE = False + + +def pytest_collection_modifyitems(config, items): + """Auto-skip GPU and parity tests when CUDA is unavailable.""" + if not CUDA_AVAILABLE: + skip_gpu = pytest.mark.skip(reason="CUDA not available") + for item in items: + if "gpu" in item.keywords or "parity" in item.keywords: + item.add_marker(skip_gpu) + + +# ============================================================================= +# Regression datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def regression_100x5(): + """Small regression dataset: 100 samples, 5 features, linear target. + + y = X[:,0] + 0.5 * X[:,1] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] + rng.randn(100).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def regression_200x10(): + """Medium regression dataset: 200 samples, 10 features, linear target. + + y = X[:,0] + 0.5 * X[:,1] - 0.3 * X[:,2] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(200, 10).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + rng.randn(200).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def regression_500x10(): + """Larger regression dataset: 500 samples, 10 features. 
+ + y = X[:,0] + 0.5 * X[:,1] - 0.3 * X[:,2] + noise(0.1) + """ + rng = np.random.RandomState(42) + X = rng.randn(500, 10).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + rng.randn(500).astype(np.float32) * 0.1).astype(np.float32) + return X, y + + +# ============================================================================= +# Classification datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def binary_500x10(): + """Binary classification dataset: 500 samples, 10 features. + + Labels derived from a linear boundary on first two features. + """ + rng = np.random.RandomState(42) + X = rng.randn(500, 10).astype(np.float32) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (logits > 0).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def multiclass_300x5(): + """3-class classification dataset: 300 samples, 5 features.""" + rng = np.random.RandomState(42) + X = rng.randn(300, 5).astype(np.float32) + # 3 classes based on which of 3 linear combos is largest + scores = np.column_stack([X[:, 0], X[:, 1], X[:, 2]]) + y = scores.argmax(axis=1).astype(np.float32) + return X, y + + +# ============================================================================= +# Specialized datasets +# ============================================================================= + +@pytest.fixture(scope="session") +def count_data_200x5(): + """Poisson count data: 200 samples, 5 features. + + y ~ Poisson(exp(0.5 * X[:,0] + 0.3 * X[:,1])) + """ + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + rate = np.exp(0.5 * X[:, 0] + 0.3 * X[:, 1]) + y = rng.poisson(rate).astype(np.float32) + return X, y + + +@pytest.fixture(scope="session") +def positive_continuous_200x5(): + """Positive continuous data for Gamma/Tweedie: 200 samples, 5 features. 
+
+    y = exp(0.5 * X[:,0] + 0.3 * X[:,1]) + noise
+    """
+    rng = np.random.RandomState(42)
+    X = rng.randn(200, 5).astype(np.float32)
+    y = (np.exp(0.5 * X[:, 0] + 0.3 * X[:, 1]) + rng.exponential(0.1, 200)).astype(np.float32)
+    return X, y
+
+
+# =============================================================================
+# Pre-binned datasets
+# =============================================================================
+
+@pytest.fixture(scope="session")
+def binned_100x5(regression_100x5):
+    """Pre-binned version of regression_100x5."""
+    X, y = regression_100x5
+    return ob.array(X), y
+
+
+@pytest.fixture(scope="session")
+def binned_200x10(regression_200x10):
+    """Pre-binned version of regression_200x10."""
+    X, y = regression_200x10
+    return ob.array(X), y
+
+
+# =============================================================================
+# Gradient/hessian fixtures
+# =============================================================================
+
+@pytest.fixture
+def mse_grads_100(regression_100x5):
+    """MSE gradients from zero predictions for regression_100x5.
+
+    Returns (grad, hess) with grad = 2*(0-y) = -2y, hess = 2.
+    """
+    _, y = regression_100x5
+    pred = np.zeros(100, dtype=np.float32)
+    grad = (2 * (pred - y)).astype(np.float32)
+    hess = np.ones(100, dtype=np.float32) * 2
+    return grad, hess
+
+
+@pytest.fixture
+def mse_grads_200(regression_200x10):
+    """MSE gradients from zero predictions for regression_200x10."""
+    _, y = regression_200x10
+    pred = np.zeros(200, dtype=np.float32)
+    grad = (2 * (pred - y)).astype(np.float32)
+    hess = np.ones(200, dtype=np.float32) * 2
+    return grad, hess
+
+
+# =============================================================================
+# Pre-fitted model fixtures
+# =============================================================================
+
+@pytest.fixture
+def fitted_regressor(regression_500x10):
+    """Pre-fitted GradientBoosting regressor (20 trees, max_depth=4).
+ + Function-scoped: fresh model for each test. + """ + X, y = regression_500x10 + model = ob.GradientBoosting(n_trees=20, max_depth=4, learning_rate=0.1) + model.fit(X, y) + return model, X, y diff --git a/tests/test_binning_correctness.py b/tests/test_binning_correctness.py new file mode 100644 index 0000000..b116338 --- /dev/null +++ b/tests/test_binning_correctness.py @@ -0,0 +1,170 @@ +"""BinnedArray correctness tests for OpenBoost. + +Verifies that data binning (quantization to uint8) is correct, +consistent between fit and transform, and handles edge cases. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestBinningConsistency: + """Verify that binning is consistent between fit and transform.""" + + def test_transform_matches_training_bins(self): + """Re-binning training data with transform should reproduce original bins.""" + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + + binned = ob.array(X) + # Transform the same data using training bin edges + re_binned = binned.transform(X) + + np.testing.assert_array_equal( + binned.data, re_binned.data, + err_msg="Re-binning training data should produce identical bins" + ) + + def test_transform_preserves_shape(self): + """Transform output should have correct shape.""" + rng = np.random.RandomState(42) + X_train = rng.randn(100, 5).astype(np.float32) + X_test = rng.randn(50, 5).astype(np.float32) + + binned = ob.array(X_train) + test_binned = binned.transform(X_test) + + assert test_binned.n_samples == 50 + assert test_binned.n_features == 5 + assert test_binned.data.shape == (5, 50) + assert test_binned.data.dtype == np.uint8 + + def test_transform_out_of_range_values(self): + """Values outside training range should be clipped to valid bins.""" + X_train = np.array([[0.0], [1.0], [2.0], [3.0]], dtype=np.float32) + X_test = np.array([[-10.0], [10.0]], dtype=np.float32) + + binned = ob.array(X_train) + test_binned = binned.transform(X_test) + + # Should be 
valid bin values (not 255 since no NaN) + assert np.all(test_binned.data < 255), "Out-of-range values should not be missing bin" + assert np.all(test_binned.data >= 0), "Bins should be non-negative" + + +class TestBinEdges: + """Verify bin edge properties.""" + + def test_bin_edges_monotonic(self): + """Bin edges should be strictly increasing per feature.""" + rng = np.random.RandomState(42) + X = rng.randn(500, 5).astype(np.float32) + binned = ob.array(X) + + for feat_idx, edges in enumerate(binned.bin_edges): + edges_arr = np.array(edges) + if len(edges_arr) > 1: + diffs = np.diff(edges_arr) + assert np.all(diffs > 0), ( + f"Feature {feat_idx}: bin edges not strictly increasing" + ) + + def test_bin_count_respects_max(self): + """Number of bins should not exceed 254 (bin 255 is reserved).""" + rng = np.random.RandomState(42) + X = rng.randn(1000, 3).astype(np.float32) + binned = ob.array(X) + + # No sample should have bin 255 (no NaN in this data) + assert np.max(binned.data) < 255, "Max bin should be < 255 when no NaN" + + +class TestMissingValues: + """Verify NaN handling in binning.""" + + def test_nan_maps_to_missing_bin(self): + """NaN values should be binned as 255 (MISSING_BIN).""" + X = np.array([ + [1.0, np.nan], + [2.0, 3.0], + [np.nan, 4.0], + ], dtype=np.float32) + + binned = ob.array(X) + + # Feature 0, sample 2 should be 255 + assert binned.data[0, 2] == 255, "NaN in feature 0, sample 2 should be bin 255" + # Feature 1, sample 0 should be 255 + assert binned.data[1, 0] == 255, "NaN in feature 1, sample 0 should be bin 255" + # Non-NaN values should not be 255 + assert binned.data[0, 0] != 255, "Non-NaN value should not be bin 255" + assert binned.data[0, 1] != 255, "Non-NaN value should not be bin 255" + + def test_no_nan_means_no_missing_bin(self): + """Without NaN, no sample should land in bin 255.""" + rng = np.random.RandomState(42) + X = rng.randn(200, 5).astype(np.float32) + + binned = ob.array(X) + + assert np.all(binned.data != 255), "No bin 
255 when no NaN in data" + + def test_all_nan_feature(self): + """A feature that is all NaN should have all samples in bin 255.""" + X = np.array([ + [1.0, np.nan], + [2.0, np.nan], + [3.0, np.nan], + ], dtype=np.float32) + + binned = ob.array(X) + + assert np.all(binned.data[1, :] == 255), "All-NaN feature should be all bin 255" + + +class TestBinningEdgeCases: + """Edge cases for binning.""" + + def test_constant_feature(self): + """Constant feature should produce valid binning.""" + X = np.ones((50, 2), dtype=np.float32) + X[:, 1] = np.arange(50, dtype=np.float32) # Feature 1 varies + + binned = ob.array(X) + + # Feature 0 (constant) should have all samples in the same bin + unique_bins = np.unique(binned.data[0, :]) + assert len(unique_bins) == 1, f"Constant feature should have 1 bin, got {len(unique_bins)}" + + def test_two_unique_values(self): + """Two distinct values should produce two bins.""" + X = np.array([[0.0], [0.0], [1.0], [1.0]], dtype=np.float32) + + binned = ob.array(X) + + unique_bins = np.unique(binned.data[0, :]) + assert len(unique_bins) == 2, f"Two values should produce 2 bins, got {len(unique_bins)}" + + def test_very_large_values(self): + """Large values should not cause overflow.""" + X = np.array([[1e10, -1e10], [1e15, -1e15]], dtype=np.float32) + + binned = ob.array(X) + + assert binned.data.dtype == np.uint8 + assert np.all(np.isfinite(binned.data.astype(float))) + + def test_single_sample(self): + """Single sample should bin correctly.""" + X = np.array([[1.0, 2.0, 3.0]], dtype=np.float32) + + binned = ob.array(X) + + assert binned.n_samples == 1 + assert binned.n_features == 3 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py new file mode 100644 index 0000000..955b4d7 --- /dev/null +++ b/tests/test_callbacks.py @@ -0,0 +1,193 @@ +"""Tests for the callback system. 
+ +Verifies EarlyStopping, Logger, ModelCheckpoint, LearningRateScheduler, +HistoryCallback, and custom callbacks work correctly. +""" + +import pytest + +import openboost as ob + + +class TestEarlyStopping: + """Tests for EarlyStopping callback.""" + + def test_stops_when_val_loss_plateaus(self, regression_500x10): + """Training should stop before n_trees when loss plateaus.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5) + model = ob.GradientBoosting(n_trees=1000, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es], eval_set=[(X_val, y_val)]) + + # Should have stopped early (well before 1000 trees) + assert len(model.trees_) < 1000, ( + f"Should stop early but trained all {len(model.trees_)} trees" + ) + + def test_patience_respected(self, regression_500x10): + """With higher patience, training should run longer.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es_short = ob.EarlyStopping(patience=3) + model_short = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model_short.fit(X, y, callbacks=[es_short], eval_set=[(X_val, y_val)]) + + es_long = ob.EarlyStopping(patience=20) + model_long = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model_long.fit(X, y, callbacks=[es_long], eval_set=[(X_val, y_val)]) + + # Longer patience should train at least as many trees + assert len(model_long.trees_) >= len(model_short.trees_) + + def test_restore_best(self, regression_500x10): + """With restore_best=True, model should use best iteration's trees.""" + X, y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5, restore_best=True) + model = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es], eval_set=[(X_val, y_val)]) + + # If early stopping fired, best_iteration should be set + if hasattr(model, 'best_iteration_') and model.best_iteration_ is not None: + assert 
len(model.trees_) <= model.best_iteration_ + 1 + 5 + + +class TestHistoryCallback: + """Tests for HistoryCallback.""" + + def test_records_train_loss(self, regression_200x10): + """Should record training loss each round.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history]) + + assert 'train_loss' in history.history + assert len(history.history['train_loss']) == 10 + + def test_records_val_loss(self, regression_200x10): + """Should record validation loss when eval_set provided.""" + X, y = regression_200x10 + X_val, y_val = X[:50], y[:50] + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history], eval_set=[(X_val, y_val)]) + + assert 'val_loss' in history.history + assert len(history.history['val_loss']) == 10 + + def test_train_loss_decreases(self, regression_200x10): + """Recorded training loss should generally decrease.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + model = ob.GradientBoosting(n_trees=20, max_depth=3, learning_rate=0.1) + model.fit(X, y, callbacks=[history]) + + losses = history.history['train_loss'] + # First loss should be > last loss + assert losses[-1] < losses[0], ( + f"Training loss should decrease: first={losses[0]:.4f}, last={losses[-1]:.4f}" + ) + + +class TestLoggerCallback: + """Tests for Logger callback.""" + + def test_logger_does_not_crash(self, regression_100x5, capsys): + """Logger should print without crashing.""" + X, y = regression_100x5 + + logger = ob.Logger(period=5) + model = ob.GradientBoosting(n_trees=10, max_depth=2) + model.fit(X, y, callbacks=[logger]) + + # Just verify it didn't crash — output format may vary + + +class TestMultipleCallbacks: + """Tests for using multiple callbacks together.""" + + def test_early_stopping_and_history(self, regression_500x10): + """EarlyStopping + HistoryCallback should work together.""" + X, 
y = regression_500x10 + X_val, y_val = X[:100], y[:100] + + es = ob.EarlyStopping(patience=5) + history = ob.HistoryCallback() + + model = ob.GradientBoosting(n_trees=500, max_depth=4, learning_rate=0.3) + model.fit(X, y, callbacks=[es, history], eval_set=[(X_val, y_val)]) + + # History should only have as many entries as rounds trained + n_trained = len(model.trees_) + assert len(history.history['train_loss']) == n_trained + + def test_all_callbacks_together(self, regression_200x10): + """Multiple callbacks should all receive events.""" + X, y = regression_200x10 + + history = ob.HistoryCallback() + logger = ob.Logger(period=100) # Don't spam output + + model = ob.GradientBoosting(n_trees=10, max_depth=3) + model.fit(X, y, callbacks=[history, logger]) + + assert len(history.history['train_loss']) == 10 + + +class TestCustomCallback: + """Tests for custom callback classes.""" + + def test_custom_callback_receives_events(self, regression_100x5): + """Custom callback should receive on_train_begin and on_round_end.""" + class EventTracker(ob.Callback): + def __init__(self): + self.began = False + self.round_count = 0 + self.ended = False + + def on_train_begin(self, state): + self.began = True + + def on_round_end(self, state): + self.round_count += 1 + return True + + def on_train_end(self, state): + self.ended = True + + X, y = regression_100x5 + tracker = EventTracker() + + model = ob.GradientBoosting(n_trees=5, max_depth=2) + model.fit(X, y, callbacks=[tracker]) + + assert tracker.began, "on_train_begin should be called" + assert tracker.round_count == 5, f"on_round_end called {tracker.round_count} times, expected 5" + assert tracker.ended, "on_train_end should be called" + + def test_custom_callback_can_stop_training(self, regression_100x5): + """Custom callback returning False should stop training.""" + class StopAtThree(ob.Callback): + def on_round_end(self, state): + return state.round_idx < 2 # Stop after 3 rounds (0, 1, 2) + + X, y = regression_100x5 + + 
model = ob.GradientBoosting(n_trees=100, max_depth=2) + model.fit(X, y, callbacks=[StopAtThree()]) + + assert len(model.trees_) <= 3, ( + f"Should stop at 3 trees, got {len(model.trees_)}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_core.py b/tests/test_core.py index 23ab40e..a7ce7d6 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,7 @@ """Core tests for OpenBoost. These tests run on CPU (for Mac development) and verify basic functionality. +Uses shared fixtures from conftest.py for data generation. """ import numpy as np @@ -11,125 +12,110 @@ class TestArray: """Tests for ob.array() and binning.""" - - def test_basic_binning(self): + + def test_basic_binning(self, regression_100x5): """Test that array() bins data correctly.""" - np.random.seed(42) - X = np.random.randn(100, 5) - + X, _ = regression_100x5 binned = ob.array(X, n_bins=256) - + assert binned.n_samples == 100 assert binned.n_features == 5 assert binned.data.shape == (5, 100) # Feature-major assert binned.data.dtype == np.uint8 - + def test_bin_range(self): """Test that bin values are in valid range.""" - X = np.random.randn(1000, 10) + rng = np.random.RandomState(99) + X = rng.randn(1000, 10) binned = ob.array(X) - + assert binned.data.min() >= 0 assert binned.data.max() <= 255 - + def test_bin_edges_stored(self): """Test that bin edges are stored for inverse transform.""" - X = np.random.randn(100, 3) + rng = np.random.RandomState(99) + X = rng.randn(100, 3) binned = ob.array(X) - + assert len(binned.bin_edges) == 3 for edges in binned.bin_edges: assert len(edges) > 0 # At least some bin edges - + def test_n_bins_capped_at_255(self): - """Test that n_bins > 255 is capped (Phase 14: bin 255 reserved for NaN).""" - X = np.random.randn(10, 2) - - # n_bins > 255 should be silently capped to 255 - # (bin 255 is reserved for missing values) + """Test that n_bins > 255 is capped (bin 255 reserved for NaN).""" + rng = np.random.RandomState(99) 
+ X = rng.randn(10, 2) + binned = ob.array(X, n_bins=300) - + # Data should not contain bin 255 (no NaN in this data) assert np.max(binned.data) < 255 - + def test_invalid_shape(self): """Test that 1D input raises error.""" - X = np.random.randn(100) - + rng = np.random.RandomState(99) + X = rng.randn(100) + with pytest.raises(ValueError, match="must be 2D"): ob.array(X) class TestFitTree: """Tests for ob.fit_tree().""" - - def test_basic_fit(self): + + def test_basic_fit(self, binned_100x5, mse_grads_100): """Test basic tree fitting.""" - np.random.seed(42) - X = np.random.randn(100, 5) - y = X[:, 0] + 0.5 * X[:, 1] # Simple linear target - - binned = ob.array(X) - - # MSE gradients - pred = np.zeros(100, dtype=np.float32) - grad = (2 * (pred - y)).astype(np.float32) - hess = np.ones(100, dtype=np.float32) * 2 - + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + tree = ob.fit_tree(binned, grad, hess, max_depth=3) - + assert tree.n_nodes > 0 assert tree.depth <= 3 - - def test_tree_prediction(self): + + def test_tree_prediction(self, regression_100x5): """Test that tree predictions work.""" - np.random.seed(42) - X = np.random.randn(100, 5) - y = X[:, 0] # Simple target - + X, y = regression_100x5 binned = ob.array(X) - + pred = np.zeros(100, dtype=np.float32) grad = (2 * (pred - y)).astype(np.float32) hess = np.ones(100, dtype=np.float32) * 2 - + tree = ob.fit_tree(binned, grad, hess, max_depth=4) predictions = tree(binned) - + assert predictions.shape == (100,) assert predictions.dtype == np.float32 - - def test_training_reduces_loss(self): + + def test_training_reduces_loss(self, regression_200x10): """Test that multiple rounds reduce loss.""" - np.random.seed(42) - X = np.random.randn(200, 10) - y = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] - y = y.astype(np.float32) - + X, y = regression_200x10 binned = ob.array(X) pred = np.zeros(200, dtype=np.float32) - + initial_loss = np.mean((pred - y) ** 2) - - # Train for a few rounds + for _ in range(10): grad = 
(2 * (pred - y)).astype(np.float32) hess = np.ones(200, dtype=np.float32) * 2 tree = ob.fit_tree(binned, grad, hess, max_depth=4) pred = pred + 0.3 * tree(binned) - + final_loss = np.mean((pred - y) ** 2) - + assert final_loss < initial_loss, f"Loss should decrease: {final_loss} < {initial_loss}" - + def test_max_depth_respected(self): """Test that max_depth is respected.""" - X = np.random.randn(100, 5) + rng = np.random.RandomState(99) + X = rng.randn(100, 5) binned = ob.array(X) - - grad = np.random.randn(100).astype(np.float32) + + grad = rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) - + for depth in [1, 2, 3, 5]: tree = ob.fit_tree(binned, grad, hess, max_depth=depth) assert tree.depth <= depth, f"Tree depth {tree.depth} > max_depth {depth}" @@ -137,12 +123,12 @@ def test_max_depth_respected(self): class TestBackend: """Tests for backend detection and dispatch.""" - + def test_get_backend(self): """Test that get_backend returns valid value.""" backend = ob.get_backend() assert backend in ("cuda", "cpu") - + def test_set_backend_cpu(self): """Test forcing CPU backend.""" original = ob.get_backend() @@ -152,13 +138,11 @@ def test_set_backend_cpu(self): assert ob.is_cpu() assert not ob.is_cuda() finally: - # Restore if original == "cuda": - try: + import contextlib + with contextlib.suppress(RuntimeError): ob.set_backend("cuda") - except RuntimeError: - pass # CUDA not available - + def test_invalid_backend(self): """Test that invalid backend raises error.""" with pytest.raises(ValueError, match="must be 'cuda' or 'cpu'"): @@ -167,122 +151,115 @@ def test_invalid_backend(self): class TestEdgeCases: """Tests for edge cases.""" - + def test_constant_feature(self): """Test handling of constant features.""" - X = np.random.randn(100, 3) + rng = np.random.RandomState(99) + X = rng.randn(100, 3) X[:, 1] = 5.0 # Constant feature - + binned = ob.array(X) - - # Should still work - grad = np.random.randn(100).astype(np.float32) + + grad = 
rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) tree = ob.fit_tree(binned, grad, hess, max_depth=3) - + assert tree.n_nodes > 0 - + def test_small_dataset(self): """Test with very small dataset.""" - X = np.random.randn(10, 2) + rng = np.random.RandomState(99) + X = rng.randn(10, 2) binned = ob.array(X) - - grad = np.random.randn(10).astype(np.float32) + + grad = rng.randn(10).astype(np.float32) hess = np.ones(10, dtype=np.float32) - + tree = ob.fit_tree(binned, grad, hess, max_depth=2) pred = tree(binned) - + assert pred.shape == (10,) - + def test_single_feature(self): """Test with single feature.""" - X = np.random.randn(100, 1) + rng = np.random.RandomState(99) + X = rng.randn(100, 1) binned = ob.array(X) - - grad = np.random.randn(100).astype(np.float32) + + grad = rng.randn(100).astype(np.float32) hess = np.ones(100, dtype=np.float32) - + tree = ob.fit_tree(binned, grad, hess, max_depth=3) pred = tree(binned) - + assert pred.shape == (100,) class TestGradientBoosting: """Tests for high-level GradientBoosting API.""" - + def test_import(self): """Test that openboost can be imported.""" assert hasattr(ob, "GradientBoosting") assert ob.__version__ == "1.0.0rc1" - - def test_fit_predict_small(self): + + def test_fit_predict_small(self, regression_100x5): """Test fit/predict on small data.""" - np.random.seed(42) - X = np.random.randn(100, 5).astype(np.float32) - y = np.random.randn(100).astype(np.float32) - + X, y = regression_100x5 + model = ob.GradientBoosting(n_trees=3, max_depth=3) model.fit(X, y) - + pred = model.predict(X) - + assert pred.shape == y.shape assert pred.dtype == np.float32 - - def test_mse_decreases(self): + + def test_mse_decreases(self, regression_200x10): """Test that MSE decreases with more trees.""" - np.random.seed(42) - X = np.random.randn(200, 10).astype(np.float32) - y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(200).astype(np.float32) * 0.1 - + X, y = regression_200x10 + model_few = 
ob.GradientBoosting(n_trees=2, max_depth=3) model_few.fit(X, y) mse_few = np.mean((model_few.predict(X) - y) ** 2) - + model_many = ob.GradientBoosting(n_trees=10, max_depth=3) model_many.fit(X, y) mse_many = np.mean((model_many.predict(X) - y) ** 2) - + assert mse_many < mse_few, f"More trees should reduce MSE: {mse_many} >= {mse_few}" - - def test_learning_rate_effect(self): + + def test_learning_rate_effect(self, regression_100x5): """Test that lower learning rate requires more trees.""" - np.random.seed(42) - X = np.random.randn(100, 5).astype(np.float32) - y = X[:, 0] * 2 + np.random.randn(100).astype(np.float32) * 0.1 - - # High LR, few trees + X, y = regression_100x5 + model_high_lr = ob.GradientBoosting(n_trees=5, max_depth=3, learning_rate=0.5) model_high_lr.fit(X, y) pred_high = model_high_lr.predict(X) - - # Low LR, few trees - should fit worse + model_low_lr = ob.GradientBoosting(n_trees=5, max_depth=3, learning_rate=0.01) model_low_lr.fit(X, y) pred_low = model_low_lr.predict(X) - + mse_high = np.mean((pred_high - y) ** 2) mse_low = np.mean((pred_low - y) ** 2) - - # With same trees, high LR should fit better (for small n_trees) + assert mse_high < mse_low - + def test_deterministic(self): """Test that results are deterministic.""" - np.random.seed(42) - X = np.random.randn(50, 3).astype(np.float32) - y = np.random.randn(50).astype(np.float32) - + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + model1 = ob.GradientBoosting(n_trees=3, max_depth=2) model1.fit(X, y) pred1 = model1.predict(X) - + model2 = ob.GradientBoosting(n_trees=3, max_depth=2) model2.fit(X, y) pred2 = model2.predict(X) - + np.testing.assert_array_equal(pred1, pred2) diff --git a/tests/test_gam.py b/tests/test_gam.py new file mode 100644 index 0000000..a9fe1e6 --- /dev/null +++ b/tests/test_gam.py @@ -0,0 +1,192 @@ +"""Tests for OpenBoostGAM model. 
+ +Verifies that the GPU-accelerated Generalized Additive Model works +correctly on CPU. These are the first CPU tests for this model variant. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestGAMBasic: + """Basic functionality tests.""" + + def test_basic_fit_predict(self, regression_200x10): + """Fit and predict should produce correct shapes.""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=50, learning_rate=0.05, reg_lambda=1.0) + gam.fit(X, y) + pred = gam.predict(X) + + assert pred.shape == y.shape + assert pred.dtype == np.float32 + assert np.all(np.isfinite(pred)) + + def test_shape_values_shape(self, regression_200x10): + """shape_values_ should be (n_features, 256).""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam.fit(X, y) + + assert gam.shape_values_ is not None + assert gam.shape_values_.shape == (10, 256), ( + f"Expected shape (10, 256), got {gam.shape_values_.shape}" + ) + + def test_training_reduces_loss(self, regression_200x10): + """Training should reduce loss compared to baseline.""" + X, y = regression_200x10 + + gam = ob.OpenBoostGAM(n_rounds=100, learning_rate=0.05) + gam.fit(X, y) + pred = gam.predict(X) + + mse = np.mean((pred - y) ** 2) + baseline_mse = np.var(y) + + assert mse < baseline_mse * 0.5, ( + f"GAM MSE ({mse:.4f}) should be well below baseline ({baseline_mse:.4f})" + ) + + def test_deterministic(self, regression_100x5): + """Same input should produce identical output.""" + X, y = regression_100x5 + + gam1 = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam1.fit(X, y) + pred1 = gam1.predict(X) + + gam2 = ob.OpenBoostGAM(n_rounds=20, learning_rate=0.05) + gam2.fit(X, y) + pred2 = gam2.predict(X) + + np.testing.assert_array_equal(pred1, pred2) + + +class TestGAMInterpretability: + """Verify GAM interpretability properties.""" + + def test_shape_functions_capture_correct_features(self): + """When y = f(X[:,0]), feature 0's shape 
function should be most active.""" + rng = np.random.RandomState(42) + X = rng.randn(300, 5).astype(np.float32) + y = np.sin(X[:, 0]).astype(np.float32) + + gam = ob.OpenBoostGAM(n_rounds=500, learning_rate=0.03) + gam.fit(X, y) + + # Feature 0 should have the largest shape function range + ranges = [np.ptp(gam.shape_values_[f]) for f in range(5)] + assert np.argmax(ranges) == 0, ( + f"Feature 0 should have largest range but ranges are: {ranges}" + ) + + def test_additive_prediction_structure(self, regression_100x5): + """Predictions should be sum of shape functions + base score.""" + X, y = regression_100x5 + + gam = ob.OpenBoostGAM(n_rounds=30, learning_rate=0.05) + gam.fit(X, y) + + # Get predictions the normal way + pred_normal = gam.predict(X) + + # Manually compute from shape functions + binned = gam.X_binned_ + binned_data = binned.data + if hasattr(binned_data, 'copy_to_host'): + binned_data = binned_data.copy_to_host() + binned_data = np.asarray(binned_data) + + base = getattr(gam, 'base_score_', np.float32(0.0)) + pred_manual = np.full(len(y), base, dtype=np.float32) + for f in range(X.shape[1]): + pred_manual += gam.shape_values_[f, binned_data[f, :]] + + np.testing.assert_allclose(pred_normal, pred_manual, atol=1e-5) + + +class TestGAMClassification: + """GAM with classification loss.""" + + def test_logloss(self, binary_500x10): + """GAM should work with logloss for binary classification.""" + X, y = binary_500x10 + + gam = ob.OpenBoostGAM(n_rounds=100, learning_rate=0.05, loss='logloss') + gam.fit(X, y) + pred_raw = gam.predict(X) + + # Convert to probabilities + prob = 1.0 / (1.0 + np.exp(-pred_raw)) + labels = (prob > 0.5).astype(float) + accuracy = np.mean(labels == y) + + assert accuracy > 0.70, f"GAM classification accuracy {accuracy:.3f} < 0.70" + + +class TestGAMEdgeCases: + """Edge cases for OpenBoostGAM.""" + + def test_predict_before_fit_raises(self): + """Predict on unfitted model should raise.""" + gam = ob.OpenBoostGAM(n_rounds=10) + rng 
= np.random.RandomState(42) + X = rng.randn(10, 3).astype(np.float32) + + with pytest.raises(RuntimeError, match="not fitted"): + gam.predict(X) + + def test_single_round(self): + """Should work with a single boosting round.""" + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + + gam = ob.OpenBoostGAM(n_rounds=1, learning_rate=0.1) + gam.fit(X, y) + pred = gam.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + def test_constant_target(self): + """GAM with constant target should predict that constant.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 3).astype(np.float32) + y = np.full(100, 2.5, dtype=np.float32) + + gam = ob.OpenBoostGAM(n_rounds=50, learning_rate=0.1) + gam.fit(X, y) + pred = gam.predict(X) + + np.testing.assert_allclose(pred, 2.5, atol=0.2, + err_msg="Should converge to constant target") + + +class TestGAMPersistence: + """Save/load functionality.""" + + def test_save_load_roundtrip(self, regression_100x5, tmp_path): + """Predictions should match after save/load.""" + X, y = regression_100x5 + + gam = ob.OpenBoostGAM(n_rounds=10, learning_rate=0.05) + gam.fit(X, y) + pred_before = gam.predict(X) + + path = str(tmp_path / "gam_model.json") + gam.save(path) + + loaded = ob.OpenBoostGAM.load(path) + pred_after = loaded.predict(X) + + np.testing.assert_array_equal(pred_before, pred_after) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_kernel_correctness.py b/tests/test_kernel_correctness.py new file mode 100644 index 0000000..8c7ea85 --- /dev/null +++ b/tests/test_kernel_correctness.py @@ -0,0 +1,475 @@ +"""Kernel-level correctness tests for OpenBoost. + +Verifies that the lowest-level computational kernels (histograms, split finding, +leaf values) produce correct results against hand-computed reference values. +These tests catch bugs in the core algorithms that affect all models. 
+""" + +import numpy as np +import pytest + +import openboost as ob +from openboost._core._split import compute_leaf_value, find_best_split + +# ============================================================================= +# Histogram Correctness +# ============================================================================= + + +class TestHistogramCorrectness: + """Verify histogram building produces correct aggregations.""" + + def test_histogram_sum_equals_gradient_sum(self, binned_100x5, mse_grads_100): + """Sum of histogram gradients must equal sum of input gradients.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + # Sum across all bins for each feature should equal total gradient + for feat in range(5): + feat_grad_sum = np.sum(hist.hist_grad[feat, :]) + feat_hess_sum = np.sum(hist.hist_hess[feat, :]) + np.testing.assert_almost_equal( + feat_grad_sum, np.sum(grad), decimal=4, + err_msg=f"Feature {feat}: hist grad sum != input grad sum" + ) + np.testing.assert_almost_equal( + feat_hess_sum, np.sum(hess), decimal=4, + err_msg=f"Feature {feat}: hist hess sum != input hess sum" + ) + + def test_histogram_per_bin_counts(self): + """Hand-crafted data: verify each bin's grad/hess matches manual sum.""" + # Create data where we know exactly which sample goes to which bin + # 10 samples, 2 features, carefully crafted to land in known bins + rng = np.random.RandomState(42) + n_samples = 20 + X = rng.randn(n_samples, 2).astype(np.float32) + binned = ob.array(X) + + # Known gradients + grad = np.arange(n_samples, dtype=np.float32) + hess = np.ones(n_samples, dtype=np.float32) + + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # For each feature, verify that the 
samples in each bin sum correctly + for feat in range(2): + bin_values = binned.data[feat, :] # bin assignment for each sample + for b in range(256): + mask = bin_values == b + expected_grad = np.sum(grad[mask]) + expected_hess = np.sum(hess[mask]) + np.testing.assert_almost_equal( + hist.hist_grad[feat, b], expected_grad, decimal=5, + err_msg=f"Feature {feat}, bin {b}: grad mismatch" + ) + np.testing.assert_almost_equal( + hist.hist_hess[feat, b], expected_hess, decimal=5, + err_msg=f"Feature {feat}, bin {b}: hess mismatch" + ) + + def test_histogram_with_missing_bin_isolated(self): + """NaN samples must accumulate only in bin 255 (MISSING_BIN).""" + rng = np.random.RandomState(42) + n_samples = 50 + X = rng.randn(n_samples, 3).astype(np.float32) + # Inject NaN in feature 0 for first 10 samples + X[:10, 0] = np.nan + + binned = ob.array(X) + grad = np.ones(n_samples, dtype=np.float32) + hess = np.ones(n_samples, dtype=np.float32) + + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # For feature 0: bin 255 should have grad=10 (10 NaN samples * grad=1) + np.testing.assert_almost_equal( + hist.hist_grad[0, 255], 10.0, decimal=4, + err_msg="Missing bin should accumulate exactly NaN samples" + ) + np.testing.assert_almost_equal( + hist.hist_hess[0, 255], 10.0, decimal=4, + ) + + # Non-NaN features should have 0 in bin 255 + np.testing.assert_almost_equal( + hist.hist_grad[1, 255], 0.0, decimal=4, + err_msg="Feature without NaN should have 0 in missing bin" + ) + + def test_histogram_constant_feature(self): + """A constant feature should have all samples in a single bin.""" + n_samples = 50 + X = np.zeros((n_samples, 2), dtype=np.float32) + X[:, 0] = 5.0 # Constant + X[:, 1] = np.arange(n_samples, dtype=np.float32) # Varying + + binned = ob.array(X) + grad = np.ones(n_samples, dtype=np.float32) * 3.0 + hess = np.ones(n_samples, dtype=np.float32) 
+ + sample_node_ids = ob.init_sample_node_ids(n_samples) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + + hist = histograms[0] + + # Feature 0 (constant): exactly one bin should have all grad/hess + nonzero_bins = np.sum(hist.hist_hess[0, :] > 0) + assert nonzero_bins == 1, f"Constant feature should have 1 non-zero bin, got {nonzero_bins}" + np.testing.assert_almost_equal( + np.sum(hist.hist_grad[0, :]), 3.0 * n_samples, decimal=4 + ) + + def test_histogram_subtraction(self, binned_100x5, mse_grads_100): + """Parent histogram - left child histogram = right child histogram.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + + # Build parent histogram + parent_hists = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + parent = parent_hists[0] + + # Do a split to create children + splits = ob.find_node_splits(parent_hists) + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + left_id = splits[0].left_child + right_id = splits[0].right_child + + child_hists = ob.build_node_histograms( + binned.data, grad, hess, new_node_ids, [left_id, right_id] + ) + + if left_id in child_hists and right_id in child_hists: + left = child_hists[left_id] + right = child_hists[right_id] + + # Parent = left + right + np.testing.assert_almost_equal( + parent.hist_grad, left.hist_grad + right.hist_grad, + decimal=4, err_msg="Parent grad != left + right" + ) + np.testing.assert_almost_equal( + parent.hist_hess, left.hist_hess + right.hist_hess, + decimal=4, err_msg="Parent hess != left + right" + ) + + +# ============================================================================= +# Split Finding Correctness +# ============================================================================= + + +class TestSplitFindingCorrectness: + """Verify split finding selects the optimal 
split.""" + + def test_split_gain_formula_exact(self): + """Verify split gain matches the formula: left_score + right_score - parent_score.""" + # Construct a histogram with known values + n_features = 2 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Feature 0: bins 0-9 have grad=1, hess=1 each; bins 10-19 have grad=-1, hess=1 + for b in range(10): + hist_grad[0, b] = 1.0 + hist_hess[0, b] = 1.0 + for b in range(10, 20): + hist_grad[0, b] = -1.0 + hist_hess[0, b] = 1.0 + + # Feature 1: spread evenly (poor split) + for b in range(20): + hist_grad[1, b] = 0.0 + hist_hess[1, b] = 1.0 + + total_grad = float(np.sum(hist_grad[0])) # 0.0 + total_hess = float(np.sum(hist_hess[0])) # 20.0 + + reg_lambda = 1.0 + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=reg_lambda, min_child_weight=0.0, + ) + + assert split.feature == 0, f"Should split on feature 0, got {split.feature}" + + # Manual gain computation for the best split on feature 0 at threshold=9 + # Left: grad=10, hess=10 -> score = 10^2/(10+1) = 100/11 + # Right: grad=-10, hess=10 -> score = (-10)^2/(10+1) = 100/11 + # Parent: grad=0, hess=20 -> score = 0^2/(20+1) = 0 + # Gain = 100/11 + 100/11 - 0 = 200/11 ≈ 18.18 + expected_gain = 100.0 / 11.0 + 100.0 / 11.0 - 0.0 + np.testing.assert_almost_equal( + split.gain, expected_gain, decimal=3, + err_msg=f"Gain should be {expected_gain}, got {split.gain}" + ) + + def test_split_selects_optimal_feature(self): + """Feature with highest gain should be selected.""" + n_features = 3 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Feature 0: weak split + hist_grad[0, :5] = 0.1 + hist_hess[0, :5] = 1.0 + hist_grad[0, 5:10] = -0.1 + hist_hess[0, 5:10] = 1.0 + + # Feature 1: NO split possible (constant) + hist_grad[1, 0] = 0.0 + hist_hess[1, 0] = 10.0 + + # Feature 2: strong split (large 
gradient difference) + hist_grad[2, :5] = 5.0 + hist_hess[2, :5] = 1.0 + hist_grad[2, 5:10] = -5.0 + hist_hess[2, 5:10] = 1.0 + + total_grad = float(np.sum(hist_grad[0])) + total_hess = float(np.sum(hist_hess[0])) + + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=1.0, min_child_weight=0.0, + ) + + assert split.feature == 2, f"Should pick feature 2 (strongest), got {split.feature}" + + def test_split_min_child_weight_enforcement(self): + """Splits that violate min_child_weight should be rejected.""" + n_features = 1 + hist_grad = np.zeros((n_features, 256), dtype=np.float32) + hist_hess = np.zeros((n_features, 256), dtype=np.float32) + + # Only one sample in bin 0, rest in bin 1 + hist_grad[0, 0] = 5.0 + hist_hess[0, 0] = 0.5 # Below min_child_weight=1.0 + hist_grad[0, 1] = -5.0 + hist_hess[0, 1] = 10.0 + + total_grad = 0.0 + total_hess = 10.5 + + split = find_best_split( + hist_grad, hist_hess, total_grad, total_hess, + reg_lambda=1.0, min_child_weight=1.0, + ) + + # Split at threshold=0 would put hess=0.5 in left, violating min_child_weight=1.0 + # Should either find no split or a different threshold + if split.is_valid and split.threshold == 0: + pytest.fail("Split at threshold=0 should be rejected (hess=0.5 < min_child_weight=1.0)") + + +# ============================================================================= +# Leaf Value Correctness +# ============================================================================= + + +class TestLeafValueCorrectness: + """Verify leaf value computation follows Newton-Raphson formula.""" + + def test_newton_raphson_formula(self): + """leaf_value = -sum_grad / (sum_hess + lambda).""" + # Case 1: simple + val = compute_leaf_value(sum_grad=6.0, sum_hess=3.0, reg_lambda=1.0) + expected = -6.0 / (3.0 + 1.0) # -1.5 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Case 2: negative gradient + val = compute_leaf_value(sum_grad=-3.0, sum_hess=2.0, reg_lambda=1.0) + expected = 
3.0 / (2.0 + 1.0) # 1.0 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Case 3: zero gradient + val = compute_leaf_value(sum_grad=0.0, sum_hess=5.0, reg_lambda=1.0) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + # Case 4: large lambda + val = compute_leaf_value(sum_grad=10.0, sum_hess=2.0, reg_lambda=100.0) + expected = -10.0 / (2.0 + 100.0) # -0.098... + np.testing.assert_almost_equal(val, expected, decimal=10) + + def test_l1_soft_thresholding_below_threshold(self): + """When |sum_grad| <= reg_alpha, leaf value should be 0.""" + val = compute_leaf_value(sum_grad=0.5, sum_hess=5.0, reg_lambda=1.0, reg_alpha=1.0) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + val = compute_leaf_value(sum_grad=-0.3, sum_hess=5.0, reg_lambda=1.0, reg_alpha=0.5) + np.testing.assert_almost_equal(val, 0.0, decimal=10) + + def test_l1_soft_thresholding_above_threshold(self): + """When |sum_grad| > reg_alpha, apply soft-thresholding.""" + # Positive gradient above threshold + val = compute_leaf_value(sum_grad=2.0, sum_hess=3.0, reg_lambda=1.0, reg_alpha=0.5) + expected = -(2.0 - 0.5) / (3.0 + 1.0) # -0.375 + np.testing.assert_almost_equal(val, expected, decimal=10) + + # Negative gradient above threshold + val = compute_leaf_value(sum_grad=-2.0, sum_hess=3.0, reg_lambda=1.0, reg_alpha=0.5) + expected = -(-2.0 + 0.5) / (3.0 + 1.0) # 0.375 + np.testing.assert_almost_equal(val, expected, decimal=10) + + +# ============================================================================= +# Partition Correctness +# ============================================================================= + + +class TestPartitionCorrectness: + """Verify sample partitioning preserves counts and is consistent.""" + + def test_partition_conserves_samples(self, binned_100x5, mse_grads_100): + """After partitioning, n_left + n_right = n_total.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + sample_node_ids = ob.init_sample_node_ids(100) + + histograms = 
ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + splits = ob.find_node_splits(histograms) + + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + left_id = splits[0].left_child + right_id = splits[0].right_child + + n_left = np.sum(new_node_ids == left_id) + n_right = np.sum(new_node_ids == right_id) + + assert n_left + n_right == 100, ( + f"Partition should conserve samples: {n_left} + {n_right} != 100" + ) + assert n_left > 0, "Left child should have at least 1 sample" + assert n_right > 0, "Right child should have at least 1 sample" + + def test_partition_deterministic(self, binned_100x5, mse_grads_100): + """Same data should produce same partition.""" + binned, _ = binned_100x5 + grad, hess = mse_grads_100 + + results = [] + for _ in range(2): + sample_node_ids = ob.init_sample_node_ids(100) + histograms = ob.build_node_histograms( + binned.data, grad, hess, sample_node_ids, [0] + ) + splits = ob.find_node_splits(histograms) + if splits and 0 in splits and splits[0].split.is_valid: + new_node_ids = ob.partition_samples(binned.data, sample_node_ids, splits) + results.append(new_node_ids.copy()) + + if len(results) == 2: + np.testing.assert_array_equal(results[0], results[1]) + + def test_tree_depth_matches_max_depth(self, regression_100x5): + """Trees must respect max_depth constraint.""" + X, y = regression_100x5 + binned = ob.array(X) + grad = (2 * (np.zeros(100, dtype=np.float32) - y)).astype(np.float32) + hess = np.ones(100, dtype=np.float32) * 2 + + for depth in [1, 2, 3, 4, 5]: + tree = ob.fit_tree(binned, grad, hess, max_depth=depth) + assert tree.depth <= depth, f"Tree depth {tree.depth} > max_depth {depth}" + + +# ============================================================================= +# End-to-End Algorithmic Correctness +# ============================================================================= + + +class 
TestAlgorithmicCorrectness: + """End-to-end correctness of the boosting algorithm.""" + + def test_boosting_monotonic_loss_decrease(self, regression_200x10): + """Loss should decrease every round (for reasonable settings).""" + X, y = regression_200x10 + binned = ob.array(X) + pred = np.zeros(200, dtype=np.float32) + + losses = [] + for _ in range(5): + loss = float(np.mean((pred - y) ** 2)) + losses.append(loss) + grad = (2 * (pred - y)).astype(np.float32) + hess = np.ones(200, dtype=np.float32) * 2 + tree = ob.fit_tree(binned, grad, hess, max_depth=4) + pred = pred + 0.1 * tree(binned) + + # Each subsequent loss should be lower + for i in range(1, len(losses)): + assert losses[i] < losses[i - 1], ( + f"Loss should decrease monotonically: round {i}: {losses[i]} >= {losses[i-1]}" + ) + + def test_converges_to_mean_for_constant_target(self): + """For constant y, predictions should converge to that constant.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = np.full(100, 3.7, dtype=np.float32) + + model = ob.GradientBoosting(n_trees=50, max_depth=2, learning_rate=0.3) + model.fit(X, y) + pred = model.predict(X) + + # Predictions should be very close to 3.7 + np.testing.assert_allclose(pred, 3.7, atol=0.1, + err_msg="Predictions should converge to constant target value") + + def test_single_split_tree_matches_manual(self): + """A depth-1 tree with simple data should produce predictable splits.""" + # Feature 0 clearly splits the target + X = np.array([ + [-2.0, 0.0], + [-1.0, 0.0], + [1.0, 0.0], + [2.0, 0.0], + ], dtype=np.float32) + y = np.array([-1.0, -1.0, 1.0, 1.0], dtype=np.float32) + + binned = ob.array(X) + grad = (2 * (np.zeros(4, dtype=np.float32) - y)).astype(np.float32) # [-2, -2, 2, 2] * -1 = [2, 2, -2, -2] + hess = np.ones(4, dtype=np.float32) * 2 + + tree = ob.fit_tree(binned, grad, hess, max_depth=1) + + # Should split on feature 0 + assert tree.n_nodes >= 3, "Depth-1 tree should have at least 3 nodes (root + 2 
leaves)" + assert tree.depth == 1 + + # Predictions for left vs right should have opposite signs + pred = tree(binned) + assert pred[0] * pred[2] < 0 or np.abs(pred[0] - pred[2]) > 0.1, ( + "Left and right predictions should differ for this clear split" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_linear_leaf.py b/tests/test_linear_leaf.py new file mode 100644 index 0000000..e2dea7a --- /dev/null +++ b/tests/test_linear_leaf.py @@ -0,0 +1,180 @@ +"""Tests for LinearLeafGBDT model. + +Verifies that gradient boosting with linear models in leaves works correctly +on CPU. These are the first CPU tests for this model variant. +""" + +import numpy as np +import pytest + +import openboost as ob + + +class TestLinearLeafBasic: + """Basic functionality tests.""" + + def test_basic_fit_predict(self, regression_200x10): + """Fit and predict should produce correct shapes and dtypes.""" + X, y = regression_200x10 + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=3, learning_rate=0.1) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape, f"Expected shape {y.shape}, got {pred.shape}" + assert pred.dtype == np.float32 + assert np.all(np.isfinite(pred)), "Predictions should be finite" + + def test_training_reduces_loss(self, regression_200x10): + """More trees should reduce training loss.""" + X, y = regression_200x10 + + model_few = ob.LinearLeafGBDT(n_trees=5, max_depth=3) + model_few.fit(X, y) + mse_few = np.mean((model_few.predict(X) - y) ** 2) + + model_many = ob.LinearLeafGBDT(n_trees=30, max_depth=3) + model_many.fit(X, y) + mse_many = np.mean((model_many.predict(X) - y) ** 2) + + assert mse_many < mse_few, ( + f"More trees should reduce MSE: {mse_many} >= {mse_few}" + ) + + def test_deterministic(self, regression_100x5): + """Same input should produce identical output.""" + X, y = regression_100x5 + + model1 = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model1.fit(X, y) + pred1 = model1.predict(X) + + 
model2 = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model2.fit(X, y) + pred2 = model2.predict(X) + + np.testing.assert_array_equal(pred1, pred2) + + def test_predict_before_fit_raises(self): + """Predict on unfitted model should raise.""" + model = ob.LinearLeafGBDT(n_trees=5) + rng = np.random.RandomState(42) + X = rng.randn(10, 3).astype(np.float32) + + with pytest.raises((RuntimeError, AttributeError)): + model.predict(X) + + +class TestLinearLeafExtrapolation: + """Verify that linear leaves improve extrapolation.""" + + def test_extrapolation_on_linear_target(self): + """LinearLeaf should extrapolate better than standard GBDT on linear data.""" + rng = np.random.RandomState(42) + # Training: X in [-2, 2] + X_train = rng.uniform(-2, 2, (200, 3)).astype(np.float32) + y_train = (2 * X_train[:, 0] + X_train[:, 1]).astype(np.float32) + + # Test: X in [3, 5] (extrapolation region) + X_test = rng.uniform(3, 5, (50, 3)).astype(np.float32) + y_test = (2 * X_test[:, 0] + X_test[:, 1]).astype(np.float32) + + # Standard GBDT + standard = ob.GradientBoosting(n_trees=50, max_depth=4, learning_rate=0.1) + standard.fit(X_train, y_train) + std_pred = standard.predict(X_test) + _ = np.mean((std_pred - y_test) ** 2) + + # Linear Leaf GBDT + linear = ob.LinearLeafGBDT(n_trees=50, max_depth=3, learning_rate=0.1) + linear.fit(X_train, y_train) + lin_pred = linear.predict(X_test) + _ = np.mean((lin_pred - y_test) ** 2) + + # Linear leaf should extrapolate better (or at least comparably) + # We don't assert strict superiority since it depends on the data + assert np.all(np.isfinite(lin_pred)), "Linear leaf predictions should be finite" + # At minimum, linear leaf predictions should be in a reasonable range + assert np.max(np.abs(lin_pred)) < 100, "Predictions shouldn't explode" + + +class TestLinearLeafEdgeCases: + """Edge cases for LinearLeafGBDT.""" + + def test_with_constant_features(self): + """Should handle constant features gracefully.""" + rng = np.random.RandomState(42) + X = 
rng.randn(100, 3).astype(np.float32) + X[:, 1] = 5.0 # Constant feature + y = X[:, 0].copy() + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=2) + model.fit(X, y) + pred = model.predict(X) + + assert np.all(np.isfinite(pred)) + + def test_with_missing_values(self): + """Should handle NaN in features.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 3).astype(np.float32) + X[:5, 0] = np.nan + y = rng.randn(100).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=10, max_depth=2) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + def test_shallow_trees_with_linear_leaves(self): + """Shallow trees (depth 1-2) should still work with linear leaves.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 5).astype(np.float32) + y = (X[:, 0] + 0.5 * X[:, 1]).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=20, max_depth=1, learning_rate=0.1) + model.fit(X, y) + pred = model.predict(X) + + mse = np.mean((pred - y) ** 2) + baseline_mse = np.var(y) + assert mse < baseline_mse, "Model should fit better than mean prediction" + + def test_single_tree(self): + """Should work with a single tree.""" + rng = np.random.RandomState(42) + X = rng.randn(50, 3).astype(np.float32) + y = rng.randn(50).astype(np.float32) + + model = ob.LinearLeafGBDT(n_trees=1, max_depth=3) + model.fit(X, y) + pred = model.predict(X) + + assert pred.shape == y.shape + assert np.all(np.isfinite(pred)) + + +class TestLinearLeafPersistence: + """Save/load functionality.""" + + def test_save_load_roundtrip(self, regression_100x5, tmp_path): + """Predictions should match after save/load.""" + X, y = regression_100x5 + + model = ob.LinearLeafGBDT(n_trees=5, max_depth=2) + model.fit(X, y) + pred_before = model.predict(X) + + path = str(tmp_path / "linear_leaf_model.json") + model.save(path) + + loaded = ob.LinearLeafGBDT.load(path) + pred_after = loaded.predict(X) + + np.testing.assert_array_equal(pred_before, pred_after) 
+ + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_loss_correctness.py b/tests/test_loss_correctness.py new file mode 100644 index 0000000..62dc08c --- /dev/null +++ b/tests/test_loss_correctness.py @@ -0,0 +1,298 @@ +"""Loss function correctness tests for OpenBoost. + +Verifies all loss functions using two approaches: +1. Analytical: compare gradient/hessian against independently computed formulas +2. Numerical differentiation: central differences vs returned gradient + +This catches sign errors, missing factors, numerical instability. +""" + +import numpy as np +import pytest + +from openboost._loss import ( + compute_loss_value, + gamma_gradient, + get_loss_function, + huber_gradient, # noqa: F401 + logloss_gradient, + mae_gradient, + mse_gradient, + poisson_gradient, + quantile_gradient, # noqa: F401 + tweedie_gradient, +) + + +def _numerical_gradient(loss_name, pred, y, eps=1e-5, **kwargs): + """Compute gradient numerically via central differences. 
+ + grad_i ≈ (L(pred+eps) - L(pred-eps)) / (2*eps) + """ + pred = np.asarray(pred, dtype=np.float64) + y = np.asarray(y, dtype=np.float64) + n = len(pred) + num_grad = np.zeros(n, dtype=np.float64) + + for i in range(n): + pred_plus = pred.copy() + pred_minus = pred.copy() + pred_plus[i] += eps + pred_minus[i] -= eps + loss_plus = compute_loss_value(loss_name, pred_plus, y, **kwargs) * n + loss_minus = compute_loss_value(loss_name, pred_minus, y, **kwargs) * n + num_grad[i] = (loss_plus - loss_minus) / (2 * eps) + + return num_grad + + +# ============================================================================= +# Analytical gradient verification +# ============================================================================= + + +class TestAnalyticalGradients: + """Verify gradients against independently computed formulas.""" + + def test_mse_gradient_analytical(self): + """MSE: grad = 2*(pred - y), hess = 2.""" + pred = np.array([1.0, 2.0, 3.0], dtype=np.float32) + y = np.array([0.5, 2.5, 1.0], dtype=np.float32) + + grad, hess = mse_gradient(pred, y) + + expected_grad = 2.0 * (pred - y) + np.testing.assert_allclose(grad, expected_grad, atol=1e-6) + np.testing.assert_allclose(hess, 2.0, atol=1e-6) + + def test_mse_gradient_zero_at_match(self): + """MSE gradient should be zero when pred == y.""" + y = np.array([1.0, 2.0, 3.0], dtype=np.float32) + grad, hess = mse_gradient(y.copy(), y) + np.testing.assert_allclose(grad, 0.0, atol=1e-6) + + def test_logloss_gradient_analytical(self): + """LogLoss: grad = sigmoid(pred) - y, hess = p*(1-p).""" + pred = np.array([0.0, 1.0, -1.0, 2.0], dtype=np.float32) + y = np.array([1.0, 0.0, 1.0, 1.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + # Independently compute sigmoid + p = 1.0 / (1.0 + np.exp(-pred.astype(np.float64))) + expected_grad = (p - y).astype(np.float32) + expected_hess = np.clip((p * (1 - p)).astype(np.float32), 1e-6, 1.0 - 1e-6) + + np.testing.assert_allclose(grad, expected_grad, 
atol=1e-5) + np.testing.assert_allclose(hess, expected_hess, atol=1e-5) + + def test_mae_gradient_sign(self): + """MAE: grad = sign(pred - y).""" + pred = np.array([1.0, 2.0, 3.0], dtype=np.float32) + y = np.array([0.5, 3.0, 1.0], dtype=np.float32) + + grad, hess = mae_gradient(pred, y) + + expected_sign = np.sign(pred - y) + np.testing.assert_allclose(grad, expected_sign, atol=1e-5) + + def test_poisson_gradient_analytical(self): + """Poisson: grad = exp(pred) - y.""" + pred = np.array([0.0, 1.0, -0.5], dtype=np.float32) + y = np.array([2.0, 1.0, 3.0], dtype=np.float32) + + grad, hess = poisson_gradient(pred, y) + + expected_grad = np.exp(pred) - y + np.testing.assert_allclose(grad, expected_grad, atol=1e-4) + + def test_gamma_gradient_analytical(self): + """Gamma: grad = 1 - y*exp(-pred), hess = y*exp(-pred).""" + pred = np.array([1.0, 0.5, 2.0], dtype=np.float32) + y = np.array([2.0, 1.0, 3.0], dtype=np.float32) + + grad, hess = gamma_gradient(pred, y) + + expected_grad = 1.0 - y * np.exp(-pred) + np.testing.assert_allclose(grad, expected_grad, atol=1e-4) + + +# ============================================================================= +# Numerical differentiation verification +# ============================================================================= + + +class TestNumericalGradients: + """Verify gradients match numerical differentiation (central differences).""" + + def _check_gradient(self, loss_name, pred, y, atol=1e-3, **kwargs): + """Helper: compare analytical gradient against numerical gradient.""" + loss_fn = get_loss_function(loss_name, **kwargs) + grad, _ = loss_fn( + np.asarray(pred, dtype=np.float32), + np.asarray(y, dtype=np.float32), + ) + grad = np.asarray(grad, dtype=np.float64) + + num_grad = _numerical_gradient(loss_name, pred, y, **kwargs) + + np.testing.assert_allclose( + grad, num_grad, atol=atol, + err_msg=f"Gradient mismatch for {loss_name}" + ) + + def test_mse_gradient_numerical(self): + pred = np.array([1.0, 2.5, -0.3]) + y = 
np.array([0.5, 3.0, 1.0]) + self._check_gradient('mse', pred, y) + + def test_logloss_gradient_numerical(self): + pred = np.array([0.5, -1.0, 2.0]) + y = np.array([1.0, 0.0, 1.0]) + self._check_gradient('logloss', pred, y) + + def test_huber_gradient_numerical(self): + pred = np.array([1.0, 5.0, -2.0]) + y = np.array([0.5, 0.0, 1.0]) + self._check_gradient('huber', pred, y, huber_delta=1.0) + + def test_poisson_gradient_numerical(self): + pred = np.array([0.5, 1.0, -0.5]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('poisson', pred, y, atol=1e-2) + + def test_gamma_gradient_numerical(self): + pred = np.array([0.5, 1.0, 1.5]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('gamma', pred, y, atol=1e-2) + + @pytest.mark.parametrize("rho", [1.1, 1.5, 1.9]) + def test_tweedie_gradient_numerical(self, rho): + pred = np.array([0.5, 1.0, 0.2]) + y = np.array([2.0, 1.0, 3.0]) + self._check_gradient('tweedie', pred, y, tweedie_rho=rho, atol=1e-2) + + @pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9]) + def test_quantile_gradient_numerical(self, alpha): + """Quantile loss gradient via numerical differentiation. + + Note: quantile loss has discontinuous gradient at pred=y, + so we avoid exact match points. 
+ """ + pred = np.array([1.5, 0.3, -0.7]) + y = np.array([1.0, 2.0, 0.5]) + self._check_gradient('quantile', pred, y, quantile_alpha=alpha, atol=0.1) + + +# ============================================================================= +# Edge cases and numerical stability +# ============================================================================= + + +class TestLossEdgeCases: + """Edge cases that can cause NaN, overflow, or incorrect behavior.""" + + def test_logloss_extreme_negative_pred(self): + """Very negative predictions should not produce NaN.""" + pred = np.array([-500.0, -100.0], dtype=np.float32) + y = np.array([1.0, 0.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"NaN in logloss grad: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in logloss hess: {hess}" + + def test_logloss_extreme_positive_pred(self): + """Very positive predictions should not produce NaN.""" + pred = np.array([500.0, 100.0], dtype=np.float32) + y = np.array([0.0, 1.0], dtype=np.float32) + + grad, hess = logloss_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"NaN in logloss grad: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in logloss hess: {hess}" + + def test_poisson_large_pred_no_overflow(self): + """exp(pred) should not overflow for large predictions.""" + pred = np.array([15.0, 18.0], dtype=np.float32) + y = np.array([1.0, 2.0], dtype=np.float32) + + grad, hess = poisson_gradient(pred, y) + + assert np.all(np.isfinite(grad)), f"Overflow in poisson grad: {grad}" + assert np.all(np.isfinite(hess)), f"Overflow in poisson hess: {hess}" + + def test_tweedie_zero_y(self): + """y=0 is valid for Tweedie — should not produce NaN.""" + pred = np.array([0.5, 1.0], dtype=np.float32) + y = np.array([0.0, 0.0], dtype=np.float32) + + grad, hess = tweedie_gradient(pred, y, rho=1.5) + + assert np.all(np.isfinite(grad)), f"NaN in tweedie grad with y=0: {grad}" + assert np.all(np.isfinite(hess)), f"NaN in tweedie hess 
with y=0: {hess}" + + def test_all_losses_finite_on_normal_input(self): + """Every built-in loss should produce finite grad/hess on normal inputs.""" + rng = np.random.RandomState(42) + pred = rng.randn(20).astype(np.float32) + y_reg = rng.randn(20).astype(np.float32) + y_bin = (rng.rand(20) > 0.5).astype(np.float32) + y_pos = np.abs(y_reg) + 0.1 # Positive for Poisson/Gamma + + losses_and_data = [ + ('mse', pred, y_reg), + ('mae', pred, y_reg), + ('huber', pred, y_reg), + ('logloss', pred, y_bin), + ('poisson', pred * 0.5, y_pos), # Smaller pred to avoid overflow + ('gamma', np.abs(pred) + 0.1, y_pos), + ] + + for loss_name, p, y in losses_and_data: + loss_fn = get_loss_function(loss_name) + grad, hess = loss_fn(p, y) + assert np.all(np.isfinite(grad)), f"{loss_name}: NaN/inf in grad" + assert np.all(np.isfinite(hess)), f"{loss_name}: NaN/inf in hess" + + +# ============================================================================= +# Loss value computation +# ============================================================================= + + +class TestLossValueComputation: + """Verify compute_loss_value returns correct scalar losses.""" + + def test_mse_loss_value(self): + pred = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 2.0]) + loss = compute_loss_value('mse', pred, y) + expected = np.mean((pred - y) ** 2) + np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_mae_loss_value(self): + pred = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 2.0]) + loss = compute_loss_value('mae', pred, y) + expected = np.mean(np.abs(pred - y)) + np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_logloss_value(self): + pred = np.array([2.0, -1.0]) + y = np.array([1.0, 0.0]) + loss = compute_loss_value('logloss', pred, y) + # Manual: p = sigmoid(pred), -mean(y*log(p) + (1-y)*log(1-p)) + p = 1.0 / (1.0 + np.exp(-pred.astype(np.float64))) + expected = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p)) + 
np.testing.assert_almost_equal(loss, expected, decimal=6) + + def test_loss_zero_when_perfect(self): + """MSE loss should be zero when predictions are perfect.""" + y = np.array([1.0, 2.0, 3.0]) + loss = compute_loss_value('mse', y, y) + np.testing.assert_almost_equal(loss, 0.0, decimal=10) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_numerical_agreement.py b/tests/test_numerical_agreement.py new file mode 100644 index 0000000..e79f595 --- /dev/null +++ b/tests/test_numerical_agreement.py @@ -0,0 +1,268 @@ +"""Numerical agreement tests: OpenBoost vs XGBoost. + +For matched hyperparameters, OpenBoost predictions should be very close +to XGBoost predictions. This is the strongest end-to-end correctness signal. + +All tests are marked @pytest.mark.xgboost and skip if xgboost is not installed. +""" + +import numpy as np +import pytest + +import openboost as ob + +xgb = pytest.importorskip("xgboost") + + +def _matched_params(n_trees=50, max_depth=4): + """Hyperparameters that align OpenBoost and XGBoost behavior.""" + return dict( + ob_params=dict( + n_trees=n_trees, + max_depth=max_depth, + learning_rate=0.1, + reg_lambda=1.0, + min_child_weight=1.0, + subsample=1.0, + colsample_bytree=1.0, + ), + xgb_params=dict( + n_estimators=n_trees, + max_depth=max_depth, + learning_rate=0.1, + reg_lambda=1.0, + reg_alpha=0.0, + min_child_weight=1.0, + subsample=1.0, + colsample_bytree=1.0, + tree_method='hist', + max_bin=255, + random_state=42, + ), + ) + + +@pytest.mark.xgboost +class TestXGBoostRegressionAgreement: + """Regression prediction agreement between OpenBoost and XGBoost.""" + + def test_single_tree_very_close(self, regression_500x10): + """Single depth-1 tree should produce very similar predictions.""" + X, y = regression_500x10 + params = _matched_params(n_trees=1, max_depth=1) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = 
xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + # Single tree, depth 1: very few ways to differ + rmse_diff = np.sqrt(np.mean((ob_pred - xgb_pred) ** 2)) + target_std = np.std(y) + + assert rmse_diff / target_std < 0.05, ( + f"Single tree predictions differ too much: RMSE diff = {rmse_diff:.4f}, " + f"target std = {target_std:.4f} (ratio = {rmse_diff/target_std:.3f})" + ) + + def test_regression_predictions_close(self, regression_500x10): + """50-tree predictions should be within 10% relative RMSE.""" + X, y = regression_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + rmse_diff = np.sqrt(np.mean((ob_pred - xgb_pred) ** 2)) + rmse_target = np.sqrt(np.mean((y - np.mean(y)) ** 2)) + + assert rmse_diff / rmse_target < 0.10, ( + f"Prediction RMSE diff {rmse_diff:.4f} is >{10}% of target RMSE {rmse_target:.4f}" + ) + + def test_predictions_same_direction(self, regression_500x10): + """Predictions should agree on relative ordering (correlation > 0.95).""" + X, y = regression_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = xgb_model.predict(X) + + correlation = np.corrcoef(ob_pred, xgb_pred)[0, 1] + assert correlation > 0.95, ( + f"Prediction correlation should be > 0.95, got {correlation:.4f}" + ) + + +@pytest.mark.xgboost +class TestXGBoostClassificationAgreement: + """Classification prediction agreement.""" + + def test_classification_probabilities_close(self, binary_500x10): + """Predicted probabilities should be within 0.10 of each other.""" + X, y = binary_500x10 + params 
= _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X, y) + ob_raw = ob_model.predict(X) + # Convert logits to probabilities + ob_prob = 1.0 / (1.0 + np.exp(-ob_raw)) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + eval_metric='logloss', + **params['xgb_params'], + ) + xgb_model.fit(X, y) + xgb_prob = xgb_model.predict_proba(X)[:, 1] + + mean_diff = np.mean(np.abs(ob_prob - xgb_prob)) + + assert mean_diff < 0.10, ( + f"Mean probability difference {mean_diff:.4f} > 0.10" + ) + + def test_classification_accuracy_comparable(self, binary_500x10): + """Both models should achieve similar accuracy.""" + X, y = binary_500x10 + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X, y) + ob_raw = ob_model.predict(X) + ob_labels = (ob_raw > 0).astype(float) + ob_acc = np.mean(ob_labels == y) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + **params['xgb_params'], + ) + xgb_model.fit(X, y) + xgb_labels = xgb_model.predict(X) + xgb_acc = np.mean(xgb_labels == y) + + # Accuracies should be within 5 percentage points + assert abs(ob_acc - xgb_acc) < 0.05, ( + f"Accuracy gap too large: OB={ob_acc:.3f}, XGB={xgb_acc:.3f}" + ) + + +@pytest.mark.xgboost +class TestXGBoostQualityParity: + """Model quality should be competitive with XGBoost.""" + + def test_regression_r2_competitive(self, regression_500x10): + """OpenBoost R2 should be within 15% of XGBoost R2.""" + X, y = regression_500x10 + params = _matched_params(n_trees=100, max_depth=4) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X, y) + ob_pred = ob_model.predict(X) + ss_res_ob = np.sum((y - ob_pred) ** 2) + ss_tot = np.sum((y - np.mean(y)) ** 2) + ob_r2 = 1 - ss_res_ob / ss_tot + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X, y) + xgb_pred = 
xgb_model.predict(X) + ss_res_xgb = np.sum((y - xgb_pred) ** 2) + xgb_r2 = 1 - ss_res_xgb / ss_tot + + assert ob_r2 > xgb_r2 * 0.85, ( + f"OpenBoost R2 ({ob_r2:.4f}) should be within 15% of XGBoost R2 ({xgb_r2:.4f})" + ) + + @pytest.mark.slow + def test_regression_california_housing(self): + """Real dataset: California Housing regression.""" + pytest.importorskip("sklearn") + from sklearn.datasets import fetch_california_housing + from sklearn.model_selection import train_test_split + + try: + data = fetch_california_housing() + except Exception: + pytest.skip("Could not download California Housing dataset") + + X_train, X_test, y_train, y_test = train_test_split( + data.data.astype(np.float32), + data.target.astype(np.float32), + test_size=0.2, random_state=42, + ) + + params = _matched_params(n_trees=100, max_depth=6) + + ob_model = ob.GradientBoosting(**params['ob_params']) + ob_model.fit(X_train, y_train) + ob_pred = ob_model.predict(X_test) + ob_rmse = np.sqrt(np.mean((ob_pred - y_test) ** 2)) + + xgb_model = xgb.XGBRegressor(**params['xgb_params']) + xgb_model.fit(X_train, y_train) + xgb_pred = xgb_model.predict(X_test) + xgb_rmse = np.sqrt(np.mean((xgb_pred - y_test) ** 2)) + + # OpenBoost RMSE should be within 15% of XGBoost RMSE + assert ob_rmse < xgb_rmse * 1.15, ( + f"OB RMSE ({ob_rmse:.4f}) > 1.15x XGB RMSE ({xgb_rmse:.4f})" + ) + + @pytest.mark.slow + def test_binary_breast_cancer(self): + """Real dataset: Breast Cancer binary classification.""" + pytest.importorskip("sklearn") + from sklearn.datasets import load_breast_cancer + from sklearn.model_selection import train_test_split + + data = load_breast_cancer() + X_train, X_test, y_train, y_test = train_test_split( + data.data.astype(np.float32), + data.target.astype(np.float32), + test_size=0.2, random_state=42, + ) + + params = _matched_params(n_trees=50, max_depth=4) + + ob_model = ob.GradientBoosting( + loss='logloss', **params['ob_params'] + ) + ob_model.fit(X_train, y_train) + ob_pred = 
ob_model.predict(X_test) + ob_acc = np.mean((ob_pred > 0).astype(float) == y_test) + + xgb_model = xgb.XGBClassifier( + objective='binary:logistic', + **params['xgb_params'], + ) + xgb_model.fit(X_train, y_train) + xgb_acc = np.mean(xgb_model.predict(X_test) == y_test) + + # Both should achieve > 90% accuracy + assert ob_acc > 0.90, f"OB accuracy {ob_acc:.3f} < 0.90" + # Within 5 points of each other + assert abs(ob_acc - xgb_acc) < 0.05, ( + f"Accuracy gap: OB={ob_acc:.3f}, XGB={xgb_acc:.3f}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock index 077322d..f9bcdcf 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -615,6 +615,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + [[package]] name = "fastrlock" version = "0.8.3" @@ -2405,6 +2414,7 @@ all = [ { name = "numba-cuda" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, 
{ name = "ray", extra = ["default"] }, { name = "ruff" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -2426,6 +2436,7 @@ dev = [ { name = "modal" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -2445,6 +2456,7 @@ sklearn = [ test = [ { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, ] torch = [ { name = "torch" }, @@ -2460,6 +2472,7 @@ dev = [ { name = "mypy" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "xgboost" }, ] @@ -2477,6 +2490,7 @@ requires-dist = [ { name = "openboost", extras = ["test", "bench", "sklearn"], marker = "extra == 'dev'" }, { name = "pytest", marker = "extra == 'test'", specifier = ">=7.0" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0" }, + { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.0" }, { name = "ray", extras = ["default"], marker = "extra == 'distributed'", specifier = ">=2.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" }, { name = "scikit-learn", marker = "extra == 'bench'", specifier = ">=1.0" }, @@ -2494,10 +2508,11 @@ dev = [ { name = "mkdocs-section-index", specifier = ">=0.3.10" }, { name = "mkdocstrings", extras = ["python"], specifier = ">=1.0.1" }, { name = "mypy", specifier = ">=1.19.1" }, - { name = "pytest", specifier = ">=8.0" }, + { name = "pytest", specifier = ">=7.0" }, { name = "pytest-cov", specifier = ">=4.0" }, + { name = "pytest-xdist", specifier = ">=3.0" }, { name = "ruff", specifier = ">=0.4" }, - { name = "xgboost", specifier = ">=3.1.3" 
}, + { name = "xgboost", specifier = ">=2.0" }, ] [[package]] @@ -3016,6 +3031,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From c2a4e84293b876cc995a83423dc1f657bb764768 Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:32:48 -0700 Subject: [PATCH 2/6] Fix all ruff lint errors (276 errors resolved) - Add E402, F821 to ruff ignore list (intentional patterns) - Auto-fix: unused imports, f-string placeholders, nested ifs, ternaries - Add `from err` to raise-in-except clauses (B904) - Suppress intentional ABC no-op methods (B024/B027) - Rename unused loop variables (B007) Co-Authored-By: Claude Opus 4.6 --- .claude/worktrees/silly-kirch | 1 + pyproject.toml | 6 +- src/openboost/__init__.py | 285 ++++++++++++----------- src/openboost/_array.py | 10 +- src/openboost/_backends/_cpu.py | 26 +-- src/openboost/_backends/_cuda.py | 51 ++-- src/openboost/_batch.py | 3 +- 
src/openboost/_boosting.py | 6 +- src/openboost/_callbacks.py | 22 +- src/openboost/_core/__init__.py | 53 ++--- src/openboost/_core/_growth.py | 25 +- src/openboost/_core/_histogram.py | 1 + src/openboost/_core/_predict.py | 13 +- src/openboost/_core/_primitives.py | 28 +-- src/openboost/_core/_split.py | 3 +- src/openboost/_core/_tree.py | 38 +-- src/openboost/_distributed/__init__.py | 7 +- src/openboost/_distributed/_multigpu.py | 58 ++--- src/openboost/_distributed/_ray.py | 9 +- src/openboost/_distributed/_tree.py | 29 +-- src/openboost/_distributions.py | 18 +- src/openboost/_histogram.py | 2 +- src/openboost/_importance.py | 9 +- src/openboost/_kernels.py | 14 +- src/openboost/_loss.py | 22 +- src/openboost/_models/__init__.py | 28 +-- src/openboost/_models/_batch.py | 3 +- src/openboost/_models/_boosting.py | 34 +-- src/openboost/_models/_dart.py | 15 +- src/openboost/_models/_distributional.py | 10 +- src/openboost/_models/_gam.py | 9 +- src/openboost/_models/_linear_leaf.py | 16 +- src/openboost/_models/_sklearn.py | 18 +- src/openboost/_persistence.py | 12 +- src/openboost/_predict.py | 8 +- src/openboost/_profiler.py | 13 +- src/openboost/_sampling.py | 15 +- src/openboost/_split.py | 2 +- src/openboost/_training.py | 5 +- src/openboost/_utils.py | 29 ++- src/openboost/_validation.py | 15 +- 41 files changed, 442 insertions(+), 529 deletions(-) create mode 160000 .claude/worktrees/silly-kirch diff --git a/.claude/worktrees/silly-kirch b/.claude/worktrees/silly-kirch new file mode 160000 index 0000000..7450b78 --- /dev/null +++ b/.claude/worktrees/silly-kirch @@ -0,0 +1 @@ +Subproject commit 7450b7841f2af64569a9cc573946c42b0fcae851 diff --git a/pyproject.toml b/pyproject.toml index 87a468a..4390e3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,7 +89,11 @@ src = ["src"] [tool.ruff.lint] select = ["E", "F", "I", "UP", "B", "SIM"] -ignore = ["E501"] # Line length handled separately +ignore = [ + "E501", # Line length handled separately + "E402", 
# Imports organized by section in __init__.py + "F821", # DeviceNDArray type hints for optional CUDA (not available at lint time) +] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/openboost/__init__.py b/src/openboost/__init__.py index 206e100..8e21fb9 100644 --- a/src/openboost/__init__.py +++ b/src/openboost/__init__.py @@ -39,7 +39,7 @@ # ============================================================================= # Data Layer # ============================================================================= -from ._array import BinnedArray, array, as_numba_array, MISSING_BIN +from ._array import MISSING_BIN, BinnedArray, array, as_numba_array # ============================================================================= # Core (Foundation) @@ -48,41 +48,43 @@ # Growth strategies (Phase 8.2) GrowthConfig, GrowthStrategy, - TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, - get_growth_strategy, # Leaf value abstractions (Phase 9.0) LeafValues, + LeafWiseGrowth, + LevelWiseGrowth, + # Primitives (Phase 8.1) + NodeHistogram, + NodeSplit, ScalarLeaves, + SymmetricGrowth, + # Symmetric trees + SymmetricTree, + TreeNode, + TreeStructure, VectorLeaves, + build_node_histograms, + compute_leaf_values, + find_node_splits, # Tree building fit_tree, - fit_trees_batch, - Tree as LegacyTree, - TreeNode, fit_tree_gpu_native, - predict_tree, - # Symmetric trees - SymmetricTree, fit_tree_symmetric, fit_tree_symmetric_gpu_native, - predict_symmetric_tree, - # Primitives (Phase 8.1) - NodeHistogram, - NodeSplit, - build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, - compute_leaf_values, - init_sample_node_ids, - get_nodes_at_depth, + fit_trees_batch, get_children, + get_growth_strategy, + get_nodes_at_depth, get_parent, + init_sample_node_ids, + partition_samples, # Prediction predict_ensemble, + predict_symmetric_tree, + predict_tree, + subtract_histogram, +) +from ._core import ( + Tree as LegacyTree, ) # 
Phase 8: TreeStructure is the new Tree @@ -91,78 +93,56 @@ # ============================================================================= # Models (High-Level) # ============================================================================= -from ._models import ( - GradientBoosting, - MultiClassGradientBoosting, - DART, - OpenBoostGAM, - ConfigBatch, - BatchTrainingState, - # Phase 13: sklearn-compatible wrappers - OpenBoostRegressor, - OpenBoostClassifier, - # Phase 15: sklearn wrappers for new models - OpenBoostDistributionalRegressor, - OpenBoostLinearLeafRegressor, - # Phase 15/16: Distributional GBDT (NaturalBoost) - DistributionalGBDT, - NaturalBoost, - NaturalBoostNormal, - NaturalBoostLogNormal, - NaturalBoostGamma, - NaturalBoostPoisson, - NaturalBoostStudentT, - NaturalBoostTweedie, - NaturalBoostNegBin, - # Backward compatibility aliases (deprecated, accessed via __getattr__) - NGBoost as _NGBoost, - NGBoostNormal as _NGBoostNormal, - NGBoostLogNormal as _NGBoostLogNormal, - NGBoostGamma as _NGBoostGamma, - NGBoostPoisson as _NGBoostPoisson, - NGBoostStudentT as _NGBoostStudentT, - NGBoostTweedie as _NGBoostTweedie, - NGBoostNegBin as _NGBoostNegBin, - # Phase 15: Linear Leaf GBDT - LinearLeafGBDT, +# ============================================================================= +# Backend Control +# ============================================================================= +from ._backends import get_backend, is_cpu, is_cuda, set_backend + +# ============================================================================= +# Callbacks (Phase 13) +# ============================================================================= +from ._callbacks import ( + Callback, + CallbackManager, + EarlyStopping, + HistoryCallback, + LearningRateScheduler, + Logger, + ModelCheckpoint, + TrainingState, +) + +# ============================================================================= +# Multi-GPU Training (Phase 18) +# 
============================================================================= +from ._distributed import ( + GPUWorker, + GPUWorkerBase, + MultiGPUContext, + fit_tree_multigpu, ) # ============================================================================= # Distributions (Phase 15) # ============================================================================= from ._distributions import ( + # Custom distributions with autodiff + CustomDistribution, Distribution, DistributionOutput, - Normal, - LogNormal, Gamma, + LogNormal, + NegativeBinomial, + Normal, Poisson, StudentT, # Kaggle competition favorites Tweedie, - NegativeBinomial, - # Custom distributions with autodiff - CustomDistribution, create_custom_distribution, get_distribution, list_distributions, ) -# ============================================================================= -# Callbacks (Phase 13) -# ============================================================================= -from ._callbacks import ( - Callback, - EarlyStopping, - Logger, - ModelCheckpoint, - LearningRateScheduler, - HistoryCallback, - CallbackManager, - TrainingState, -) -from ._profiler import ProfilingCallback - # ============================================================================= # Feature Importance (Phase 13) # ============================================================================= @@ -176,93 +156,124 @@ # Loss Functions # ============================================================================= from ._loss import ( - mse_gradient, - logloss_gradient, - huber_gradient, - mae_gradient, # Phase 9.1 - quantile_gradient, # Phase 9.1 - poisson_gradient, # Phase 9.3 - gamma_gradient, # Phase 9.3 - tweedie_gradient, # Phase 9.3 - softmax_gradient, # Phase 9.2 + gamma_gradient, # Phase 9.3 get_loss_function, + huber_gradient, + logloss_gradient, + mae_gradient, # Phase 9.1 + mse_gradient, + poisson_gradient, # Phase 9.3 + quantile_gradient, # Phase 9.1 + softmax_gradient, # Phase 9.2 + tweedie_gradient, # Phase 
9.3 ) - -# ============================================================================= -# Backend Control -# ============================================================================= -from ._backends import get_backend, set_backend, is_cuda, is_cpu +from ._models import ( + DART, + BatchTrainingState, + ConfigBatch, + # Phase 15/16: Distributional GBDT (NaturalBoost) + DistributionalGBDT, + GradientBoosting, + # Phase 15: Linear Leaf GBDT + LinearLeafGBDT, + MultiClassGradientBoosting, + NaturalBoost, + NaturalBoostGamma, + NaturalBoostLogNormal, + NaturalBoostNegBin, + NaturalBoostNormal, + NaturalBoostPoisson, + NaturalBoostStudentT, + NaturalBoostTweedie, + OpenBoostClassifier, + # Phase 15: sklearn wrappers for new models + OpenBoostDistributionalRegressor, + OpenBoostGAM, + OpenBoostLinearLeafRegressor, + # Phase 13: sklearn-compatible wrappers + OpenBoostRegressor, +) +from ._models import ( + # Backward compatibility aliases (deprecated, accessed via __getattr__) + NGBoost as _NGBoost, +) +from ._models import ( + NGBoostGamma as _NGBoostGamma, +) +from ._models import ( + NGBoostLogNormal as _NGBoostLogNormal, +) +from ._models import ( + NGBoostNegBin as _NGBoostNegBin, +) +from ._models import ( + NGBoostNormal as _NGBoostNormal, +) +from ._models import ( + NGBoostPoisson as _NGBoostPoisson, +) +from ._models import ( + NGBoostStudentT as _NGBoostStudentT, +) +from ._models import ( + NGBoostTweedie as _NGBoostTweedie, +) +from ._profiler import ProfilingCallback # ============================================================================= # Sampling Strategies (Phase 17) # ============================================================================= from ._sampling import ( - SamplingStrategy, GOSSConfig, MiniBatchConfig, - SamplingResult, - goss_sample, - random_sample, - apply_sampling, MiniBatchIterator, + SamplingResult, + SamplingStrategy, accumulate_histograms_minibatch, + apply_sampling, create_memmap_binned, + goss_sample, 
load_memmap_binned, -) - -# ============================================================================= -# Multi-GPU Training (Phase 18) -# ============================================================================= -from ._distributed import ( - MultiGPUContext, - GPUWorkerBase, - GPUWorker, - fit_tree_multigpu, + random_sample, ) # ============================================================================= # Utilities (Phase 20.6) # ============================================================================= -from ._utils import ( - suggest_params, - cross_val_predict, - cross_val_predict_proba, - cross_val_predict_interval, - evaluate_coverage, - get_param_grid, - PARAM_GRID_REGRESSION, - PARAM_GRID_CLASSIFICATION, - PARAM_GRID_DISTRIBUTIONAL, -) - # ============================================================================= # Evaluation Metrics (Phase 22) # ============================================================================= -from ._utils import ( - roc_auc_score, - accuracy_score, - log_loss_score, - mse_score, - r2_score, - mae_score, - rmse_score, - f1_score, - precision_score, - recall_score, -) - # ============================================================================= # Probabilistic/Distributional Metrics (Phase 22 Sprint 2) # ============================================================================= from ._utils import ( - crps_gaussian, - crps_empirical, + PARAM_GRID_CLASSIFICATION, + PARAM_GRID_DISTRIBUTIONAL, + PARAM_GRID_REGRESSION, + accuracy_score, brier_score, - pinball_loss, - interval_score, - expected_calibration_error, calibration_curve, + cross_val_predict, + cross_val_predict_interval, + cross_val_predict_proba, + crps_empirical, + crps_gaussian, + evaluate_coverage, + expected_calibration_error, + f1_score, + get_param_grid, + interval_score, + log_loss_score, + mae_score, + mse_score, negative_log_likelihood, + pinball_loss, + precision_score, + r2_score, + recall_score, + rmse_score, + roc_auc_score, + 
suggest_params, ) _DEPRECATED_ALIASES = { @@ -312,6 +323,7 @@ def __getattr__(name: str): "NaturalBoostStudentT", "NaturalBoostTweedie", "NaturalBoostNegBin", + "LegacyTree", # Backward compatibility (deprecated) "NGBoost", "NGBoostNormal", @@ -377,6 +389,7 @@ def __getattr__(name: str): "fit_tree_symmetric", "fit_tree_symmetric_gpu_native", "SymmetricTree", + "TreeNode", "predict_symmetric_tree", # Training (batch, low-level) "fit_trees_batch", diff --git a/src/openboost/_array.py b/src/openboost/_array.py index 1607813..5370a13 100644 --- a/src/openboost/_array.py +++ b/src/openboost/_array.py @@ -9,8 +9,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np @@ -70,7 +71,7 @@ def any_categorical(self) -> bool: """Check if any feature is categorical.""" return len(self.is_categorical) > 0 and np.any(self.is_categorical) - def transform(self, X: ArrayLike) -> "BinnedArray": + def transform(self, X: ArrayLike) -> BinnedArray: """Transform new data using the bin edges from this BinnedArray. 
Use this method to transform test/validation data using the same @@ -420,10 +421,7 @@ def _bin_categorical_feature( has_nan = bool(np.any(nan_mask)) # Get unique non-missing values - if has_nan: - valid_values = col[~nan_mask] - else: - valid_values = col + valid_values = col[~nan_mask] if has_nan else col unique_vals = np.unique(valid_values) n_categories = len(unique_vals) diff --git a/src/openboost/_backends/_cpu.py b/src/openboost/_backends/_cpu.py index 5c2c7cc..d160e48 100644 --- a/src/openboost/_backends/_cpu.py +++ b/src/openboost/_backends/_cpu.py @@ -5,7 +5,6 @@ import numpy as np from numba import jit, prange - # ============================================================================= # Histogram Functions # ============================================================================= @@ -589,10 +588,7 @@ def _predict_cpu( threshold = tree_thresholds[node] bin_value = binned[feature, i] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[i] = tree_values[node] @@ -671,10 +667,7 @@ def _predict_cpu_with_missing( # Check for missing value if bin_value == MISSING_BIN: # Use learned direction - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif bin_value <= threshold: node = tree_left[node] else: @@ -716,24 +709,15 @@ def _predict_cpu_with_categorical( # Check for missing value if bin_value == MISSING_BIN: - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif is_categorical_split[node]: # Categorical split: use bitmask bitset = cat_bitsets[node] - if (np.int64(1) << bin_value) & bitset: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if np.int64(1) << bin_value & bitset else 
tree_right[node] else: # Numeric split: use threshold threshold = tree_thresholds[node] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[i] = tree_values[node] diff --git a/src/openboost/_backends/_cuda.py b/src/openboost/_backends/_cuda.py index d7e5ba3..c6f9685 100644 --- a/src/openboost/_backends/_cuda.py +++ b/src/openboost/_backends/_cuda.py @@ -406,10 +406,9 @@ def _argmax_with_values_kernel( # Tree reduction to find global max s = block_size // 2 while s > 0: - if thread_idx < s: - if shared_vals[thread_idx + s] > shared_vals[thread_idx]: - shared_vals[thread_idx] = shared_vals[thread_idx + s] - shared_idxs[thread_idx] = shared_idxs[thread_idx + s] + if thread_idx < s and shared_vals[thread_idx + s] > shared_vals[thread_idx]: + shared_vals[thread_idx] = shared_vals[thread_idx + s] + shared_idxs[thread_idx] = shared_idxs[thread_idx + s] cuda.syncthreads() s //= 2 @@ -1148,10 +1147,7 @@ def _predict_kernel( threshold = tree_thresholds[node] bin_value = binned[feature, sample_idx] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[sample_idx] = tree_values[node] @@ -1235,10 +1231,7 @@ def _predict_with_missing_kernel( # Phase 14.2: Check for missing value if bin_value == 255: # MISSING_BIN - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif bin_value <= threshold: node = tree_left[node] else: @@ -1279,24 +1272,15 @@ def _predict_with_categorical_kernel( # Check for missing value first if bin_value == 255: # MISSING_BIN - if tree_missing_left[node]: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if tree_missing_left[node] else tree_right[node] elif 
is_categorical_split[node]: # Categorical split: use bitmask bitset = cat_bitsets[node] - if (int64(1) << bin_value) & bitset: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if int64(1) << bin_value & bitset else tree_right[node] else: # Numeric split: use threshold threshold = tree_thresholds[node] - if bin_value <= threshold: - node = tree_left[node] - else: - node = tree_right[node] + node = tree_left[node] if bin_value <= threshold else tree_right[node] predictions[sample_idx] = tree_values[node] @@ -1949,7 +1933,6 @@ def _find_split_batch_kernel( feature_idx = cuda.blockIdx.x config_idx = cuda.blockIdx.y thread_idx = cuda.threadIdx.x - block_size = cuda.blockDim.x n_features = hist_grad.shape[1] n_configs = hist_grad.shape[0] @@ -2723,11 +2706,10 @@ def _find_level_splits_kernel( # Tree reduction to find global best s = block_size // 2 while s > 0: - if thread_idx < s: - if shared_gains[thread_idx + s] > shared_gains[thread_idx]: - shared_gains[thread_idx] = shared_gains[thread_idx + s] - shared_bins[thread_idx] = shared_bins[thread_idx + s] - shared_features[thread_idx] = shared_features[thread_idx + s] + if thread_idx < s and shared_gains[thread_idx + s] > shared_gains[thread_idx]: + shared_gains[thread_idx] = shared_gains[thread_idx + s] + shared_bins[thread_idx] = shared_bins[thread_idx + s] + shared_features[thread_idx] = shared_features[thread_idx + s] cuda.syncthreads() s //= 2 @@ -3218,10 +3200,9 @@ def _find_symmetric_split_kernel( # Parallel reduction to find max gain within this feature stride = 128 while stride > 0: - if tid < stride: - if shared_gains[tid + stride] > shared_gains[tid]: - shared_gains[tid] = shared_gains[tid + stride] - shared_thresholds[tid] = shared_thresholds[tid + stride] + if tid < stride and shared_gains[tid + stride] > shared_gains[tid]: + shared_gains[tid] = shared_gains[tid + stride] + shared_thresholds[tid] = shared_thresholds[tid + stride] cuda.syncthreads() stride //= 2 @@ -3418,7 
+3399,7 @@ def build_tree_symmetric_gpu_native( # Convert params to float32 reg_lambda_f32 = np.float32(reg_lambda) min_child_weight_f32 = np.float32(min_child_weight) - min_gain_f32 = np.float32(min_gain) + np.float32(min_gain) # Initialize GPU arrays (using module-level kernel to avoid JIT overhead) init_blocks = max(sample_blocks, leaf_blocks, 1) diff --git a/src/openboost/_batch.py b/src/openboost/_batch.py index a77eccb..1678d67 100644 --- a/src/openboost/_batch.py +++ b/src/openboost/_batch.py @@ -6,8 +6,9 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np diff --git a/src/openboost/_boosting.py b/src/openboost/_boosting.py index 3644352..49c8ac9 100644 --- a/src/openboost/_boosting.py +++ b/src/openboost/_boosting.py @@ -4,10 +4,10 @@ from numba import cuda from ._array import _quantile_bin -from ._tree import Tree from ._histogram import build_histograms -from ._split import find_best_splits, compute_leaf_values -from ._kernels import update_sample_nodes_kernel, predict_kernel +from ._kernels import predict_kernel, update_sample_nodes_kernel +from ._split import compute_leaf_values, find_best_splits +from ._tree import Tree class GradientBoosting: diff --git a/src/openboost/_callbacks.py b/src/openboost/_callbacks.py index e752ac6..77f6adb 100644 --- a/src/openboost/_callbacks.py +++ b/src/openboost/_callbacks.py @@ -27,10 +27,8 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any -import numpy as np - if TYPE_CHECKING: - from numpy.typing import NDArray + pass @dataclass @@ -56,7 +54,7 @@ class TrainingState: extra: dict = field(default_factory=dict) -class Callback(ABC): +class Callback(ABC): # noqa: B024 """Base class for training callbacks. Subclass this to create custom callbacks for training hooks. 
@@ -77,22 +75,22 @@ class Callback(ABC): >>> plt.plot(tracker.grad_norms) """ - def on_train_begin(self, state: TrainingState) -> None: + def on_train_begin(self, state: TrainingState) -> None: # noqa: B027 """Called at the start of training. - + Args: state: Current training state. """ pass - - def on_round_begin(self, state: TrainingState) -> None: + + def on_round_begin(self, state: TrainingState) -> None: # noqa: B027 """Called at the start of each boosting round. - + Args: state: Current training state. """ pass - + def on_round_end(self, state: TrainingState) -> bool: """Called at the end of each boosting round. @@ -104,9 +102,9 @@ def on_round_end(self, state: TrainingState) -> bool: """ return True - def on_train_end(self, state: TrainingState) -> None: + def on_train_end(self, state: TrainingState) -> None: # noqa: B027 """Called at the end of training. - + Args: state: Current training state. """ diff --git a/src/openboost/_core/__init__.py b/src/openboost/_core/__init__.py index 23197a9..4faee93 100644 --- a/src/openboost/_core/__init__.py +++ b/src/openboost/_core/__init__.py @@ -6,50 +6,47 @@ - fit_tree: main entry point for building trees """ +from ._growth import ( + GrowthConfig, + GrowthStrategy, + LeafValues, + LeafWiseGrowth, + LevelWiseGrowth, + ScalarLeaves, + SymmetricGrowth, + TreeStructure, + VectorLeaves, + get_growth_strategy, +) +from ._histogram import build_histogram +from ._predict import predict_ensemble from ._primitives import ( NodeHistogram, NodeSplit, build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, compute_leaf_values, - init_sample_node_ids, - get_nodes_at_depth, + find_node_splits, get_children, + get_nodes_at_depth, get_parent, + init_sample_node_ids, + partition_samples, + subtract_histogram, ) - -from ._growth import ( - GrowthConfig, - GrowthStrategy, - TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, - get_growth_strategy, - LeafValues, - ScalarLeaves, - 
VectorLeaves, -) - +from ._split import SplitInfo, compute_leaf_value, find_best_split from ._tree import ( - fit_tree, - fit_trees_batch, + SymmetricTree, Tree, TreeNode, - SymmetricTree, + fit_tree, + fit_tree_gpu_native, fit_tree_symmetric, fit_tree_symmetric_gpu_native, - fit_tree_gpu_native, - predict_tree, + fit_trees_batch, predict_symmetric_tree, + predict_tree, ) -from ._histogram import build_histogram, subtract_histogram as hist_subtract -from ._split import find_best_split, compute_leaf_value, SplitInfo -from ._predict import predict_ensemble - __all__ = [ # Primitives "NodeHistogram", diff --git a/src/openboost/_core/_growth.py b/src/openboost/_core/_growth.py index b45cf67..d126d89 100644 --- a/src/openboost/_core/_growth.py +++ b/src/openboost/_core/_growth.py @@ -20,19 +20,18 @@ import numpy as np -from .._backends import is_cuda from .._array import MISSING_BIN +from .._backends import is_cuda from ._primitives import ( NodeHistogram, NodeSplit, build_node_histograms, - subtract_histogram, - find_node_splits, - partition_samples, compute_leaf_values, - init_sample_node_ids, + find_node_splits, get_nodes_at_depth, - get_children, + init_sample_node_ids, + partition_samples, + subtract_histogram, ) if TYPE_CHECKING: @@ -94,7 +93,7 @@ def values(self) -> NDArray: return self._values @classmethod - def zeros(cls, n_nodes: int) -> "ScalarLeaves": + def zeros(cls, n_nodes: int) -> ScalarLeaves: """Create zero-initialized scalar leaves.""" return cls(_values=np.zeros(n_nodes, dtype=np.float32)) @@ -127,7 +126,7 @@ def values(self) -> NDArray: return self._values @classmethod - def zeros(cls, n_nodes: int, n_outputs: int) -> "VectorLeaves": + def zeros(cls, n_nodes: int, n_outputs: int) -> VectorLeaves: """Create zero-initialized vector leaves.""" return cls( _values=np.zeros((n_nodes, n_outputs), dtype=np.float32), @@ -277,10 +276,7 @@ def __call__(self, X) -> NDArray: """ # Handle BinnedArray from .._array import BinnedArray - if isinstance(X, 
BinnedArray): - binned = X.data - else: - binned = X + binned = X.data if isinstance(X, BinnedArray) else X return self.predict(binned) def _predict_standard(self, binned: NDArray) -> NDArray: @@ -324,10 +320,7 @@ def _predict_standard_cpu(self, binned: NDArray) -> NDArray: # Check bitmask membership: bit[bin_value] == 1 means go left bitset = self.cat_bitsets[node] goes_left = (bitset >> bin_value) & 1 - if goes_left: - node = self.left_children[node] - else: - node = self.right_children[node] + node = self.left_children[node] if goes_left else self.right_children[node] # Standard ordinal split elif bin_value <= threshold: node = self.left_children[node] diff --git a/src/openboost/_core/_histogram.py b/src/openboost/_core/_histogram.py index 6e28719..135abf3 100644 --- a/src/openboost/_core/_histogram.py +++ b/src/openboost/_core/_histogram.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from numpy.typing import NDArray + from .._array import BinnedArray diff --git a/src/openboost/_core/_predict.py b/src/openboost/_core/_predict.py index 9e3d8b0..3f8f577 100644 --- a/src/openboost/_core/_predict.py +++ b/src/openboost/_core/_predict.py @@ -6,6 +6,7 @@ from __future__ import annotations +import contextlib from typing import TYPE_CHECKING import numpy as np @@ -14,9 +15,10 @@ from .._backends import is_cuda if TYPE_CHECKING: - from ._tree import Tree from numpy.typing import NDArray + from ._tree import Tree + def predict_ensemble( trees: list[Tree], @@ -203,10 +205,7 @@ def kernel(X_binned, node_features, node_thresholds, node_left, node_right, while node_features[node] >= 0: # Not a leaf feat = node_features[node] val = X_binned[feat, idx] # Feature-major layout - if val <= node_thresholds[node]: - node = node_left[node] - else: - node = node_right[node] + node = node_left[node] if val <= node_thresholds[node] else node_right[node] # Add leaf value to prediction pred[idx] += learning_rate * node_values[node] @@ -217,8 +216,6 @@ def kernel(X_binned, node_features, 
node_thresholds, node_left, node_right, # Initialize kernel at module load if CUDA available if is_cuda(): - try: + with contextlib.suppress(Exception): _predict_tree_add_kernel = _get_predict_tree_add_kernel() - except Exception: - pass diff --git a/src/openboost/_core/_primitives.py b/src/openboost/_core/_primitives.py index db7e8af..32fc1bf 100644 --- a/src/openboost/_core/_primitives.py +++ b/src/openboost/_core/_primitives.py @@ -25,8 +25,8 @@ import numpy as np -from .._backends import is_cuda from .._array import MISSING_BIN +from .._backends import is_cuda from ._split import SplitInfo if TYPE_CHECKING: @@ -174,12 +174,6 @@ def _build_node_histograms_gpu( Uses the optimized shared memory histogram kernel from Phase 6.3. """ - from numba import cuda - import math - from .._backends._cuda import ( - _build_histogram_shared_kernel, - _zero_level_histograms_kernel, - ) n_features, n_samples = binned.shape @@ -216,8 +210,10 @@ def _build_node_histograms_gpu_contiguous( n_nodes: int, ) -> dict[int, NodeHistogram]: """GPU histogram building for contiguous node range.""" - from numba import cuda import math + + from numba import cuda + from .._backends._cuda import ( _build_histogram_shared_kernel, _zero_level_histograms_kernel, @@ -267,7 +263,7 @@ def _build_node_histograms_gpu_contiguous( sample_node_ids_cpu = sample_node_ids.copy_to_host() result = {} - for i, node_id in enumerate(range(level_start, level_end)): + for _i, node_id in enumerate(range(level_start, level_end)): node_hist = histograms_cpu[node_id] hist_grad = node_hist[:, :, 0] # (n_features, 256) hist_hess = node_hist[:, :, 1] @@ -297,6 +293,7 @@ def _build_node_histograms_gpu_sparse( ) -> dict[int, NodeHistogram]: """GPU histogram building for non-contiguous nodes (leaf-wise).""" from numba import cuda + from .._backends._cuda import build_histogram_cuda, gather_cuda # For sparse node sets, build each node separately @@ -426,7 +423,11 @@ def find_node_splits( >>> for node_id, split in 
splits.items(): ... print(f"Node {node_id}: split on feature {split.split.feature}") """ - from ._split import find_best_split, find_best_split_with_missing, find_best_split_with_categorical + from ._split import ( + find_best_split, + find_best_split_with_categorical, + find_best_split_with_missing, + ) result = {} @@ -596,13 +597,14 @@ def _partition_samples_gpu( sample_node_ids, splits: dict[int, NodeSplit], missing_go_left: NDArray | None = None, -) -> "DeviceNDArray": +) -> DeviceNDArray: """GPU implementation of sample partitioning. Phase 14: Handles missing values (bin 255) using learned direction. """ - from numba import cuda import math + + from numba import cuda n_samples = sample_node_ids.shape[0] @@ -660,7 +662,7 @@ def _init_partition_kernel_with_missing(): if _partition_kernel_with_missing is not None: return - from numba import cuda, int32, uint8 + from numba import cuda, int32 @cuda.jit def kernel(binned, old_node_ids, new_node_ids, diff --git a/src/openboost/_core/_split.py b/src/openboost/_core/_split.py index 2b0bff6..029e0cd 100644 --- a/src/openboost/_core/_split.py +++ b/src/openboost/_core/_split.py @@ -11,7 +11,6 @@ import numpy as np from .._backends import is_cuda -from .._array import MISSING_BIN if TYPE_CHECKING: from numpy.typing import NDArray @@ -184,7 +183,6 @@ def find_best_split_with_missing( total_hess = float(_sum_histogram(hist_hess)) # Check if any feature has missing values - n_features = hist_grad.shape[0] any_missing = has_missing is not None and np.any(has_missing) # If no missing values, use standard split finding diff --git a/src/openboost/_core/_tree.py b/src/openboost/_core/_tree.py index 8588062..1c2f607 100644 --- a/src/openboost/_core/_tree.py +++ b/src/openboost/_core/_tree.py @@ -17,15 +17,12 @@ GrowthConfig, GrowthStrategy, TreeStructure, - LevelWiseGrowth, - LeafWiseGrowth, - SymmetricGrowth, get_growth_strategy, ) # Legacy imports for backward compatibility with internal code from
._histogram import build_histogram, subtract_histogram -from ._split import SplitInfo, compute_leaf_value, find_best_split +from ._split import compute_leaf_value, find_best_split if TYPE_CHECKING: from numpy.typing import NDArray @@ -74,11 +71,11 @@ class Tree: _right: NDArray | None = field(default=None, repr=False) # GPU arrays for fast GPU training (Phase 5.1) - _features_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _thresholds_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _values_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _left_gpu: "DeviceNDArray | None" = field(default=None, repr=False) - _right_gpu: "DeviceNDArray | None" = field(default=None, repr=False) + _features_gpu: DeviceNDArray | None = field(default=None, repr=False) + _thresholds_gpu: DeviceNDArray | None = field(default=None, repr=False) + _values_gpu: DeviceNDArray | None = field(default=None, repr=False) + _left_gpu: DeviceNDArray | None = field(default=None, repr=False) + _right_gpu: DeviceNDArray | None = field(default=None, repr=False) @property def on_gpu(self) -> bool: @@ -395,10 +392,7 @@ def fit_tree( subsample_mask = None # Get growth strategy - if isinstance(growth, str): - strategy = get_growth_strategy(growth) - else: - strategy = growth + strategy = get_growth_strategy(growth) if isinstance(growth, str) else growth # Build config config = GrowthConfig( @@ -835,7 +829,7 @@ def fit_trees_batch( >>> >>> # all_trees[0] contains trees for first config, etc. 
""" - from .._models._batch import ConfigBatch, BatchTrainingState + from .._models._batch import BatchTrainingState, ConfigBatch if not isinstance(configs, ConfigBatch): raise TypeError(f"configs must be ConfigBatch, got {type(configs)}") @@ -854,7 +848,6 @@ def fit_trees_batch( hess = as_numba_array(hess) n_configs = configs.n_configs - n_rounds = configs.n_rounds # Initialize training state state = BatchTrainingState.create(n_configs, n_samples) @@ -892,7 +885,7 @@ def _fit_trees_batch_cpu( for round_idx in range(n_rounds): # Compute gradients from current predictions # Note: User provides initial grad/hess, subsequent rounds recompute - if round_idx > 0: + if round_idx > 0: # noqa: SIM108 # Recompute MSE gradients from current predictions. # Initial grad = 2*(0 - y) = -2y, so for MSE: # grad_new = 2*(pred - y) = 2*pred + initial_grad @@ -934,13 +927,7 @@ def _fit_trees_batch_cuda( ) -> list[list[Tree]]: """CUDA batch training using fused kernels.""" from numba import cuda - from .._backends._cuda import ( - build_histogram_batch_cuda, - find_best_split_batch_cuda, - compute_split_masks_batch_cuda, - reduce_sum_cuda, - to_device, - ) + n_configs = configs.n_configs n_rounds = configs.n_rounds @@ -1222,10 +1209,7 @@ def predict_symmetric_tree(tree: SymmetricTree, X: BinnedArray | NDArray) -> NDA Prediction is just bit operations - very fast! """ - if isinstance(X, BinnedArray): - binned = X.data - else: - binned = X + binned = X.data if isinstance(X, BinnedArray) else X use_gpu = is_cuda() and hasattr(binned, '__cuda_array_interface__') diff --git a/src/openboost/_distributed/__init__.py b/src/openboost/_distributed/__init__.py index 38f1555..0901ef6 100644 --- a/src/openboost/_distributed/__init__.py +++ b/src/openboost/_distributed/__init__.py @@ -4,9 +4,9 @@ Phase 18: Adds multi-GPU support via Ray actors. 
""" -from typing import Protocol, Any +from typing import Any, Protocol + from numpy.typing import NDArray -import numpy as np class DistributedContext(Protocol): @@ -29,13 +29,12 @@ def partition_data(self, X: NDArray, y: NDArray) -> tuple[NDArray, NDArray]: # Phase 18: Multi-GPU support from ._multigpu import ( - GPUWorkerBase, GPUWorker, + GPUWorkerBase, MultiGPUContext, fit_tree_multigpu, ) - __all__ = [ "DistributedContext", # Phase 18: Multi-GPU diff --git a/src/openboost/_distributed/_multigpu.py b/src/openboost/_distributed/_multigpu.py index bbe3dc0..d8ed219 100644 --- a/src/openboost/_distributed/_multigpu.py +++ b/src/openboost/_distributed/_multigpu.py @@ -22,7 +22,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable, List, Optional +from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray @@ -59,7 +59,7 @@ def __init__( X_shard: NDArray, y_shard: NDArray, n_bins: int, - bin_edges: Optional[NDArray] = None, + bin_edges: NDArray | None = None, ): """Initialize worker with data shard on assigned GPU. @@ -164,7 +164,7 @@ def build_histogram( self, grad: NDArray, hess: NDArray, - node_ids: Optional[List[int]] = None, + node_ids: list[int] | None = None, ) -> tuple[NDArray, NDArray]: """Build local histogram for this shard. 
@@ -180,7 +180,6 @@ def build_histogram( or dict mapping node_id to histogram """ from .._core._histogram import build_histogram - from .._array import as_numba_array # Get binned data binned = self.X_binned.data @@ -240,7 +239,7 @@ def get_n_samples(self) -> int: """Get number of samples in this shard.""" return self.n_samples - def get_bin_edges(self) -> Optional[NDArray]: + def get_bin_edges(self) -> NDArray | None: """Get bin edges used by this worker (for consistent binning).""" if hasattr(self.X_binned, 'bin_edges'): return self.X_binned.bin_edges @@ -248,10 +247,7 @@ def get_bin_edges(self) -> Optional[NDArray]: # Create Ray remote version if Ray is available -if ray is not None: - GPUWorker = ray.remote(num_gpus=1)(GPUWorkerBase) -else: - GPUWorker = GPUWorkerBase +GPUWorker = ray.remote(num_gpus=1)(GPUWorkerBase) if ray is not None else GPUWorkerBase # ============================================================================= @@ -291,11 +287,11 @@ class MultiGPUContext: """ n_gpus: int = None - devices: List[int] = None - workers: List[Any] = None + devices: list[int] = None + workers: list[Any] = None n_features: int = None n_samples: int = None - shard_sizes: List[int] = None + shard_sizes: list[int] = None bin_edges: NDArray = None def __post_init__(self): @@ -334,7 +330,7 @@ def setup( X: NDArray, y: NDArray, n_bins: int = 256, - bin_edges: Optional[NDArray] = None, + bin_edges: NDArray | None = None, ): """Shard data and create GPU workers. @@ -377,7 +373,7 @@ def setup( # Create workers self.workers = [] - for gpu_id, shard_indices in zip(self.devices, indices): + for gpu_id, shard_indices in zip(self.devices, indices, strict=False): X_shard = X[shard_indices] y_shard = y[shard_indices] @@ -393,7 +389,7 @@ def setup( def compute_all_gradients( self, loss_fn: LossFunction, - ) -> List[tuple[NDArray, NDArray]]: + ) -> list[tuple[NDArray, NDArray]]: """Compute gradients on all workers in parallel. 
Args: @@ -410,8 +406,8 @@ def compute_all_gradients( def build_all_histograms( self, - grads_hess: List[tuple[NDArray, NDArray]], - ) -> List[tuple[NDArray, NDArray]]: + grads_hess: list[tuple[NDArray, NDArray]], + ) -> list[tuple[NDArray, NDArray]]: """Build local histograms on all workers in parallel. Args: @@ -422,13 +418,13 @@ def build_all_histograms( """ hist_refs = [ worker.build_histogram.remote(grad, hess) - for worker, (grad, hess) in zip(self.workers, grads_hess) + for worker, (grad, hess) in zip(self.workers, grads_hess, strict=False) ] return ray.get(hist_refs) def aggregate_histograms( self, - local_histograms: List[tuple[NDArray, NDArray]], + local_histograms: list[tuple[NDArray, NDArray]], ) -> tuple[NDArray, NDArray]: """Sum histograms from all workers (AllReduce). @@ -492,7 +488,7 @@ def shutdown(self): def fit_tree_multigpu( ctx: MultiGPUContext, - grads_hess: List[tuple[NDArray, NDArray]], + grads_hess: list[tuple[NDArray, NDArray]], *, max_depth: int = 6, min_child_weight: float = 1.0, @@ -522,9 +518,6 @@ def fit_tree_multigpu( Returns: Fitted TreeStructure """ - from .._core._tree import fit_tree - from .._array import BinnedArray - import openboost as ob # Build local histograms on each GPU local_histograms = ctx.build_all_histograms(grads_hess) @@ -541,7 +534,7 @@ def fit_tree_multigpu( total_hess = np.zeros(ctx.n_samples, dtype=np.float32) offset = 0 - for (grad, hess), size in zip(grads_hess, ctx.shard_sizes): + for (grad, hess), size in zip(grads_hess, ctx.shard_sizes, strict=False): total_grad[offset:offset + size] = grad total_hess[offset:offset + size] = hess offset += size @@ -549,7 +542,7 @@ def fit_tree_multigpu( # Create a dummy BinnedArray for tree fitting # In practice, we'd want to do distributed tree building # For now, collect data to driver and fit there - all_preds = ray.get([w.get_predictions.remote() for w in ctx.workers]) + ray.get([w.get_predictions.remote() for w in ctx.workers]) # Use the global histogram for tree 
building # This is where we'd integrate with fit_tree_from_histogram @@ -558,11 +551,10 @@ # Get binned data from first worker for structure # NOTE: This is a simplification - full implementation would do # distributed tree building with sample partitioning - first_worker_bin_edges = ray.get(ctx.workers[0].get_bin_edges.remote()) + ray.get(ctx.workers[0].get_bin_edges.remote()) # Build tree using growth strategy with global histogram - from .._core._growth import LevelWiseGrowth, GrowthConfig - from .._core._primitives import NodeHistogram + from .._core._growth import GrowthConfig config = GrowthConfig( max_depth=max_depth, @@ -604,9 +596,8 @@ def _build_tree_from_global_histogram( TreeStructure """ from .._core._growth import TreeStructure - from .._core._split import find_best_split, compute_leaf_value + from .._core._split import compute_leaf_value, find_best_split - n_bins = hist_grad.shape[1] max_nodes = 2**(config.max_depth + 1) - 1 # Initialize tree arrays @@ -695,7 +686,7 @@ def _build_tree_from_global_histogram( left_hist_grad = np.zeros_like(h_grad) left_hist_hess = np.zeros_like(h_hess) - for f in range(n_features): + for _f in range(n_features): # For the split feature, partition bins at the threshold # For all other features, we need the full histogram # conditioned on left/right. With only histogram information @@ -716,8 +707,6 @@ # Left child: bins <= threshold for the split feature left_sf_grad = float(np.sum(h_grad[sf, :t + 1])) left_sf_hess = float(np.sum(h_hess[sf, :t + 1])) - right_sf_grad = s_grad - left_sf_grad - right_sf_hess = s_hess - left_sf_hess # For each feature, split histogram bins proportionally based on # the split feature's left/right ratio.
diff --git a/src/openboost/_distributed/_ray.py b/src/openboost/_distributed/_ray.py index a2c7034..d06473a 100644 --- a/src/openboost/_distributed/_ray.py +++ b/src/openboost/_distributed/_ray.py @@ -3,7 +3,8 @@ Phase 12: Implements distributed training using Ray for multi-GPU/multi-node. """ -from typing import Any, List, Dict +from typing import Any + import numpy as np from numpy.typing import NDArray @@ -42,7 +43,7 @@ def __init__(self, X_shard: NDArray, y_shard: NDArray, n_bins: int, self.pred = np.zeros(self.n_samples, dtype=np.float32) def compute_histograms(self, grad: NDArray, hess: NDArray, - node_ids: List[int]) -> Dict[int, Any]: + node_ids: list[int]) -> dict[int, Any]: """Compute local histograms for this shard.""" histograms = build_node_histograms( self.X_binned.data if hasattr(self.X_binned, 'data') else self.X_binned, @@ -120,10 +121,10 @@ def setup(self, X: NDArray, y: NDArray, n_bins: int): self.workers = [ RayWorker.remote(s, ys, n_bins, bin_edges=global_bin_edges) - for s, ys in zip(shards, y_shards) + for s, ys in zip(shards, y_shards, strict=False) ] - def allreduce_histograms(self, local_hists_refs: List[Any]) -> Dict[int, Any]: + def allreduce_histograms(self, local_hists_refs: list[Any]) -> dict[int, Any]: """Sum histograms from all workers.""" local_hists = ray.get(local_hists_refs) diff --git a/src/openboost/_distributed/_tree.py b/src/openboost/_distributed/_tree.py index 3acb8af..bdc2562 100644 --- a/src/openboost/_distributed/_tree.py +++ b/src/openboost/_distributed/_tree.py @@ -3,7 +3,8 @@ Phase 12: Implements distributed tree building using histogram aggregation. 
""" -from typing import Any, List, Optional +from typing import Any + import numpy as np try: @@ -11,20 +12,15 @@ except ImportError: ray = None -from openboost._core._growth import TreeStructure, GrowthConfig -from openboost._core._primitives import ( - find_node_splits, - compute_leaf_values, - NodeHistogram, - NodeSplit -) +from openboost._core._growth import TreeStructure +from openboost._core._primitives import find_node_splits def fit_tree_distributed( ctx: Any, # DistributedContext - workers: List[Any], - grad_refs: List[Any], # Ray object refs - hess_refs: List[Any], + workers: list[Any], + grad_refs: list[Any], # Ray object refs + hess_refs: list[Any], *, max_depth: int = 6, min_child_weight: float = 1.0, @@ -42,7 +38,7 @@ def fit_tree_distributed( ) # 1. Initialize - sample_node_ids_refs = [w.init_node_ids.remote() for w in workers] + [w.init_node_ids.remote() for w in workers] n_features = get_worker_n_features(workers[0]) @@ -67,7 +63,7 @@ def fit_tree_distributed( # 3. Compute local histograms local_hists_refs = [ w.compute_histograms.remote(g, h, active_nodes) - for w, g, h in zip(workers, grad_refs, hess_refs) + for w, g, h in zip(workers, grad_refs, hess_refs, strict=False) ] # 4. Aggregate histograms @@ -103,14 +99,13 @@ def fit_tree_distributed( # 8. 
Compute leaf values leaf_nodes = [] for i in range(max_nodes): - if left_children[i] == -1: - if i == 0 or features[(i-1)//2] >= 0: - leaf_nodes.append(i) + if left_children[i] == -1 and (i == 0 or features[(i-1)//2] >= 0): + leaf_nodes.append(i) if leaf_nodes: local_hists_refs = [ w.compute_histograms.remote(g, h, leaf_nodes) - for w, g, h in zip(workers, grad_refs, hess_refs) + for w, g, h in zip(workers, grad_refs, hess_refs, strict=False) ] leaf_histograms = ctx.allreduce_histograms(local_hists_refs) diff --git a/src/openboost/_distributions.py b/src/openboost/_distributions.py index 4405bec..e24bb86 100644 --- a/src/openboost/_distributions.py +++ b/src/openboost/_distributions.py @@ -21,16 +21,12 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Tuple import numpy as np from numpy.typing import NDArray -from ._backends import is_cuda - - # Type alias for gradient/hessian tuple -GradHess = Tuple[NDArray, NDArray] +GradHess = tuple[NDArray, NDArray] @dataclass @@ -42,7 +38,7 @@ class DistributionOutput: distribution: The Distribution instance used """ params: dict[str, NDArray] - distribution: "Distribution" + distribution: Distribution def mean(self) -> NDArray: """Expected value E[Y|X].""" @@ -60,7 +56,7 @@ def quantile(self, q: float) -> NDArray: """q-th quantile (0 < q < 1).""" return self.distribution.quantile(self.params, q) - def interval(self, alpha: float = 0.1) -> Tuple[NDArray, NDArray]: + def interval(self, alpha: float = 0.1) -> tuple[NDArray, NDArray]: """(1-alpha) prediction interval. 
Args: @@ -1272,7 +1268,7 @@ def init_params(self, y: NDArray) -> dict[str, float]: var_y = float(np.var(y_clip)) + 1e-6 # Estimate r from method of moments - if var_y > mu_init: + if var_y > mu_init: # noqa: SIM108 r_init = mu_init ** 2 / (var_y - mu_init) else: r_init = 10.0 # Default if not overdispersed @@ -1570,7 +1566,6 @@ def _numerical_gradient( """ results = {} eps = self._eps - n = len(y) # Compute center NLL once nll_center = self._nll_fn(y, params) @@ -1617,7 +1612,7 @@ def _jax_gradient( # Define loss for a single sample def single_nll(param_values, y_single): - params_dict = {name: jnp.array([val]) for name, val in zip(self._param_names, param_values)} + params_dict = {name: jnp.array([val]) for name, val in zip(self._param_names, param_values, strict=False)} return self._nll_fn(jnp.array([y_single]), params_dict)[0] # Create grad and hessian functions once (cached pattern) @@ -1662,7 +1657,7 @@ def single_nll(param_values, y_single): results[name][0][i] = float(g[j]) results[name][1][i] = max(float(h[j, j]), 1e-6) except Exception: - for j, name in enumerate(self._param_names): + for _j, name in enumerate(self._param_names): results[name][0][i] = 0.0 results[name][1][i] = 1.0 diff --git a/src/openboost/_histogram.py b/src/openboost/_histogram.py index ca4c99b..6cb5daf 100644 --- a/src/openboost/_histogram.py +++ b/src/openboost/_histogram.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from ._kernels import histogram_kernel, HIST_BLOCK_SIZE, MAX_BINS +from ._kernels import HIST_BLOCK_SIZE, MAX_BINS, histogram_kernel def build_histograms( diff --git a/src/openboost/_importance.py b/src/openboost/_importance.py index 47bdd9e..39c4e68 100644 --- a/src/openboost/_importance.py +++ b/src/openboost/_importance.py @@ -21,9 +21,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable - import warnings +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable import numpy as np
@@ -237,7 +236,7 @@ def get_feature_importance_dict( feature_names = [f"feature_{i}" for i in range(len(importances))] # Create dict and sort by importance - importance_dict = dict(zip(feature_names, importances)) + importance_dict = dict(zip(feature_names, importances, strict=False)) sorted_dict = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)) # Limit to top N if requested @@ -274,8 +273,8 @@ def plot_feature_importances( """ try: import matplotlib.pyplot as plt - except ImportError: - raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib") + except ImportError as err: + raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib") from err importances = compute_feature_importances(model, importance_type, normalize=True) diff --git a/src/openboost/_kernels.py b/src/openboost/_kernels.py index 0e3c648..887cc0b 100644 --- a/src/openboost/_kernels.py +++ b/src/openboost/_kernels.py @@ -118,11 +118,10 @@ def find_best_split_kernel( # Tree reduction to find best s = cuda.blockDim.x // 2 while s > 0: - if tid < s: - if shared_gain[tid + s] > shared_gain[tid]: - shared_gain[tid] = shared_gain[tid + s] - shared_feature[tid] = shared_feature[tid + s] - shared_bin[tid] = shared_bin[tid + s] + if tid < s and shared_gain[tid + s] > shared_gain[tid]: + shared_gain[tid] = shared_gain[tid + s] + shared_feature[tid] = shared_feature[tid + s] + shared_bin[tid] = shared_bin[tid + s] cuda.syncthreads() s //= 2 @@ -185,10 +184,7 @@ def predict_kernel( node = 0 while not tree_is_leaf[node]: feature = tree_features[node] - if X_binned[feature, idx] <= tree_bins[node]: - node = 2 * node + 1 - else: - node = 2 * node + 2 + node = 2 * node + 1 if X_binned[feature, idx] <= tree_bins[node] else 2 * node + 2 predictions[idx] += learning_rate * tree_values[node] diff --git a/src/openboost/_loss.py b/src/openboost/_loss.py index 790ce59..e649442 100644 --- a/src/openboost/_loss.py +++ 
b/src/openboost/_loss.py @@ -7,7 +7,8 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Callable +from collections.abc import Callable +from typing import TYPE_CHECKING import numpy as np @@ -288,8 +289,9 @@ def _logloss_gradient_gpu(pred, y): def _get_logloss_kernel(): """Lazily compile LogLoss gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -669,8 +671,9 @@ def _poisson_gradient_gpu(pred, y): def _get_poisson_kernel(): """Lazily compile Poisson gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -759,8 +762,9 @@ def _gamma_gradient_gpu(pred, y): def _get_gamma_kernel(): """Lazily compile Gamma gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n): @@ -863,8 +867,9 @@ def _tweedie_gradient_gpu(pred, y, rho: float = 1.5): def _get_tweedie_kernel(): """Lazily compile Tweedie gradient kernel.""" - from numba import cuda import math + + from numba import cuda @cuda.jit def kernel(pred, y, grad, hess, n, rho): @@ -876,7 +881,7 @@ def kernel(pred, y, grad, hess, n, rho): elif p < -20: p = -20.0 - mu = math.exp(p) + math.exp(p) # mu^(2-rho) and mu^(1-rho) via exp mu_2_rho = math.exp(p * (2.0 - rho)) @@ -955,10 +960,7 @@ def _softmax_gradient_gpu(pred, y, n_classes: int): else: pred_cpu = np.asarray(pred, dtype=np.float32) - if hasattr(y, 'copy_to_host'): - y_cpu = y.copy_to_host() - else: - y_cpu = np.asarray(y) + y_cpu = y.copy_to_host() if hasattr(y, 'copy_to_host') else np.asarray(y) return _softmax_gradient_cpu(pred_cpu, y_cpu, n_classes) diff --git a/src/openboost/_models/__init__.py b/src/openboost/_models/__init__.py index 2618145..b145847 100644 --- a/src/openboost/_models/__init__.py +++ b/src/openboost/_models/__init__.py @@ -8,42 +8,42 @@ Phase 16: Renamed NGBoost -> NaturalBoost for 
clarity. """ +from ._batch import BatchTrainingState, ConfigBatch from ._boosting import GradientBoosting, MultiClassGradientBoosting from ._dart import DART -from ._gam import OpenBoostGAM -from ._batch import ConfigBatch, BatchTrainingState -from ._sklearn import ( - OpenBoostRegressor, - OpenBoostClassifier, - OpenBoostDistributionalRegressor, - OpenBoostLinearLeafRegressor, -) # Phase 15/16: Distributional GBDT and NaturalBoost from ._distributional import ( DistributionalGBDT, # Primary names (Phase 16) NaturalBoost, - NaturalBoostNormal, - NaturalBoostLogNormal, NaturalBoostGamma, + NaturalBoostLogNormal, + NaturalBoostNegBin, + NaturalBoostNormal, NaturalBoostPoisson, NaturalBoostStudentT, NaturalBoostTweedie, - NaturalBoostNegBin, # Backward compatibility aliases NGBoost, - NGBoostNormal, - NGBoostLogNormal, NGBoostGamma, + NGBoostLogNormal, + NGBoostNegBin, + NGBoostNormal, NGBoostPoisson, NGBoostStudentT, NGBoostTweedie, - NGBoostNegBin, ) +from ._gam import OpenBoostGAM # Phase 15: Linear Leaf GBDT from ._linear_leaf import LinearLeafGBDT, LinearLeafTree +from ._sklearn import ( + OpenBoostClassifier, + OpenBoostDistributionalRegressor, + OpenBoostLinearLeafRegressor, + OpenBoostRegressor, +) __all__ = [ # Standard GBDT diff --git a/src/openboost/_models/_batch.py b/src/openboost/_models/_batch.py index 5417c85..604f06c 100644 --- a/src/openboost/_models/_batch.py +++ b/src/openboost/_models/_batch.py @@ -6,8 +6,9 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING import numpy as np diff --git a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index a9a1e7a..1f263f4 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -15,34 +15,29 @@ from __future__ import annotations import os +from collections.abc import Callable from dataclasses import dataclass, 
field -from pathlib import Path -from typing import TYPE_CHECKING, Callable, Literal +from typing import TYPE_CHECKING, Literal import numpy as np from .._array import BinnedArray, array from .._backends import is_cuda -from .._loss import get_loss_function, compute_loss_value, LossFunction -from .._core._tree import fit_tree -from .._core._growth import TreeStructure from .._callbacks import Callback, CallbackManager, TrainingState +from .._core._growth import TreeStructure +from .._core._tree import fit_tree +from .._loss import LossFunction, compute_loss_value, get_loss_function +from .._persistence import PersistenceMixin from .._sampling import ( - SamplingStrategy, goss_sample, - random_sample, - apply_sampling, - MiniBatchIterator, - accumulate_histograms_minibatch, ) -from .._persistence import PersistenceMixin from .._validation import ( - validate_X, - validate_y, - validate_sample_weight, validate_eval_set, validate_hyperparameters, validate_predict_input, + validate_sample_weight, + validate_X, + validate_y, ) try: @@ -310,9 +305,8 @@ def _fit_distributed(self, y: NDArray, n_samples: int): ctx.setup(X_data, y, self.n_bins) - import ray - for i in range(self.n_trees): + for _i in range(self.n_trees): # Compute gradients on each worker grad_hess_refs = [ w.compute_gradients.options(num_returns=2).remote(self._loss_fn) @@ -385,7 +379,6 @@ def _fit_multigpu( ctx.setup(X_data, y, n_bins=self.n_bins) try: - import ray # Training loop for i in range(self.n_trees): @@ -408,8 +401,6 @@ def _fit_multigpu( # For proper tree building, we need the full binned data # Use a simplified approach: fit tree on driver with full histogram info - from .._core._growth import TreeStructure, GrowthConfig - from .._core._split import find_best_split, compute_leaf_value tree = self._build_tree_from_histogram( global_hist_grad, @@ -457,7 +448,7 @@ def _build_tree_from_histogram( Uses recursive histogram-based tree building similar to LightGBM. 
""" from .._core._growth import TreeStructure - from .._core._split import find_best_split, compute_leaf_value + from .._core._split import compute_leaf_value, find_best_split max_nodes = 2**(self.max_depth + 1) - 1 features = np.full(max_nodes, -1, dtype=np.int32) @@ -800,7 +791,6 @@ def _fit_cpu( # Determine sampling strategy use_goss = self.subsample_strategy == 'goss' use_random_sampling = self.subsample_strategy == 'random' and self.subsample < 1.0 - use_minibatch = self.batch_size is not None and self.batch_size < n_samples # Train trees for i in range(self.n_trees): @@ -1080,7 +1070,7 @@ class MultiClassGradientBoosting(PersistenceMixin): X_binned_: BinnedArray | None = field(default=None, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "MultiClassGradientBoosting": + def fit(self, X: NDArray, y: NDArray) -> MultiClassGradientBoosting: """Fit the multi-class gradient boosting model. Args: diff --git a/src/openboost/_models/_dart.py b/src/openboost/_models/_dart.py index eb8d33e..27741c7 100644 --- a/src/openboost/_models/_dart.py +++ b/src/openboost/_models/_dart.py @@ -20,10 +20,9 @@ import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda -from .._core._growth import TreeStructure, GrowthConfig -from .._loss import get_loss_function, LossFunction +from .._core._growth import TreeStructure from .._core._tree import fit_tree +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -86,7 +85,7 @@ class DART(PersistenceMixin): _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) _rng: np.random.Generator | None = field(default=None, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "DART": + def fit(self, X: NDArray, y: NDArray) -> DART: """Fit the DART model. 
Args: @@ -125,7 +124,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DART": pred = np.full(n_samples, self.base_score_, dtype=np.float32) # Train trees - for i in range(self.n_trees): + for _i in range(self.n_trees): # Decide whether to apply dropout this round apply_dropout = ( len(self.trees_) > 0 and @@ -176,7 +175,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DART": base = getattr(self, 'base_score_', np.float32(0.0)) pred = np.full(n_samples_tmp, base, dtype=np.float32) excluded_set = set(dropped_indices) - for t_i, (t, w) in enumerate(zip(self.trees_, self.tree_weights_)): + for t_i, (t, w) in enumerate(zip(self.trees_, self.tree_weights_, strict=False)): t_pred = t(self.X_binned_) if hasattr(t_pred, 'copy_to_host'): t_pred = t_pred.copy_to_host() @@ -231,7 +230,7 @@ def _predict_without_trees( excluded_set = set(excluded_indices) - for i, (tree, weight) in enumerate(zip(self.trees_, self.tree_weights_)): + for i, (tree, weight) in enumerate(zip(self.trees_, self.tree_weights_, strict=False)): if i in excluded_set: continue tree_pred = tree(X) @@ -247,7 +246,7 @@ def _predict_internal(self, X: BinnedArray) -> NDArray: base = getattr(self, 'base_score_', np.float32(0.0)) pred = np.full(n_samples, base, dtype=np.float32) - for tree, weight in zip(self.trees_, self.tree_weights_): + for tree, weight in zip(self.trees_, self.tree_weights_, strict=False): tree_pred = tree(X) if hasattr(tree_pred, 'copy_to_host'): tree_pred = tree_pred.copy_to_host() diff --git a/src/openboost/_models/_distributional.py b/src/openboost/_models/_distributional.py index 8ab7e7b..087329f 100644 --- a/src/openboost/_models/_distributional.py +++ b/src/openboost/_models/_distributional.py @@ -36,20 +36,18 @@ from __future__ import annotations from dataclasses import dataclass, field -from pathlib import Path from typing import TYPE_CHECKING, Literal import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda +from .._core._growth import TreeStructure 
+from .._core._tree import fit_tree from .._distributions import ( Distribution, DistributionOutput, get_distribution, ) -from .._core._tree import fit_tree -from .._core._growth import TreeStructure from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -118,7 +116,7 @@ class DistributionalGBDT(PersistenceMixin): _base_scores: dict[str, float] = field(default_factory=dict, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "DistributionalGBDT": + def fit(self, X: NDArray, y: NDArray) -> DistributionalGBDT: """Fit the distributional gradient boosting model. Args: @@ -159,7 +157,7 @@ def fit(self, X: NDArray, y: NDArray) -> "DistributionalGBDT": params[param_name] = self.distribution_.link(param_name, raw_preds[param_name]) # Training loop - for round_idx in range(self.n_trees): + for _round_idx in range(self.n_trees): # Update params from raw predictions (apply link functions) for param_name in self.distribution_.param_names: params[param_name] = self.distribution_.link( diff --git a/src/openboost/_models/_gam.py b/src/openboost/_models/_gam.py index 8cf56c5..a55c90a 100644 --- a/src/openboost/_models/_gam.py +++ b/src/openboost/_models/_gam.py @@ -19,7 +19,7 @@ from .._array import BinnedArray, array from .._backends import is_cuda -from .._loss import get_loss_function, LossFunction +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -67,7 +67,7 @@ class OpenBoostGAM(PersistenceMixin): X_binned_: BinnedArray | None = field(default=None, init=False, repr=False) _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "OpenBoostGAM": + def fit(self, X: NDArray, y: NDArray) -> OpenBoostGAM: """Fit the GAM model. 
Args: @@ -107,6 +107,7 @@ def fit(self, X: NDArray, y: NDArray) -> "OpenBoostGAM": def _fit_gpu(self, y: NDArray): """GPU training path - all features in parallel.""" from numba import cuda + from .._backends._cuda import build_histogram_cuda n_features = self.X_binned_.n_features @@ -287,8 +288,8 @@ def plot_shape_function(self, feature_idx: int, feature_name: str | None = None) """ try: import matplotlib.pyplot as plt - except ImportError: - raise ImportError("matplotlib required for plotting. Install with: pip install matplotlib") + except ImportError as err: + raise ImportError("matplotlib required for plotting. Install with: pip install matplotlib") from err if self.shape_values_ is None: raise RuntimeError("Model not fitted.") diff --git a/src/openboost/_models/_linear_leaf.py b/src/openboost/_models/_linear_leaf.py index 96c5b9e..5fabdfb 100644 --- a/src/openboost/_models/_linear_leaf.py +++ b/src/openboost/_models/_linear_leaf.py @@ -32,10 +32,9 @@ import numpy as np from .._array import BinnedArray, array -from .._backends import is_cuda -from .._loss import get_loss_function, LossFunction -from .._core._tree import fit_tree from .._core._growth import TreeStructure +from .._core._tree import fit_tree +from .._loss import LossFunction, get_loss_function from .._persistence import PersistenceMixin if TYPE_CHECKING: @@ -89,7 +88,7 @@ def predict(self, X: NDArray) -> NDArray: node_id = int(leaf_node_ids[sample_idx]) # Look up leaf index using integer node index as key. 
- if node_id in self.leaf_ids: + if node_id in self.leaf_ids: # noqa: SIM108 leaf_idx = self.leaf_ids[node_id] else: # Fallback: use the constant term from first leaf @@ -119,10 +118,7 @@ def _get_leaf_node_indices(self, X: NDArray) -> NDArray: # Get binned data for tree traversal from .._array import BinnedArray as BA - if isinstance(X_binned, BA): - binned = X_binned.data - else: - binned = X_binned + binned = X_binned.data if isinstance(X_binned, BA) else X_binned if hasattr(binned, 'copy_to_host'): binned = binned.copy_to_host() @@ -206,7 +202,7 @@ class LinearLeafGBDT(PersistenceMixin): _loss_fn: LossFunction | None = field(default=None, init=False, repr=False) n_features_in_: int = field(default=0, init=False, repr=False) - def fit(self, X: NDArray, y: NDArray) -> "LinearLeafGBDT": + def fit(self, X: NDArray, y: NDArray) -> LinearLeafGBDT: """Fit the linear leaf GBDT model. Args: @@ -255,7 +251,7 @@ def fit(self, X: NDArray, y: NDArray) -> "LinearLeafGBDT": self.base_score_ = np.float32(np.mean(y)) pred = np.full(n_samples, self.base_score_, dtype=np.float32) - for round_idx in range(self.n_trees): + for _round_idx in range(self.n_trees): # Compute gradients grad, hess = self._loss_fn(pred, y) grad = np.asarray(grad, dtype=np.float32) diff --git a/src/openboost/_models/_sklearn.py b/src/openboost/_models/_sklearn.py index 4d5c155..24318c3 100644 --- a/src/openboost/_models/_sklearn.py +++ b/src/openboost/_models/_sklearn.py @@ -39,9 +39,9 @@ import numpy as np try: - from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin - from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.preprocessing import LabelEncoder + from sklearn.utils.validation import check_array, check_is_fitted, check_X_y SKLEARN_AVAILABLE = True except ImportError: SKLEARN_AVAILABLE = False @@ -53,9 +53,9 @@ class RegressorMixin: class ClassifierMixin: pass -from 
._boosting import GradientBoosting, MultiClassGradientBoosting -from .._callbacks import EarlyStopping, Logger, Callback +from .._callbacks import EarlyStopping from .._importance import compute_feature_importances +from ._boosting import GradientBoosting, MultiClassGradientBoosting if TYPE_CHECKING: from numpy.typing import NDArray @@ -204,7 +204,7 @@ def fit( y: NDArray, sample_weight: NDArray | None = None, eval_set: list[tuple[NDArray, NDArray]] | None = None, - ) -> "OpenBoostRegressor": + ) -> OpenBoostRegressor: """Fit the gradient boosting regressor. Parameters @@ -435,7 +435,7 @@ def fit( y: NDArray, sample_weight: NDArray | None = None, eval_set: list[tuple[NDArray, NDArray]] | None = None, - ) -> "OpenBoostClassifier": + ) -> OpenBoostClassifier: """Fit the gradient boosting classifier. Parameters @@ -698,7 +698,7 @@ def fit( X: NDArray, y: NDArray, **kwargs, - ) -> "OpenBoostDistributionalRegressor": + ) -> OpenBoostDistributionalRegressor: """Fit the distributional regressor. Parameters @@ -719,7 +719,7 @@ def fit( self.n_features_in_ = X.shape[1] # Import here to avoid circular imports - from ._distributional import NGBoost, DistributionalGBDT + from ._distributional import DistributionalGBDT, NGBoost ModelClass = NGBoost if self.use_natural_gradient else DistributionalGBDT @@ -961,7 +961,7 @@ def fit( X: NDArray, y: NDArray, **kwargs, - ) -> "OpenBoostLinearLeafRegressor": + ) -> OpenBoostLinearLeafRegressor: """Fit the linear leaf regressor. 
Parameters diff --git a/src/openboost/_persistence.py b/src/openboost/_persistence.py index 948a85d..1342e91 100644 --- a/src/openboost/_persistence.py +++ b/src/openboost/_persistence.py @@ -144,7 +144,7 @@ def _dict_to_tree(data: dict[str, Any]) -> TreeStructure: Returns: TreeStructure instance """ - from ._core._growth import TreeStructure, ScalarLeaves, VectorLeaves + from ._core._growth import ScalarLeaves, TreeStructure, VectorLeaves # Handle leaf values based on type values_type = data.get("values_type", "array") @@ -211,12 +211,12 @@ def _get_persist_attrs(self) -> list[str]: attrs = list(self.__dataclass_fields__.keys()) # Also include fitted attributes (sklearn convention: trailing _) # and other instance attributes not in dataclass fields - for k in vars(self).keys(): + for k in vars(self): if k not in attrs and not k.startswith("_"): attrs.append(k) return attrs # Fallback: all non-private attributes - return [k for k in vars(self).keys() if not k.startswith("_")] + return [k for k in vars(self) if not k.startswith("_")] def _to_state_dict(self) -> dict[str, Any]: """Convert model to a serializable state dictionary. @@ -285,7 +285,7 @@ def _from_state_dict(self, state: dict[str, Any]) -> None: import warnings _CURRENT_SERIALIZATION_VERSION = 1 - saved_version = state.get("_serialization_version", None) + saved_version = state.get("_serialization_version") if saved_version is None: warnings.warn( "Loading a model saved without a serialization version number. 
" @@ -341,8 +341,9 @@ def _from_state_dict(self, state: dict[str, Any]) -> None: # Restore bin edges for transform if "_bin_edges" in state: - from ._array import BinnedArray import numpy as np + + from ._array import BinnedArray # Create a minimal BinnedArray with just bin edges for transform n_features = state.get("_n_features", len(state["_bin_edges"])) @@ -425,6 +426,7 @@ def load(cls: type[T], path: str | Path) -> T: >>> predictions = model.predict(X_test) """ import warnings + import joblib warnings.warn( diff --git a/src/openboost/_predict.py b/src/openboost/_predict.py index cec7310..7a29db6 100644 --- a/src/openboost/_predict.py +++ b/src/openboost/_predict.py @@ -15,9 +15,10 @@ from ._backends import is_cuda if TYPE_CHECKING: - from ._tree import Tree from numpy.typing import NDArray + from ._tree import Tree + def predict_ensemble( trees: list[Tree], @@ -170,10 +171,7 @@ def kernel(X_binned, node_features, node_thresholds, node_left, node_right, while node_features[node] >= 0: # Not a leaf feat = node_features[node] val = X_binned[feat, idx] # Feature-major layout - if val <= node_thresholds[node]: - node = node_left[node] - else: - node = node_right[node] + node = node_left[node] if val <= node_thresholds[node] else node_right[node] # Add leaf value to prediction pred[idx] += learning_rate * node_values[node] diff --git a/src/openboost/_profiler.py b/src/openboost/_profiler.py index b3e429f..24fbb14 100644 --- a/src/openboost/_profiler.py +++ b/src/openboost/_profiler.py @@ -30,7 +30,6 @@ from ._callbacks import Callback, TrainingState - # ============================================================================= # Phase timer # ============================================================================= @@ -233,8 +232,9 @@ def _get_timer(self, name: str) -> PhaseTimer: def _wrap_primitives(self) -> None: import sys - import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + import openboost._core._primitives 
as prims_mod # Wrap the 4 core primitives for func_name in _PRIMITIVES_TO_WRAP: @@ -261,15 +261,16 @@ def wrapper(*args, **kwargs): for mod_name in _FIT_TREE_MODULES: mod = sys.modules.get(mod_name) if mod and hasattr(mod, "fit_tree"): - original_ft = getattr(mod, "fit_tree") + original_ft = mod.fit_tree self._originals[("fit_tree", mod_name)] = original_ft wrapped_ft = make_wrapper(original_ft, fit_tree_timer) - setattr(mod, "fit_tree", wrapped_ft) + mod.fit_tree = wrapped_ft def _unwrap_primitives(self) -> None: import sys - import openboost._core._primitives as prims_mod + import openboost._core._growth as growth_mod + import openboost._core._primitives as prims_mod for key, original in self._originals.items(): kind, name = key @@ -280,7 +281,7 @@ def _unwrap_primitives(self) -> None: elif kind == "fit_tree": mod = sys.modules.get(name) if mod: - setattr(mod, "fit_tree", original) + mod.fit_tree = original self._originals.clear() # ----- callback hooks ----- diff --git a/src/openboost/_sampling.py b/src/openboost/_sampling.py index fb72747..11be689 100644 --- a/src/openboost/_sampling.py +++ b/src/openboost/_sampling.py @@ -11,9 +11,10 @@ from __future__ import annotations -from dataclasses import dataclass, field +from collections.abc import Callable +from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING import numpy as np @@ -167,10 +168,7 @@ def goss_sample( n_samples = len(grad) # Handle multi-dimensional gradients (e.g., distributional GBDT) - if grad.ndim > 1: - abs_grad = np.sum(np.abs(grad), axis=1) - else: - abs_grad = np.abs(grad) + abs_grad = np.sum(np.abs(grad), axis=1) if grad.ndim > 1 else np.abs(grad) # Number of samples to keep from each group n_top = int(n_samples * top_rate) @@ -523,10 +521,7 @@ def create_memmap_binned( binned = ob_array(X, n_bins=n_bins, device='cpu') # Get the binned data - if hasattr(binned.data, 'copy_to_host'): - data = 
binned.data.copy_to_host() - else: - data = binned.data + data = binned.data.copy_to_host() if hasattr(binned.data, 'copy_to_host') else binned.data # Create memory-mapped file mmap = np.memmap(path, dtype=np.uint8, mode='w+', shape=data.shape) diff --git a/src/openboost/_split.py b/src/openboost/_split.py index 25265a9..f4e456a 100644 --- a/src/openboost/_split.py +++ b/src/openboost/_split.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from ._kernels import find_best_split_kernel, HIST_BLOCK_SIZE +from ._kernels import HIST_BLOCK_SIZE, find_best_split_kernel def find_best_splits( diff --git a/src/openboost/_training.py b/src/openboost/_training.py index 7074935..30cda7f 100644 --- a/src/openboost/_training.py +++ b/src/openboost/_training.py @@ -22,8 +22,9 @@ from __future__ import annotations -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Callable +from collections.abc import Callable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any import numpy as np diff --git a/src/openboost/_utils.py b/src/openboost/_utils.py index 538be7f..23f6d11 100644 --- a/src/openboost/_utils.py +++ b/src/openboost/_utils.py @@ -29,7 +29,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -44,13 +44,13 @@ def _check_sklearn(): """Check if sklearn is available.""" try: - import sklearn + import sklearn # noqa: F401 return True - except ImportError: + except ImportError as err: raise ImportError( "scikit-learn is required for evaluation metrics. 
" "Install with: pip install scikit-learn" - ) + ) from err def roc_auc_score( @@ -474,7 +474,7 @@ def crps_empirical( if samples.ndim == 1: samples = samples.reshape(-1, 1) - n_samples = samples.shape[0] + samples.shape[0] n_mc = samples.shape[1] # E|X - y| term @@ -924,7 +924,6 @@ def suggest_params( n_unique = len(unique_y) # Detect task type from y if classification - is_binary = n_unique == 2 is_multiclass = n_unique > 2 and n_unique <= 50 and task == 'classification' is_imbalanced = False if task == 'classification' and n_unique <= 50: @@ -1030,13 +1029,13 @@ def cross_val_predict( >>> meta_model.fit(oof_pred.reshape(-1, 1), y) """ try: - from sklearn.model_selection import KFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import KFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict. " "Install with: pip install scikit-learn" - ) + ) from err X = np.asarray(X) y = np.asarray(y) @@ -1105,13 +1104,13 @@ def cross_val_predict_proba( AttributeError: If model doesn't have predict_proba method. """ try: - from sklearn.model_selection import StratifiedKFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import StratifiedKFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict_proba. " "Install with: pip install scikit-learn" - ) + ) from err if not hasattr(model, 'predict_proba'): raise AttributeError( @@ -1185,13 +1184,13 @@ def cross_val_predict_interval( AttributeError: If model doesn't have predict_interval method. """ try: - from sklearn.model_selection import KFold from sklearn.base import clone - except ImportError: + from sklearn.model_selection import KFold + except ImportError as err: raise ImportError( "sklearn is required for cross_val_predict_interval. 
" "Install with: pip install scikit-learn" - ) + ) from err if not hasattr(model, 'predict_interval'): raise AttributeError( diff --git a/src/openboost/_validation.py b/src/openboost/_validation.py index a842551..48c9c84 100644 --- a/src/openboost/_validation.py +++ b/src/openboost/_validation.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from numpy.typing import NDArray - from ._array import BinnedArray + class ValidationError(ValueError): @@ -191,7 +191,7 @@ def validate_y( # Check for infinity if np.any(np.isinf(y)): raise ValueError( - f"y contains infinite values. Replace with finite values." + "y contains infinite values. Replace with finite values." ) # Task-specific validation @@ -213,12 +213,11 @@ def validate_y( elif task == "multiclass": unique_values = np.unique(y) - if not np.issubdtype(y.dtype, np.integer): - if not np.allclose(y, y.astype(int)): - raise ValueError( - f"Multi-class classification expects integer class labels, " - f"got non-integer values. Convert y to integers." - ) + if not np.issubdtype(y.dtype, np.integer) and not np.allclose(y, y.astype(int)): + raise ValueError( + "Multi-class classification expects integer class labels, " + "got non-integer values. Convert y to integers." 
+ ) if np.min(y) != 0: warnings.warn( f"Multi-class labels should start from 0, " From 844ff4b492a0cd1e5bf9d3037ff6ad9ac94f803e Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:32:53 -0700 Subject: [PATCH 3/6] Remove .claude directory from tracking Co-Authored-By: Claude Opus 4.6 --- .claude/worktrees/silly-kirch | 1 - 1 file changed, 1 deletion(-) delete mode 160000 .claude/worktrees/silly-kirch diff --git a/.claude/worktrees/silly-kirch b/.claude/worktrees/silly-kirch deleted file mode 160000 index 7450b78..0000000 --- a/.claude/worktrees/silly-kirch +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7450b7841f2af64569a9cc573946c42b0fcae851 From 14231e3417f46f485f5d40d00dd87009976273cc Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:33:15 -0700 Subject: [PATCH 4/6] Add .claude/ to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 96b61f7..d5f513f 100644 --- a/.gitignore +++ b/.gitignore @@ -214,6 +214,7 @@ __marimo__/ repos/ # Project-specific +.claude/ .cursor/ agent_space/ logs/ From bda4781adf6d61dc0990e0729c59eacfd4787751 Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 19:54:23 -0700 Subject: [PATCH 5/6] Guard GPU-native builder against missing/categorical data and convert Tree to TreeStructure Addresses PR review comments: - Skip fit_tree_gpu_native when BinnedArray has missing values or categorical features, since the GPU-native builder doesn't support them - Convert legacy Tree to TreeStructure after GPU-native building so that feature importance, persistence, and sklearn wrappers work correctly - Do fused prediction on the legacy Tree before converting (keeps perf benefit) - Restructure prediction update to avoid double-append to trees_ Co-Authored-By: Claude Opus 4.6 --- src/openboost/_models/_boosting.py | 67 ++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git 
a/src/openboost/_models/_boosting.py b/src/openboost/_models/_boosting.py index 1f263f4..cd49a9c 100644 --- a/src/openboost/_models/_boosting.py +++ b/src/openboost/_models/_boosting.py @@ -700,15 +700,29 @@ def _fit_gpu( else: # Use GPU-native tree builder when no features require the # growth-strategy path (reg_alpha, colsample, subsample, etc.) + # Also skip GPU-native when data has missing values or + # categorical features, which it doesn't support. + has_missing = ( + hasattr(self.X_binned_, 'has_missing') + and len(self.X_binned_.has_missing) > 0 + and np.any(self.X_binned_.has_missing) + ) + has_categorical = ( + hasattr(self.X_binned_, 'is_categorical') + and len(self.X_binned_.is_categorical) > 0 + and np.any(self.X_binned_.is_categorical) + ) use_gpu_native = ( is_cuda() and self.reg_alpha == 0.0 and self.colsample_bytree >= 1.0 and self.subsample >= 1.0 + and not has_missing + and not has_categorical ) if use_gpu_native: from .._core._tree import fit_tree_gpu_native - tree = fit_tree_gpu_native( + legacy_tree = fit_tree_gpu_native( self.X_binned_, grad_gpu, hess_gpu, @@ -717,6 +731,25 @@ def _fit_gpu( reg_lambda=self.reg_lambda, min_gain=self.gamma, ) + # Use fused prediction before converting to TreeStructure + from .._core._predict import predict_tree_add_gpu + predict_tree_add_gpu( + legacy_tree, self.X_binned_, pred_gpu, self.learning_rate + ) + # Convert legacy Tree to TreeStructure for compatibility + # with feature importance, persistence, and sklearn wrappers + features, thresholds, values, left, right = legacy_tree.to_arrays() + tree = TreeStructure( + features=features, + thresholds=thresholds, + left_children=left, + right_children=right, + values=values, + n_nodes=len(features), + depth=legacy_tree.depth, + n_features=legacy_tree.n_features, + ) + self.trees_.append(tree) else: tree = fit_tree( self.X_binned_, @@ -730,26 +763,18 @@ def _fit_gpu( subsample=self.subsample, colsample_bytree=self.colsample_bytree, ) - - # Update predictions on 
GPU - from .._core._tree import Tree - if isinstance(tree, Tree) and tree.on_gpu: - # Fused traversal + add: single kernel, no intermediate array - from .._core._predict import predict_tree_add_gpu - predict_tree_add_gpu(tree, self.X_binned_, pred_gpu, self.learning_rate) - else: - tree_pred = tree(self.X_binned_) - if hasattr(tree_pred, '__cuda_array_interface__'): - from .._core._predict import _add_inplace_cuda - _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) - else: - if hasattr(tree_pred, 'copy_to_host'): - tree_pred = tree_pred.copy_to_host() - pred_cpu = pred_gpu.copy_to_host() - pred_cpu += self.learning_rate * tree_pred - cuda.to_device(pred_cpu, to=pred_gpu) - - self.trees_.append(tree) + # Update predictions on GPU + tree_pred = tree(self.X_binned_) + if hasattr(tree_pred, '__cuda_array_interface__'): + from .._core._predict import _add_inplace_cuda + _add_inplace_cuda(pred_gpu, tree_pred, self.learning_rate) + else: + if hasattr(tree_pred, 'copy_to_host'): + tree_pred = tree_pred.copy_to_host() + pred_cpu = pred_gpu.copy_to_host() + pred_cpu += self.learning_rate * tree_pred + cuda.to_device(pred_cpu, to=pred_gpu) + self.trees_.append(tree) # Only compute loss and copy to CPU when callbacks need it if cb_manager.callbacks: From 9e8be0ca0c62785a4f0b94ffe6f337e4ab302edf Mon Sep 17 00:00:00 2001 From: J Xu Date: Sun, 22 Mar 2026 20:00:00 -0700 Subject: [PATCH 6/6] Update CLAUDE.md with profiling, parallel tests, and GPU-native constraints - Add profiling commands and OPENBOOST_PROFILE env var - Document pytest-xdist parallel execution and conftest.py - Update ruff ignore list (E402, F821) - Document GPU-native builder limitations (no missing/categorical) - Add _profiler.py to architecture section Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index afc6f10..be2fcd7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,13 +14,19 @@ uv sync # 
Install/sync dependencies uv sync --extra cuda # With GPU support uv sync --extra dev # With dev tools (test + bench + sklearn + ruff) -# Testing -uv run pytest tests/ -v --tb=short # All tests (CPU) +# Testing (parallelized with pytest-xdist, -n auto is in addopts) +uv run pytest tests/ -v --tb=short # All tests (CPU, parallel) uv run pytest tests/test_core.py -v # Single test file uv run pytest tests/test_core.py::test_name -v # Single test +uv run pytest tests/ -n 0 # Force serial (debugging) OPENBOOST_BACKEND=cuda uv run pytest tests/ # GPU tests OPENBOOST_BACKEND=cpu uv run pytest tests/ # Force CPU +# Profiling +uv run python benchmarks/profile_loop.py # Profile training (50K samples default) +uv run python benchmarks/profile_loop.py --summarize # Machine-readable bottleneck summary +OPENBOOST_PROFILE=1 uv run python script.py # Profile any training run via env var + # Linting uv run ruff check src/openboost/ # Lint uv run ruff check src/openboost/ --fix # Autofix @@ -63,6 +69,9 @@ DART, LinearLeaf growth strategies - **`_distributional.py`** — `NaturalBoost`: distributional GBDT (natural gradient boosting) - **`_dart.py`**, **`_linear_leaf.py`**, **`_gam.py`** — Specialized model variants +### Profiling (`_profiler.py`) +`ProfilingCallback` instruments training by wrapping core primitives (`build_node_histograms`, `find_node_splits`, `partition_samples`, `compute_leaf_values`, `fit_tree`) with timers. Outputs JSON reports to `logs/` with per-phase breakdown, bottleneck identification, and run-over-run comparison. CLI runner: `benchmarks/profile_loop.py`. + ### Loss Functions (`_loss.py`) 50+ loss implementations. Each returns `(gradient, hessian)`. Custom losses are callables with signature `fn(pred, y) -> (grad, hess)`. @@ -71,10 +80,13 @@ DART, LinearLeaf growth strategies ## Key Conventions -- **Python 3.10+** target. Ruff rules: E, F, I, UP, B, SIM (line length 100, E501 ignored). +- **Python 3.10+** target. 
Ruff rules: E, F, I, UP, B, SIM (line length 100; E501, E402, F821 ignored). - **uv only** for package management — never `pip install` or `conda`. - All Numba-jitted functions use `@njit` or `@cuda.jit`. CPU kernels are in `_backends/_cpu.py`, CUDA in `_backends/_cuda.py`. - Test environment variable `OPENBOOST_BACKEND=cpu` forces CPU backend in CI. +- Tests use `pytest-xdist` (`-n auto --dist loadfile`) for parallel execution. Shared fixtures are in `tests/conftest.py` (session-scoped datasets, function-scoped gradients). +- **GPU-native builder** (`fit_tree_gpu_native`) does not support missing values or categorical features. The training loop in `_boosting.py` auto-falls back to `fit_tree()` when the data has NaN or categorical columns. +- **Profiling**: `ProfilingCallback` wraps core primitives with timers. Enable via callback or `OPENBOOST_PROFILE=1` env var. Reports go to `logs/` as JSON. ## Working Style