# Gpu Optimization Deep Dive


In [1]:
"""
Converted from gpu_optimization_deep_dive.ipynb

This script was automatically generated from a Jupyter notebook.
Plots are saved to the figures/ directory instead of displayed inline.
"""

# ======================================================================
# # GPU Optimization and Performance Deep Dive
#
# **Level**: Advanced
# **Time**: 50-70 minutes
# **Prerequisites**: NLSQ Quickstart, JAX basics
#
# ## Overview
#
# This tutorial covers **performance optimization** for NLSQ, focusing on:
# - JAX JIT compilation and profiling
# - GPU acceleration strategies
# - Memory optimization
# - Batch processing for maximum throughput
#
# ### What You'll Learn
#
# 1. **JAX Profiling**: Identifying bottlenecks with JAX tools
# 2. **JIT Compilation**: Understanding and optimizing compilation
# 3. **GPU Acceleration**: When and how to leverage GPUs
# 4. **Memory Management**: Avoiding OOM errors
# 5. **Batch Strategies**: Processing thousands of fits efficiently
# 6. **Benchmarking**: Measuring and comparing performance
#
# ### Performance Targets
#
# Typical NLSQ performance (depends on hardware, problem size):
# - **Cold start (first call)**: 0.5-2 seconds (includes JIT compilation)
# - **Warm calls (cached)**: 1-50 ms per fit
# - **GPU speedup**: 5-50x for large batches vs CPU
# - **Batch throughput**: 100-10,000 fits/second (GPU, batched)
#
# ### Hardware Requirements
#
# This notebook runs on CPU or GPU. GPU examples automatically fall back to CPU if no GPU is available.
# ======================================================================
# Configure matplotlib for inline plotting in VS Code/Jupyter
# MUST come before importing matplotlib
import os
import time
from pathlib import Path

import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
from jax import jit, vmap

from nlsq import CurveFit

# Detect available devices
devices = jax.devices()
has_gpu = any("gpu" in str(d).lower() for d in devices)

QUICK = os.environ.get("NLSQ_EXAMPLES_QUICK") == "1"
MAX_SAMPLES = int(os.environ.get("NLSQ_EXAMPLES_MAX_SAMPLES", "300000"))


def cap_samples(n: int) -> int:
    return min(n, MAX_SAMPLES) if QUICK else n


print("Hardware Configuration:")
print(f"  JAX version: {jax.__version__}")
print(f"  Default backend: {jax.default_backend()}")
print(f"  Available devices: {devices}")
print(f"  GPU available: {'✓ Yes' if has_gpu else '✗ No (will use CPU)'}")
print()

if has_gpu:
    print("GPU detected - examples will show GPU acceleration")
else:
    print("Running on CPU - GPU examples will still work but won't show speedup")
    print("To use GPU: Install jax[cuda] or jax[rocm] depending on your hardware")


# ======================================================================
# ## Part 1: JIT Compilation Basics
#
# Understanding JAX's Just-In-Time (JIT) compilation is crucial for performance.
# ======================================================================


# Demonstrating JIT compilation overhead and benefits


# Simple model
def exponential_model(x, a, b):
    return a * jnp.exp(-b * x)


# Test data
x_test = jnp.linspace(0, 5, cap_samples(1000))
y_test = exponential_model(x_test, 3.0, 0.5) + np.random.normal(0, 0.1, len(x_test))

cf = CurveFit()

print("JIT Compilation Analysis:")
print("=" * 60)

# First call: includes compilation time
start = time.time()
popt1, _ = cf.curve_fit(exponential_model, x_test, y_test, p0=[2.0, 0.3])
time_first = (time.time() - start) * 1000  # ms

# Second call: uses cached compilation
start = time.time()
popt2, _ = cf.curve_fit(exponential_model, x_test, y_test, p0=[2.5, 0.4])
time_second = (time.time() - start) * 1000  # ms

# Third call: still cached
start = time.time()
popt3, _ = cf.curve_fit(exponential_model, x_test, y_test, p0=[3.0, 0.5])
time_third = (time.time() - start) * 1000  # ms

print(f"First call (cold):  {time_first:.1f} ms (includes JIT compilation)")
print(f"Second call (warm): {time_second:.1f} ms (cached)")
print(f"Third call (warm):  {time_third:.1f} ms (cached)")
print()
print(f"Speedup after JIT:  {time_first / time_second:.1f}x")
print(
    f"Compilation overhead: {time_first - time_second:.1f} ms ({(time_first - time_second) / time_first * 100:.1f}% of first call)"
)
print()
print("Key insight: First call is slow due to JIT compilation.")
print("            Subsequent calls are much faster (10-100x).")


# Understanding what triggers recompilation

print("Recompilation Triggers:")
print("=" * 60)

# Trigger 1: Different array shapes
print("\n1. Changing array shapes triggers recompilation:")

x_100 = jnp.linspace(0, 5, 100)
y_100 = exponential_model(x_100, 3.0, 0.5) + np.random.normal(0, 0.1, 100)

x_200 = jnp.linspace(0, 5, cap_samples(200))
y_200 = exponential_model(x_200, 3.0, 0.5) + np.random.normal(0, 0.1, cap_samples(200))

cf_new = CurveFit()

start = time.time()
cf_new.curve_fit(exponential_model, x_100, y_100, p0=[2.0, 0.3])
time_100 = (time.time() - start) * 1000

start = time.time()
cf_new.curve_fit(exponential_model, x_200, y_200, p0=[2.0, 0.3])  # Different shape!
time_200 = (time.time() - start) * 1000

start = time.time()
cf_new.curve_fit(exponential_model, x_200, y_200, p0=[2.5, 0.4])  # Same shape
time_200_cached = (time.time() - start) * 1000

print(f"  Fit with shape (100,): {time_100:.1f} ms (first compile)")
print(f"  Fit with shape (200,): {time_200:.1f} ms (recompiled!)")
print(f"  Fit with shape (200,): {time_200_cached:.1f} ms (cached) ✓")
print()
print("  → Keep array shapes consistent to avoid recompilation")

# Trigger 2: Different dtypes
print("\n2. Changing dtypes triggers recompilation:")
print("  float32 vs float64 will trigger separate compilations")
print("  → Use consistent dtype (float32 for GPU, float64 for high precision)")

# Trigger 3: Different parameter counts
print("\n3. Different model signatures trigger recompilation:")
print("  model(x, a, b) vs model(x, a, b, c) are compiled separately")
print("  → Expected - different models need different compilations")


# ======================================================================
# ## Part 2: GPU Acceleration
#
# Leverage GPU for massive speedups on large problems.
# ======================================================================


# CPU vs GPU performance comparison

# Large dataset (GPU shines here)
n_points = cap_samples(10000)
x_large = jnp.linspace(0, 10, n_points)
y_large = (
    3.0 * jnp.exp(-0.5 * x_large)
    + 2.0 * jnp.sin(x_large)
    + np.random.normal(0, 0.1, n_points)
)


def complex_model(x, a, b, c, d):
    return a * jnp.exp(-b * x) + c * jnp.sin(d * x)


print(f"GPU Acceleration Benchmark (n_points={n_points}):")
print("=" * 60)

# Ensure compilation is done (use same settings as benchmark for consistency)
cf_gpu = CurveFit()
try:
    _ = cf_gpu.curve_fit(
        complex_model,
        x_large[:100],
        y_large[:100],
        p0=[3, 0.5, 2, 1],
        maxiter=20 if QUICK else 50,
        max_nfev=200 if QUICK else 1000,
    )
except Exception as exc:
    print(f"Warmup fit skipped: {exc}")

# Benchmark: 10 fits (reduced in quick mode)
n_runs = 3 if QUICK else 10
times = []

for i in range(n_runs):
    # Slightly vary initial guess to avoid trivial caching
    p0 = [3.0 + i * 0.1, 0.5, 2.0, 1.0]
    start = time.time()
    popt, _ = cf_gpu.curve_fit(
        complex_model, x_large, y_large, p0=p0, maxiter=20 if QUICK else 50
    )
    times.append((time.time() - start) * 1000)

mean_time = np.mean(times)
std_time = np.std(times)

print(f"\nDevice: {jax.devices()[0]}")
print(f"Average fit time: {mean_time:.1f} ± {std_time:.1f} ms")
print(f"Throughput: {1000 / mean_time:.1f} fits/second")
print()

if has_gpu:
    print("✓ Running on GPU - performance is optimized")
    print("  Expected speedup vs CPU: 5-20x for this problem size")
else:
    print("Running on CPU - results are valid but slower than GPU")
    print("  With GPU: Expect 5-50x speedup for large datasets")


# ======================================================================
# ## Part 3: Batch Processing Strategies
#
# Process thousands of fits efficiently with vectorization.
# ======================================================================


# Batch processing with vmap for maximum throughput

print("Batch Processing Benchmark:")
print("=" * 60)

# Generate batch of datasets
n_datasets = min(20, cap_samples(200)) if QUICK else max(10, cap_samples(1000))
n_points_per_dataset = 30 if QUICK else 50

x_batch_data = jnp.linspace(0, 5, n_points_per_dataset)

# Random true parameters for each dataset
np.random.seed(42)
a_true_batch = np.random.uniform(2, 4, n_datasets)
b_true_batch = np.random.uniform(0.3, 0.7, n_datasets)

y_batch_data = jnp.array(
    [
        a * jnp.exp(-b * x_batch_data) + np.random.normal(0, 0.05, n_points_per_dataset)
        for a, b in zip(a_true_batch, b_true_batch, strict=True)
    ]
)

print(f"Batch size: {n_datasets} datasets")
print(f"Points per dataset: {n_points_per_dataset}")
print(f"Total data points: {n_datasets * n_points_per_dataset:,}")
print()

# Method 1: Sequential (slow)
print("Method 1: Sequential fitting (baseline)")
start = time.time()
results_sequential = []
cf_seq = CurveFit()
sequential_runs = min(20 if QUICK else 100, n_datasets)
for i in range(sequential_runs):  # Only fit a subset for speed
    popt, _ = cf_seq.curve_fit(
        exponential_model, x_batch_data, y_batch_data[i], p0=[3.0, 0.5], maxiter=30
    )
    results_sequential.append(popt)
time_sequential = time.time() - start

print(
    f"  Time for {sequential_runs} datasets: {time_sequential * 1000:.0f} ms "
    f"({time_sequential * 1000 / sequential_runs:.1f} ms/fit)"
)
print(
    f"  Estimated time for {n_datasets}: {time_sequential * n_datasets / sequential_runs:.1f} s"
)
print()

# Method 2: Vectorized with vmap (fast)
print("Method 2: Batched fitting with vmap (optimized)")


# Simplified optimizer for vectorization
def fit_one_dataset(y_single):
    """Fit single dataset (simplified gradient descent)."""
    params = jnp.array([3.0, 0.5])

    def loss(p):
        return jnp.sum((y_single - exponential_model(x_batch_data, *p)) ** 2)

    # A few gradient descent steps for demonstration
    for _ in range(5 if QUICK else 20):
        g = jax.grad(loss)(params)
        params = params - 0.05 * g
    return params


# Vectorize over batch dimension
fit_batch = jit(vmap(fit_one_dataset))

# Warm up JIT
warmup_size = min(10, n_datasets)
_ = fit_batch(y_batch_data[:warmup_size])

# Benchmark
start = time.time()
results_batch = fit_batch(y_batch_data)
# Block until computation completes (JAX is async)
results_batch[0].block_until_ready()
time_batch = time.time() - start

print(
    f"  Time for {n_datasets} datasets: {time_batch * 1000:.0f} ms ({time_batch * 1000 / n_datasets:.3f} ms/fit)"
)
print(f"  Throughput: {n_datasets / time_batch:.0f} fits/second")
print()

# Speedup
estimated_sequential_time = time_sequential * n_datasets / sequential_runs
speedup = estimated_sequential_time / time_batch

print(f"Speedup: {speedup:.0f}x faster with vmap + JIT ✓")
print()
print("Key insight: vmap parallelizes across datasets, JIT compiles once")


# ======================================================================
# ## Part 4: Memory Optimization
#
# Avoiding out-of-memory (OOM) errors with large datasets.
# ======================================================================


# Memory optimization strategies

print("Memory Optimization Strategies:")
print("=" * 60)
print()

print("1. Use float32 instead of float64:")
x_f64 = jnp.array([1.0, 2.0, 3.0], dtype=jnp.float64)
x_f32 = jnp.array([1.0, 2.0, 3.0], dtype=jnp.float32)
print(f"   float64 memory: {x_f64.nbytes} bytes per element")
print(f"   float32 memory: {x_f32.nbytes} bytes per element")
print(f"   Savings: {(1 - x_f32.nbytes / x_f64.nbytes) * 100:.0f}%")
print("   → Use float32 unless high precision is critical\n")

print("2. Process data in chunks (streaming):")
print("   # For very large datasets (millions of points)")
print("   chunk_size = 100000")
print("   for i in range(0, len(data), chunk_size):")
print("       chunk = data[i:i+chunk_size]")
print("       result = fit(chunk)")
print("       results.append(result)\n")

print("3. Clear JAX cache if needed:")
print("   from jax import clear_caches")
print("   clear_caches()  # Frees compilation cache\n")

print("4. Monitor memory usage:")


def get_array_memory_mb(arr):
    return arr.nbytes / (1024**2)


large_array = jnp.ones((cap_samples(10000), cap_samples(1000)), dtype=jnp.float32)
print(
    f"   Example: {large_array.shape} array uses {get_array_memory_mb(large_array):.1f} MB"
)
print()

print("5. Typical memory requirements:")
print("   10K points:     ~0.1 MB (negligible)")
print("   1M points:      ~10 MB (easy)")
print("   100M points:    ~1 GB (manageable)")
print("   1B points:      ~10 GB (need chunking or distributed)")
print()
print("→ For datasets >100M points, use chunked processing or streaming")


# ======================================================================
# ## Part 5: Performance Benchmarking
#
# Systematic performance measurement and optimization.
# ======================================================================


# Comprehensive performance benchmark


def benchmark_nlsq(n_points_list, n_params=2, n_runs=5):
    """Benchmark NLSQ across different problem sizes.

    Parameters
    ----------
    n_points_list : list
        List of dataset sizes to test
    n_params : int
        Number of parameters to fit
    n_runs : int
        Number of runs to average

    Returns
    -------
    results : dict
        Benchmark results
    """
    results = {"n_points": [], "mean_time_ms": [], "std_time_ms": []}

    cf_bench = CurveFit()

    for n_points in n_points_list:
        x = jnp.linspace(0, 5, n_points)
        y = 3.0 * jnp.exp(-0.5 * x) + np.random.normal(0, 0.1, n_points)

        # Warm up
        _ = cf_bench.curve_fit(exponential_model, x, y, p0=[2.0, 0.3], maxiter=20)

        # Benchmark
        times = []
        for _ in range(n_runs):
            start = time.time()
            popt, _ = cf_bench.curve_fit(
                exponential_model, x, y, p0=[2.0, 0.3], maxiter=20
            )
            # Note: popt is numpy array (already synchronous), no need for block_until_ready
            times.append((time.time() - start) * 1000)

        results["n_points"].append(n_points)
        results["mean_time_ms"].append(np.mean(times))
        results["std_time_ms"].append(np.std(times))

    return results


print("Running comprehensive benchmark...")
print("(This may take 30-60 seconds in full mode)")
print()

# Test different problem sizes
size_candidates = [50, 100, 200] if QUICK else [100, 500, 1000, 5000, 10000]
sizes = sorted({cap_samples(s) for s in size_candidates})
bench_results = benchmark_nlsq(sizes, n_runs=2 if QUICK else 5)

# Display results
print("Benchmark Results:")
print("=" * 60)
print(f"{'N Points':<12} {'Mean Time (ms)':<20} {'Throughput (fits/s)'}")
print("-" * 60)

for i, n in enumerate(bench_results["n_points"]):
    mean_t = bench_results["mean_time_ms"][i]
    std_t = bench_results["std_time_ms"][i]
    throughput = 1000 / mean_t
    print(f"{n:<12} {mean_t:>8.2f} ± {std_t:<8.2f} {throughput:>12.1f}")

print()

# Plot scaling
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Time vs problem size
ax1.errorbar(
    bench_results["n_points"],
    bench_results["mean_time_ms"],
    yerr=bench_results["std_time_ms"],
    marker="o",
    capsize=5,
    label="NLSQ",
)
ax1.set_xlabel("Number of Data Points")
ax1.set_ylabel("Time (ms)")
ax1.set_title("Performance Scaling")
ax1.legend()
ax1.grid(alpha=0.3)

# Log-log plot to see scaling behavior
ax2.loglog(bench_results["n_points"], bench_results["mean_time_ms"], "o-", label="NLSQ")
ax2.set_xlabel("Number of Data Points")
ax2.set_ylabel("Time (ms)")
ax2.set_title("Scaling Behavior (log-log)")
ax2.legend()
ax2.grid(alpha=0.3, which="both")

plt.tight_layout()
# Save figure to file
fig_dir = Path.cwd() / "figures" / "gpu_optimization_deep_dive"
fig_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(fig_dir / "fig_01.png", dpi=300, bbox_inches="tight")
plt.close()

print("Interpretation:")
print("  - Nearly flat scaling: Well-optimized (GPU benefits)")
print("  - Linear scaling: Expected for iterative optimization")
print("  - Superlinear scaling: May indicate memory issues or poor caching")


# ======================================================================
# ## Summary and Best Practices
#
# ### Performance Optimization Checklist
#
# **For Maximum Speed:**
#
# 1. ✅ **Use GPU** if available (5-50x speedup for large problems)
# 2. ✅ **Keep array shapes consistent** to avoid recompilation
# 3. ✅ **Use float32** unless high precision is needed (2x memory savings)
# 4. ✅ **Batch process** with `vmap` for multiple datasets (10-100x faster)
# 5. ✅ **Warm up JIT** with small dataset before benchmarking
# 6. ✅ **Use `block_until_ready()`** when timing (JAX is async)
#
# **For Large Datasets:**
#
# 1. ✅ **Chunk data** if >100M points
# 2. ✅ **Monitor memory** usage
# 3. ✅ **Consider downsampling** for smooth, oversampled data
# 4. ✅ **Use streaming** for datasets that don't fit in memory
#
# ### Performance Expectations
#
# | **Scenario** | **Typical Time** | **Optimization** |
# |--------------|------------------|------------------|
# | First call (cold start) | 0.5-2 seconds | Expected (JIT compilation) |
# | Subsequent calls (warm) | 1-50 ms | Cached compilation |
# | Large dataset (10K points) | 5-100 ms | Use GPU if available |
# | Batch (1000 fits) | 100-5000 ms | Use vmap for parallelization |
# | Huge dataset (1M points) | 50-500 ms | GPU + chunking |
#
# ### Troubleshooting Performance Issues
#
# **Problem**: First call is slow (>5 seconds)
# - **Solution**: Normal for JIT. Subsequent calls will be fast.
#
# **Problem**: All calls are slow (>1 second for small data)
# - **Solution**: Check if recompiling each time (varying shapes/dtypes)
#
# **Problem**: Out of memory errors
# - **Solution**: Use float32, chunk data, or downsample
#
# **Problem**: GPU not being used
# - **Solution**: Check `jax.devices()`, install jax[cuda] or jax[rocm]
#
# **Problem**: Batch processing not faster than sequential
# - **Solution**: Problem may be too small, try larger batches or datasets
#
# ### Advanced Profiling
#
# For detailed profiling:
#
# ```python
# # JAX profiling (requires jax[profiling])
# import jax.profiler
#
# # Profile a code block
# with jax.profiler.trace("/tmp/jax-trace", create_perfetto_link=True):
#     # Your NLSQ code here
#     popt, pcov = cf.curve_fit(model, x, y, p0=...)
#
# # Opens profiling UI in browser
# ```
#
# ### Production Recommendations
#
# ```python
# # Example: Optimized production setup
# import jax
# import jax.numpy as jnp
# from nlsq import CurveFit
#
# # Configure JAX for production
# jax.config.update('jax_enable_x64', False)  # Use float32
#
# # Pre-warm JIT cache at startup
# cf = CurveFit()
# x_dummy = jnp.linspace(0, 1, 100)
# y_dummy = jnp.ones(100)
# _ = cf.curve_fit(model, x_dummy, y_dummy, p0=initial_guess)
#
# # Now ready for fast production fitting
# ```
#
# ### Next Steps
#
# - **Scale up**: Try batch processing 10,000+ datasets with vmap
# - **Optimize models**: Simplify model functions for faster evaluation
# - **Profile**: Use JAX profiler to identify bottlenecks
# - **Distribute**: For massive scale, consider JAX's `pmap` for multi-GPU
#
# ### References
#
# 1. **JAX Performance**: https://jax.readthedocs.io/en/latest/notebooks/thinking_in_jax.html
# 2. **JAX Profiling**: https://jax.readthedocs.io/en/latest/profiling.html
# 3. **GPU Acceleration**: https://jax.readthedocs.io/en/latest/gpu_performance_tips.html
# 4. **Related examples**:
#    - `custom_algorithms_advanced.ipynb` - vmap for batch fitting
#    - `troubleshooting_guide.ipynb` - Performance debugging
#
# ---
#
# **Remember**: Premature optimization is the root of all evil. Profile first, optimize what matters!
# ======================================================================


INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=1000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


Hardware Configuration:
  JAX version: 0.8.2
  Default backend: gpu
  Available devices: [CudaDevice(id=0)]
  GPU available: ✗ No (will use CPU)

Running on CPU - GPU examples will still work but won't show speedup
To use GPU: Install jax[cuda] or jax[rocm] depending on your hardware
JIT Compilation Analysis:


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=1000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=58.69419588587635 | grad_norm=102.3933 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=6.088530168611488 | grad_norm=69.0604 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=4.838307458816968 | grad_norm=0.6953 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=4.838231004765117 | grad_norm=0.0011 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=1.338894s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=4.8382 | elapsed=1.339s | final_gradient_norm=2.6106e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=2.019947s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=1000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=1000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=17.030714244522223 | grad_norm=45.0304 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=4.893173112319704 | grad_norm=13.9718 | step=2.5317977802344327 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=4.838231014591427 | grad_norm=0.0079 | step=2.5317977802344327 | nfev=3


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.012326s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=3 | final_cost=4.8382 | elapsed=0.012s | final_gradient_norm=7.9718e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.095807s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=1000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=1000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=4.84473226409262 | grad_norm=0.9269 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=4.838231045995689 | grad_norm=0.0159 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.027419s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=2 | final_cost=4.8382 | elapsed=0.027s | final_gradient_norm=2.3746e-05


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.104338s




First call (cold):  2197.2 ms (includes JIT compilation)
Second call (warm): 99.6 ms (cached)
Third call (warm):  109.4 ms (cached)

Speedup after JIT:  22.1x
Compilation overhead: 2097.6 ms (95.5% of first call)

Key insight: First call is slow due to JIT compilation.
            Subsequent calls are much faster (10-100x).
Recompilation Triggers:

1. Changing array shapes triggers recompilation:


INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=100 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=100 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=6.503342938510995 | grad_norm=10.9828 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.6072101068960776 | grad_norm=7.5254 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.4637401301622129 | grad_norm=0.1282 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.4637132695753513 | grad_norm=5.6857e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.546977s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.4637 | elapsed=0.547s | final_gradient_norm=5.2462e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.828426s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=200 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=200 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=11.818248300436995 | grad_norm=21.5892 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.206573901535574 | grad_norm=14.8968 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.9497527163957732 | grad_norm=0.2654 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.949697551324633 | grad_norm=9.3079e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.497695s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.9497 | elapsed=0.498s | final_gradient_norm=7.2142e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.852104s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=200 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=200 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=3.3664079616224587 | grad_norm=9.8106 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.9610393793235749 | grad_norm=3.3035 | step=2.5317977802344327 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.9496977756413681 | grad_norm=0.0161 | step=2.5317977802344327 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.9496975497832811 | grad_norm=7.7768e-05 | step=2.5317977802344327 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.046682s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.9497 | elapsed=0.047s | final_gradient_norm=5.7244e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.112259s




  Fit with shape (100,): 965.7 ms (first compile)
  Fit with shape (200,): 958.3 ms (recompiled!)
  Fit with shape (200,): 117.8 ms (cached) ✓

  → Keep array shapes consistent to avoid recompilation

2. Changing dtypes triggers recompilation:
  float32 vs float64 will trigger separate compilations
  → Use consistent dtype (float32 for GPU, float64 for high precision)

3. Different model signatures trigger recompilation:
  model(x, a, b) vs model(x, a, b, c) are compiled separately
  → Expected - different models need different compilations


INFO:nlsq.curve_fit:Starting curve fit n_params=4 | n_data_points=100 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=4 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


GPU Acceleration Benchmark (n_points=10000):


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=4 | n_residuals=100 | max_nfev=200


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.5558499621298119 | grad_norm=0.2250 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.5527698329622251 | grad_norm=0.1064 | step=0.47186465220442186 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.552643203831146 | grad_norm=0.0193 | step=0.2359323261022109 | nfev=6


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.5525759791517322 | grad_norm=0.0792 | step=0.2359323261022109 | nfev=7


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.5524354059320944 | grad_norm=0.0741 | step=0.4718646522044218 | nfev=8


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=5 | cost=0.552318575717694 | grad_norm=0.0172 | step=0.2359323261022109 | nfev=10


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=6 | cost=0.5521930373431311 | grad_norm=0.0695 | step=0.4718646522044218 | nfev=11


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=7 | cost=0.5520688305622696 | grad_norm=0.0157 | step=0.23593232610221093 | nfev=13


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=8 | cost=0.5519234784139457 | grad_norm=0.0633 | step=0.47186465220442186 | nfev=14


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=9 | cost=0.5518012108012068 | grad_norm=0.0143 | step=0.2359323261022109 | nfev=16


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=10 | cost=0.5516552136342723 | grad_norm=0.0576 | step=0.4718646522044218 | nfev=17


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=11 | cost=0.5515451891108629 | grad_norm=0.0130 | step=0.2359323261022109 | nfev=19


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=12 | cost=0.5514236934239349 | grad_norm=0.0522 | step=0.4718646522044218 | nfev=20


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=13 | cost=0.5513390164114428 | grad_norm=0.0118 | step=0.23593232610221088 | nfev=22


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=14 | cost=0.5512726365023863 | grad_norm=0.0473 | step=0.47186465220442175 | nfev=23


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=15 | cost=0.5512291528907305 | grad_norm=0.0104 | step=0.2359323261022109 | nfev=25


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=16 | cost=0.5512228168670419 | grad_norm=0.0022 | step=0.11796616305110545 | nfev=27


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=17 | cost=0.55122164161819 | grad_norm=4.9426e-05 | step=0.058983081525552726 | nfev=29


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=18 | cost=0.5512211757285889 | grad_norm=0.0065 | step=0.058983081525552726 | nfev=30


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=19 | cost=0.5512205409293798 | grad_norm=0.0070 | step=0.058983081525552726 | nfev=31


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=20 | cost=0.5512198746013339 | grad_norm=0.0069 | step=0.058983081525552726 | nfev=32


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=21 | cost=0.5512192340099134 | grad_norm=0.0069 | step=0.058983081525552726 | nfev=33


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=22 | cost=0.5512186203142282 | grad_norm=0.0068 | step=0.058983081525552726 | nfev=34


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=23 | cost=0.5512180326388212 | grad_norm=0.0067 | step=0.058983081525552726 | nfev=35


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=24 | cost=0.5512174698929493 | grad_norm=0.0067 | step=0.058983081525552726 | nfev=36


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=25 | cost=0.5512169309888342 | grad_norm=0.0066 | step=0.058983081525552726 | nfev=37


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=26 | cost=0.5512164148744804 | grad_norm=0.0065 | step=0.058983081525552726 | nfev=38


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=27 | cost=0.5512159205374855 | grad_norm=0.0064 | step=0.058983081525552726 | nfev=39


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=28 | cost=0.5512154470046644 | grad_norm=0.0063 | step=0.058983081525552726 | nfev=40


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=29 | cost=0.5512149933411399 | grad_norm=0.0062 | step=0.058983081525552726 | nfev=41


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=30 | cost=0.5512145586493873 | grad_norm=0.0061 | step=0.058983081525552726 | nfev=42


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=31 | cost=0.5512141420682963 | grad_norm=0.0060 | step=0.058983081525552726 | nfev=43


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=32 | cost=0.5512137427722327 | grad_norm=0.0059 | step=0.058983081525552726 | nfev=44


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=33 | cost=0.5512133599701083 | grad_norm=0.0058 | step=0.058983081525552726 | nfev=45


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=34 | cost=0.5512129929044511 | grad_norm=0.0057 | step=0.058983081525552726 | nfev=46


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=35 | cost=0.5512126408504644 | grad_norm=0.0056 | step=0.058983081525552726 | nfev=47


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=36 | cost=0.5512123031150828 | grad_norm=0.0055 | step=0.058983081525552726 | nfev=48


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=37 | cost=0.5512119790360127 | grad_norm=0.0054 | step=0.058983081525552726 | nfev=49


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=38 | cost=0.5512116679807659 | grad_norm=0.0053 | step=0.058983081525552726 | nfev=50


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=39 | cost=0.5512113693456806 | grad_norm=0.0052 | step=0.058983081525552726 | nfev=51


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=40 | cost=0.5512110825549424 | grad_norm=0.0051 | step=0.058983081525552726 | nfev=52


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=41 | cost=0.5512108070595867 | grad_norm=0.0050 | step=0.058983081525552726 | nfev=53


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=42 | cost=0.5512105423365141 | grad_norm=0.0049 | step=0.058983081525552726 | nfev=54


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=43 | cost=0.5512102878874965 | grad_norm=0.0048 | step=0.058983081525552726 | nfev=55


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=44 | cost=0.5512100432381939 | grad_norm=0.0047 | step=0.058983081525552726 | nfev=56


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=45 | cost=0.5512098079371781 | grad_norm=0.0046 | step=0.058983081525552726 | nfev=57


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=46 | cost=0.55120958155497 | grad_norm=0.0045 | step=0.058983081525552726 | nfev=58


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=47 | cost=0.5512093636830884 | grad_norm=0.0044 | step=0.058983081525552726 | nfev=59


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=48 | cost=0.5512091539331271 | grad_norm=0.0044 | step=0.058983081525552726 | nfev=60


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=49 | cost=0.5512089519358366 | grad_norm=0.0043 | step=0.058983081525552726 | nfev=61


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=50 | cost=0.5512087573402488 | grad_norm=0.0042 | step=0.058983081525552726 | nfev=62


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=51 | cost=0.5512085698128083 | grad_norm=0.0041 | step=0.058983081525552726 | nfev=63


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=52 | cost=0.5512083890365442 | grad_norm=0.0040 | step=0.058983081525552726 | nfev=64


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=53 | cost=0.5512082147102633 | grad_norm=0.0040 | step=0.058983081525552726 | nfev=65


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=54 | cost=0.5512080465477741 | grad_norm=0.0039 | step=0.058983081525552726 | nfev=66


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=55 | cost=0.5512078842771391 | grad_norm=0.0038 | step=0.058983081525552726 | nfev=67


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=56 | cost=0.5512077276399574 | grad_norm=0.0037 | step=0.058983081525552726 | nfev=68


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=57 | cost=0.5512075763906783 | grad_norm=0.0037 | step=0.058983081525552726 | nfev=69


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=58 | cost=0.551207430295944 | grad_norm=0.0036 | step=0.058983081525552726 | nfev=70


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=59 | cost=0.55120728913396 | grad_norm=0.0035 | step=0.058983081525552726 | nfev=71


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=60 | cost=0.5512071526938956 | grad_norm=0.0034 | step=0.058983081525552726 | nfev=72


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=61 | cost=0.5512070207753144 | grad_norm=0.0034 | step=0.058983081525552726 | nfev=73


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=62 | cost=0.5512068931876289 | grad_norm=0.0033 | step=0.058983081525552726 | nfev=74


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=63 | cost=0.5512067697495873 | grad_norm=0.0033 | step=0.058983081525552726 | nfev=75


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=64 | cost=0.5512066502887771 | grad_norm=0.0032 | step=0.058983081525552726 | nfev=76


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=65 | cost=0.5512065346411646 | grad_norm=0.0031 | step=0.058983081525552726 | nfev=77


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=66 | cost=0.5512064226506497 | grad_norm=0.0031 | step=0.058983081525552726 | nfev=78


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=67 | cost=0.5512063141686498 | grad_norm=0.0030 | step=0.058983081525552726 | nfev=79


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=68 | cost=0.5512062090537012 | grad_norm=0.0030 | step=0.058983081525552726 | nfev=80


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=69 | cost=0.5512061071710852 | grad_norm=0.0029 | step=0.058983081525552726 | nfev=81


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=70 | cost=0.5512060083924715 | grad_norm=0.0029 | step=0.058983081525552726 | nfev=82


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=71 | cost=0.5512059125955822 | grad_norm=0.0028 | step=0.058983081525552726 | nfev=83


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=72 | cost=0.5512058196638758 | grad_norm=0.0028 | step=0.058983081525552726 | nfev=84


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=73 | cost=0.5512057294862454 | grad_norm=0.0027 | step=0.058983081525552726 | nfev=85


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=74 | cost=0.551205641956735 | grad_norm=0.0027 | step=0.058983081525552726 | nfev=86


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=75 | cost=0.5512055569742702 | grad_norm=0.0026 | step=0.058983081525552726 | nfev=87


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=76 | cost=0.551205474442406 | grad_norm=0.0026 | step=0.058983081525552726 | nfev=88


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=77 | cost=0.5512053942690874 | grad_norm=0.0025 | step=0.058983081525552726 | nfev=89


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=78 | cost=0.5512053163664217 | grad_norm=0.0025 | step=0.058983081525552726 | nfev=90


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=79 | cost=0.5512052406504653 | grad_norm=0.0024 | step=0.058983081525552726 | nfev=91


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=80 | cost=0.5512051670410225 | grad_norm=0.0024 | step=0.058983081525552726 | nfev=92


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=81 | cost=0.551205095461454 | grad_norm=0.0024 | step=0.058983081525552726 | nfev=93


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=82 | cost=0.5512050258384957 | grad_norm=0.0023 | step=0.058983081525552726 | nfev=94


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=83 | cost=0.5512049581020902 | grad_norm=0.0023 | step=0.058983081525552726 | nfev=95


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=84 | cost=0.5512048921852257 | grad_norm=0.0022 | step=0.058983081525552726 | nfev=96


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=85 | cost=0.5512048280237816 | grad_norm=0.0022 | step=0.058983081525552726 | nfev=97


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=86 | cost=0.5512047655563866 | grad_norm=0.0022 | step=0.058983081525552726 | nfev=98


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=87 | cost=0.5512047047242836 | grad_norm=0.0021 | step=0.058983081525552726 | nfev=99


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=88 | cost=0.5512046454711985 | grad_norm=0.0021 | step=0.058983081525552726 | nfev=100


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=89 | cost=0.5512045877432215 | grad_norm=0.0021 | step=0.058983081525552726 | nfev=101


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=90 | cost=0.5512045314886893 | grad_norm=0.0020 | step=0.058983081525552726 | nfev=102


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=91 | cost=0.5512044766580801 | grad_norm=0.0020 | step=0.058983081525552726 | nfev=103


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=92 | cost=0.5512044232039074 | grad_norm=0.0020 | step=0.058983081525552726 | nfev=104


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=93 | cost=0.5512043710806243 | grad_norm=0.0019 | step=0.058983081525552726 | nfev=105


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=94 | cost=0.5512043202445296 | grad_norm=0.0019 | step=0.058983081525552726 | nfev=106


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=95 | cost=0.5512042706536835 | grad_norm=0.0019 | step=0.058983081525552726 | nfev=107


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=96 | cost=0.5512042222678212 | grad_norm=0.0018 | step=0.058983081525552726 | nfev=108


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=97 | cost=0.5512041750482795 | grad_norm=0.0018 | step=0.058983081525552726 | nfev=109


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=98 | cost=0.5512041289579166 | grad_norm=0.0018 | step=0.058983081525552726 | nfev=110


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=99 | cost=0.5512040839610469 | grad_norm=0.0018 | step=0.058983081525552726 | nfev=111


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=100 | cost=0.5512040400233724 | grad_norm=0.0017 | step=0.058983081525552726 | nfev=112


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=101 | cost=0.5512039971119183 | grad_norm=0.0017 | step=0.058983081525552726 | nfev=113


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=102 | cost=0.5512039551949741 | grad_norm=0.0017 | step=0.058983081525552726 | nfev=114


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=103 | cost=0.5512039142420389 | grad_norm=0.0017 | step=0.058983081525552726 | nfev=115


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=104 | cost=0.5512038742237632 | grad_norm=0.0016 | step=0.058983081525552726 | nfev=116


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=105 | cost=0.551203835111899 | grad_norm=0.0016 | step=0.058983081525552726 | nfev=117


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=106 | cost=0.5512037968792536 | grad_norm=0.0016 | step=0.058983081525552726 | nfev=118


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=107 | cost=0.5512037594996415 | grad_norm=0.0016 | step=0.058983081525552726 | nfev=119


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=108 | cost=0.551203722947839 | grad_norm=0.0015 | step=0.058983081525552726 | nfev=120


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=109 | cost=0.551203687199545 | grad_norm=0.0015 | step=0.058983081525552726 | nfev=121


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=110 | cost=0.5512036522313408 | grad_norm=0.0015 | step=0.058983081525552726 | nfev=122


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=111 | cost=0.5512036180206505 | grad_norm=0.0015 | step=0.058983081525552726 | nfev=123


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=112 | cost=0.5512035845457073 | grad_norm=0.0015 | step=0.058983081525552726 | nfev=124


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=113 | cost=0.5512035517855175 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=125


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=114 | cost=0.5512035197198296 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=126


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=115 | cost=0.5512034883291033 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=127


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=116 | cost=0.5512034575944774 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=128


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=117 | cost=0.5512034274977453 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=129


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=118 | cost=0.5512033980213263 | grad_norm=0.0014 | step=0.058983081525552726 | nfev=130


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=119 | cost=0.5512033691482395 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=131


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=120 | cost=0.551203340862082 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=132


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=121 | cost=0.5512033131470025 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=133


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=122 | cost=0.5512032859876812 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=134


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=123 | cost=0.5512032593693081 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=135


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=124 | cost=0.5512032332775653 | grad_norm=0.0013 | step=0.058983081525552726 | nfev=136


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=125 | cost=0.5512032076986022 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=137


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=126 | cost=0.551203182619022 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=138


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=127 | cost=0.5512031580258633 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=139


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=128 | cost=0.5512031339065803 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=140


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=129 | cost=0.551203110249032 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=141


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=130 | cost=0.5512030870414628 | grad_norm=0.0012 | step=0.058983081525552726 | nfev=142


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=131 | cost=0.5512030642724879 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=143


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=132 | cost=0.5512030419310827 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=144


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=133 | cost=0.551203020006565 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=145


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=134 | cost=0.551202998488587 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=146


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=135 | cost=0.5512029773671179 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=147


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=136 | cost=0.551202956632436 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=148


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=137 | cost=0.5512029362751153 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=149


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=138 | cost=0.5512029162860175 | grad_norm=0.0011 | step=0.058983081525552726 | nfev=150


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=139 | cost=0.5512028966562773 | grad_norm=0.0010 | step=0.058983081525552726 | nfev=151


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=140 | cost=0.5512028773772968 | grad_norm=0.0010 | step=0.058983081525552726 | nfev=152


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=141 | cost=0.5512028584407327 | grad_norm=0.0010 | step=0.058983081525552726 | nfev=153


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=142 | cost=0.5512028398384896 | grad_norm=0.0010 | step=0.058983081525552726 | nfev=154


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=143 | cost=0.5512028215627114 | grad_norm=9.9472e-04 | step=0.058983081525552726 | nfev=155


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=144 | cost=0.5512028036057695 | grad_norm=9.8347e-04 | step=0.058983081525552726 | nfev=156


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=145 | cost=0.551202785960258 | grad_norm=9.7241e-04 | step=0.058983081525552726 | nfev=157


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=146 | cost=0.5512027686189866 | grad_norm=9.6154e-04 | step=0.058983081525552726 | nfev=158


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=147 | cost=0.551202751574972 | grad_norm=9.5084e-04 | step=0.058983081525552726 | nfev=159


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=148 | cost=0.5512027348214272 | grad_norm=9.4032e-04 | step=0.058983081525552726 | nfev=160


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=149 | cost=0.5512027183517615 | grad_norm=9.2997e-04 | step=0.058983081525552726 | nfev=161


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=150 | cost=0.55120270215957 | grad_norm=9.1979e-04 | step=0.058983081525552726 | nfev=162


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=151 | cost=0.5512026862386259 | grad_norm=9.0978e-04 | step=0.058983081525552726 | nfev=163


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=152 | cost=0.5512026705828791 | grad_norm=8.9993e-04 | step=0.058983081525552726 | nfev=164


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=153 | cost=0.5512026551864457 | grad_norm=8.9023e-04 | step=0.058983081525552726 | nfev=165


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=154 | cost=0.5512026400436063 | grad_norm=8.8070e-04 | step=0.058983081525552726 | nfev=166


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=155 | cost=0.5512026251487969 | grad_norm=8.7131e-04 | step=0.058983081525552726 | nfev=167


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=156 | cost=0.5512026104966062 | grad_norm=8.6207e-04 | step=0.058983081525552726 | nfev=168


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=157 | cost=0.5512025960817712 | grad_norm=8.5298e-04 | step=0.058983081525552726 | nfev=169


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=158 | cost=0.5512025818991692 | grad_norm=8.4403e-04 | step=0.058983081525552726 | nfev=170


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=159 | cost=0.5512025679438174 | grad_norm=8.3522e-04 | step=0.058983081525552726 | nfev=171


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=160 | cost=0.5512025542108647 | grad_norm=8.2654e-04 | step=0.058983081525552726 | nfev=172


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=161 | cost=0.5512025406955894 | grad_norm=8.1800e-04 | step=0.058983081525552726 | nfev=173


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=162 | cost=0.5512025273933958 | grad_norm=8.0959e-04 | step=0.058983081525552726 | nfev=174


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=163 | cost=0.5512025142998092 | grad_norm=8.0131e-04 | step=0.058983081525552726 | nfev=175


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=164 | cost=0.5512025014104693 | grad_norm=7.9316e-04 | step=0.058983081525552726 | nfev=176


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=165 | cost=0.551202488721133 | grad_norm=7.8512e-04 | step=0.058983081525552726 | nfev=177


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=166 | cost=0.5512024762276658 | grad_norm=7.7721e-04 | step=0.058983081525552726 | nfev=178


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=167 | cost=0.551202463926039 | grad_norm=7.6942e-04 | step=0.058983081525552726 | nfev=179


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=168 | cost=0.5512024518123286 | grad_norm=7.6174e-04 | step=0.058983081525552726 | nfev=180


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=169 | cost=0.5512024398827086 | grad_norm=7.5418e-04 | step=0.058983081525552726 | nfev=181


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=170 | cost=0.5512024281334535 | grad_norm=7.4673e-04 | step=0.058983081525552726 | nfev=182


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=171 | cost=0.5512024165609288 | grad_norm=7.3939e-04 | step=0.058983081525552726 | nfev=183


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=172 | cost=0.551202405161593 | grad_norm=7.3216e-04 | step=0.058983081525552726 | nfev=184


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=173 | cost=0.5512023939319919 | grad_norm=7.2503e-04 | step=0.058983081525552726 | nfev=185


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=174 | cost=0.5512023828687587 | grad_norm=7.1800e-04 | step=0.058983081525552726 | nfev=186


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=175 | cost=0.5512023719686087 | grad_norm=7.1108e-04 | step=0.058983081525552726 | nfev=187


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=176 | cost=0.5512023612283392 | grad_norm=7.0426e-04 | step=0.058983081525552726 | nfev=188


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=177 | cost=0.5512023506448249 | grad_norm=6.9753e-04 | step=0.058983081525552726 | nfev=189


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=178 | cost=0.551202340215017 | grad_norm=6.9090e-04 | step=0.058983081525552726 | nfev=190


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=179 | cost=0.5512023299359419 | grad_norm=6.8436e-04 | step=0.058983081525552726 | nfev=191


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=180 | cost=0.5512023198046957 | grad_norm=6.7792e-04 | step=0.058983081525552726 | nfev=192


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=181 | cost=0.5512023098184458 | grad_norm=6.7156e-04 | step=0.058983081525552726 | nfev=193


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=182 | cost=0.5512022999744272 | grad_norm=6.6529e-04 | step=0.058983081525552726 | nfev=194


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=183 | cost=0.5512022902699403 | grad_norm=6.5912e-04 | step=0.058983081525552726 | nfev=195


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=184 | cost=0.5512022807023484 | grad_norm=6.5302e-04 | step=0.058983081525552726 | nfev=196


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=185 | cost=0.5512022712690808 | grad_norm=6.4701e-04 | step=0.058983081525552726 | nfev=197


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=186 | cost=0.5512022619676221 | grad_norm=6.4398e-04 | step=0.058983081525552726 | nfev=198


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=187 | cost=0.5512022527955183 | grad_norm=6.4102e-04 | step=0.058983081525552726 | nfev=199






PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=2.311995s


INFO:nlsq.least_squares:Convergence reason=The maximum number of function evaluations is exceeded. | iterations=188 | final_cost=0.5512 | elapsed=2.312s | final_gradient_norm=6.3808e-04


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=2.851011s


[ERROR] Optimization failed reason=The maximum number of function evaluations is exceeded. | status=0


ERROR:nlsq.curve_fit:Optimization failed reason=The maximum number of function evaluations is exceeded. | status=0


INFO:nlsq.curve_fit:Starting curve fit n_params=4 | n_data_points=10000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=4 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


Warmup fit skipped: Optimization failed to converge.

Diagnostics:
  - Final cost: 5.512022e-01
  - Gradient norm: 6.380825e-04
  - Gradient tolerance: 1.000000e-08
  - Function evaluations: 200 / 200
  - Iterations: 188
  - Status: The maximum number of function evaluations is exceeded.

Reasons:
  - Gradient norm 6.38e-04 exceeds tolerance 1.00e-08
  - Reached maximum function evaluations (200)

Recommendations:
  ✓ Try looser gradient tolerance: gtol=1.0e-07
  ✓ Check if initial guess p0 is reasonable
  ✓ Consider parameter scaling with x_scale
  ✓ Increase iteration limit: max_nfev=400
  ✓ Provide better initial guess p0
  ✓ Try different optimization method (trf/dogbox/lm)

For more help, see: https://nlsq.readthedocs.io/troubleshooting


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=4 | n_residuals=10000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=51.554223109787316 | grad_norm=59.8867 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=51.4853478091409 | grad_norm=0.1611 | step=3.774917217635375 | nfev=2


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.351746s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=2 | final_cost=51.4853 | elapsed=0.352s | final_gradient_norm=3.3996e-05


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.733130s




INFO:nlsq.curve_fit:Starting curve fit n_params=4 | n_data_points=10000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=4 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=4 | n_residuals=10000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=55.80311965114427 | grad_norm=294.9443 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=51.4853524140059 | grad_norm=0.4064 | step=3.8548670534792766 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=51.48534778301991 | grad_norm=4.6586e-04 | step=3.8548670534792766 | nfev=3


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.012963s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=3 | final_cost=51.4853 | elapsed=0.013s | final_gradient_norm=9.8790e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.107846s




INFO:nlsq.curve_fit:Starting curve fit n_params=4 | n_data_points=10000 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=4 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=4 | n_residuals=10000 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=70.05556329898204 | grad_norm=624.2669 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=51.48536518653959 | grad_norm=0.7895 | step=3.9357337308308855 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=51.48534778305284 | grad_norm=9.1010e-04 | step=3.9357337308308855 | nfev=3


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.023614s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=3 | final_cost=51.4853 | elapsed=0.024s | final_gradient_norm=1.9123e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.115855s





Device: cuda:0
Average fit time: 373.7 ± 364.9 ms
Throughput: 2.7 fits/second

Running on CPU - results are valid but slower than GPU
  With GPU: Expect 5-50x speedup for large datasets
Batch Processing Benchmark:


INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


Batch size: 20 datasets
Points per dataset: 30
Total data points: 600

Method 1: Sequential fitting (baseline)


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.5225872357815996 | grad_norm=8.2417 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.026476618753407236 | grad_norm=0.4081 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.025113551640039436 | grad_norm=0.0054 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.025113230763703094 | grad_norm=3.8822e-05 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.750440s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0251 | elapsed=0.750s | final_gradient_norm=3.3752e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=1.150753s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=8.320835066567092 | grad_norm=36.2767 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.8979661734514948 | grad_norm=47.4771 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.0626532033318479 | grad_norm=4.2452 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.036295782157610354 | grad_norm=0.0690 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.03628807560742915 | grad_norm=8.7024e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.052319s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0363 | elapsed=0.052s | final_gradient_norm=2.3089e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.111363s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=1.9733037364865122 | grad_norm=17.5519 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.09630531953369037 | grad_norm=5.4999 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.02805578332578466 | grad_norm=0.1880 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.0279543368786802 | grad_norm=6.1316e-04 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.02795433536611485 | grad_norm=1.4272e-06 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.040030s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0280 | elapsed=0.040s | final_gradient_norm=3.8472e-09


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.130349s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.5056953819997421 | grad_norm=8.5765 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.038859944634798034 | grad_norm=0.9166 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.03584341708851256 | grad_norm=0.0118 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.035842767103525054 | grad_norm=6.9662e-05 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.036099s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0358 | elapsed=0.036s | final_gradient_norm=5.2621e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.099576s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=1.3205521228555646 | grad_norm=9.5356 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.03503011364648486 | grad_norm=0.3664 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.0339275339343679 | grad_norm=0.0022 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.0339274936912418 | grad_norm=4.1347e-06 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.018396s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0339 | elapsed=0.018s | final_gradient_norm=1.3768e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.098067s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.7302633602190287 | grad_norm=18.3216 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.06071924609874196 | grad_norm=1.2321 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.040180569506111194 | grad_norm=0.0614 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.04010596605740606 | grad_norm=0.0014 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.04010590979352266 | grad_norm=3.6026e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.035933s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0401 | elapsed=0.036s | final_gradient_norm=9.3706e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.086681s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=1.489797999660618 | grad_norm=5.8385 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.05463793023727093 | grad_norm=1.6251 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.03688097099174209 | grad_norm=0.0422 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.03687051949325786 | grad_norm=1.6124e-04 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.018504s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0369 | elapsed=0.019s | final_gradient_norm=8.7535e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.076328s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=1.6650300549743582 | grad_norm=11.3179 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.053309973523201715 | grad_norm=0.3497 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.05285810559257974 | grad_norm=0.0029 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.052858061548556864 | grad_norm=2.2856e-05 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.036924s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0529 | elapsed=0.037s | final_gradient_norm=2.1189e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.117616s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.09855331294894148 | grad_norm=0.6839 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.02693976600136428 | grad_norm=0.0649 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.026902642802058847 | grad_norm=1.1020e-05 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.016100s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=3 | final_cost=0.0269 | elapsed=0.016s | final_gradient_norm=7.0072e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.075463s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=5.833445277837116 | grad_norm=32.0825 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.9666442899880727 | grad_norm=48.6621 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.05178535154183159 | grad_norm=4.5342 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.020866599103336762 | grad_norm=0.0854 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.020854126941025466 | grad_norm=3.4918e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.023942s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0209 | elapsed=0.024s | final_gradient_norm=5.9272e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.079642s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=3.48984996591589 | grad_norm=18.9520 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.024647345824899317 | grad_norm=0.6176 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.01935201617172621 | grad_norm=0.0164 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.019347065225487167 | grad_norm=1.6843e-04 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.019347064435196286 | grad_norm=2.0962e-06 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.035473s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0193 | elapsed=0.035s | final_gradient_norm=2.6483e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.089629s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=8.056843023016562 | grad_norm=35.1498 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.402475229975219 | grad_norm=38.3875 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.05009364032918688 | grad_norm=3.1498 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.03467420628105072 | grad_norm=0.0369 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.03467198597679173 | grad_norm=7.9309e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.044155s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0347 | elapsed=0.044s | final_gradient_norm=3.0924e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.104643s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=7.728596857043053 | grad_norm=36.1759 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=2.743440108141602 | grad_norm=61.8911 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.0964562961219832 | grad_norm=6.2433 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.043194788215163815 | grad_norm=0.1855 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.04313263531161013 | grad_norm=0.0016 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=5 | cost=0.04313262795810138 | grad_norm=1.6615e-05 | step=3.0413812651491097 | nfev=6


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.024291s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=6 | final_cost=0.0431 | elapsed=0.024s | final_gradient_norm=1.7615e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.079757s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.9778524743212875 | grad_norm=20.9598 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.10523167305450498 | grad_norm=2.0545 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.04083584832792366 | grad_norm=0.1154 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.040544924741293766 | grad_norm=9.4944e-05 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.04054492254401376 | grad_norm=1.4620e-06 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.028800s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0405 | elapsed=0.029s | final_gradient_norm=1.1556e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.094478s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=3.376114940021496 | grad_norm=21.4950 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.10054577290454113 | grad_norm=1.9075 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.042859863265695464 | grad_norm=0.1248 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.04248257269924174 | grad_norm=0.0021 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.042482434500367865 | grad_norm=2.6584e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.037874s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0425 | elapsed=0.038s | final_gradient_norm=3.6812e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.091124s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.6353573211828185 | grad_norm=18.4231 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.05596385833470584 | grad_norm=1.2959 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.03398330750470534 | grad_norm=0.0319 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.0339678145734979 | grad_norm=3.9896e-04 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.033967809846695944 | grad_norm=1.0274e-05 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.022704s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0340 | elapsed=0.023s | final_gradient_norm=2.5696e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.077133s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.2914249477792962 | grad_norm=1.4116 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.025824259288055415 | grad_norm=0.3382 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.025106527591904928 | grad_norm=0.0012 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.025106519764777166 | grad_norm=1.6388e-06 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.033985s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0251 | elapsed=0.034s | final_gradient_norm=3.6076e-09


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.107394s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.123858539270184 | grad_norm=19.5436 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.3194475287094348 | grad_norm=13.0121 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.039945514496164514 | grad_norm=0.7339 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.03868054501194026 | grad_norm=0.0027 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=4 | cost=0.03868052907733231 | grad_norm=3.1974e-06 | step=3.0413812651491097 | nfev=5


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.038747s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=5 | final_cost=0.0387 | elapsed=0.039s | final_gradient_norm=7.0885e-09


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.109095s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.414044657047935 | grad_norm=8.0441 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.03074395419030401 | grad_norm=0.5270 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.028435235457733764 | grad_norm=0.0039 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.028435104521428715 | grad_norm=1.1705e-05 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.018126s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0284 | elapsed=0.018s | final_gradient_norm=6.2394e-08


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.085123s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=30 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=30 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=0.39741912997511564 | grad_norm=4.2868 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.045004298397526676 | grad_norm=0.2318 | step=3.0413812651491097 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.044661997353402 | grad_norm=4.5960e-04 | step=3.0413812651491097 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.04466198857245933 | grad_norm=8.9628e-06 | step=3.0413812651491097 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.020946s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.0447 | elapsed=0.021s | final_gradient_norm=1.0308e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.080947s




  Time for 20 datasets: 3196 ms (159.8 ms/fit)
  Estimated time for 20: 3.2 s

Method 2: Batched fitting with vmap (optimized)


  Time for 20 datasets: 332 ms (16.621 ms/fit)
  Throughput: 60 fits/second

Speedup: 10x faster with vmap + JIT ✓

Key insight: vmap parallelizes across datasets, JIT compiles once
Memory Optimization Strategies:

1. Use float32 instead of float64:
   float64 memory: 24 bytes per element
   float32 memory: 12 bytes per element
   Savings: 50%
   → Use float32 unless high precision is critical

2. Process data in chunks (streaming):
   # For very large datasets (millions of points)
   chunk_size = 100000
   for i in range(0, len(data), chunk_size):
       chunk = data[i:i+chunk_size]
       result = fit(chunk)
       results.append(result)

3. Clear JAX cache if needed:
   from jax import clear_caches
   clear_caches()  # Frees compilation cache

4. Monitor memory usage:
   Example: (10000, 1000) array uses 38.1 MB

5. Typical memory requirements:
   10K points:     ~0.1 MB (negligible)
   1M points:      ~10 MB (easy)
   100M points:    ~1 GB (manageable)
   1B points:      ~10 GB (ne

INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=50 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=50 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.9468953550296324 | grad_norm=4.9613 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.3529934034139043 | grad_norm=3.0189 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.3032873773528026 | grad_norm=0.0338 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.3032835877714485 | grad_norm=1.1071e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.553422s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.3033 | elapsed=0.553s | final_gradient_norm=7.2451e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=1.000428s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=50 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=50 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.9468953550296324 | grad_norm=4.9613 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.3529934034139043 | grad_norm=3.0189 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.3032873773528026 | grad_norm=0.0338 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.3032835877714485 | grad_norm=1.1071e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.041052s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.3033 | elapsed=0.041s | final_gradient_norm=7.2451e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.133407s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=50 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=50 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=2.9468953550296324 | grad_norm=4.9613 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.3529934034139043 | grad_norm=3.0189 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.3032873773528026 | grad_norm=0.0338 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.3032835877714485 | grad_norm=1.1071e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.038311s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.3033 | elapsed=0.038s | final_gradient_norm=7.2451e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.124149s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=100 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=100 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=6.061838072007549 | grad_norm=10.4187 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.5865391819900988 | grad_norm=6.5477 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.46485710604296204 | grad_norm=0.0485 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.4648532975226764 | grad_norm=1.5736e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.175336s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.4649 | elapsed=0.175s | final_gradient_norm=1.0270e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.447863s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=100 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=100 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=6.061838072007549 | grad_norm=10.4187 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.5865391819900988 | grad_norm=6.5477 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.46485710604296204 | grad_norm=0.0485 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.4648532975226764 | grad_norm=1.5736e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.033181s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.4649 | elapsed=0.033s | final_gradient_norm=1.0270e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.097723s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=100 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=100 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=6.061838072007549 | grad_norm=10.4187 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=0.5865391819900988 | grad_norm=6.5477 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.46485710604296204 | grad_norm=0.0485 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.4648532975226764 | grad_norm=1.5736e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.029497s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.4649 | elapsed=0.029s | final_gradient_norm=1.0270e-06


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.092232s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=200 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=200 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=12.287700982100029 | grad_norm=22.5842 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.1570582684372313 | grad_norm=15.7184 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.8741373268216581 | grad_norm=0.2828 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.8740758961801374 | grad_norm=1.1294e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.214306s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.8741 | elapsed=0.214s | final_gradient_norm=3.1673e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.476053s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=200 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=200 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=12.287700982100029 | grad_norm=22.5842 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.1570582684372313 | grad_norm=15.7184 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.8741373268216581 | grad_norm=0.2828 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.8740758961801374 | grad_norm=1.1294e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.024640s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.8741 | elapsed=0.025s | final_gradient_norm=3.1673e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.092229s




INFO:nlsq.curve_fit:Starting curve fit n_params=2 | n_data_points=200 | method=trf | solver=auto | batch_size=None | has_bounds=False | dynamic_sizing=False


INFO:nlsq.least_squares:Starting least squares optimization method=trf | n_params=2 | loss=linear | ftol=1.0000e-08 | xtol=1.0000e-08 | gtol=1.0000e-08


INFO:nlsq.optimizer.trf:Starting TRF optimization (no bounds) n_params=2 | n_residuals=200 | max_nfev=None


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=0 | cost=12.287700982100029 | grad_norm=22.5842 | nfev=1


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=1 | cost=1.1570582684372313 | grad_norm=15.7184 | step=2.0223748416156684 | nfev=2


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=2 | cost=0.8741373268216581 | grad_norm=0.2828 | step=2.0223748416156684 | nfev=3


PERFORMANCE:nlsq.optimizer.trf:Iteration iter=3 | cost=0.8740758961801374 | grad_norm=1.1294e-04 | step=2.0223748416156684 | nfev=4


PERFORMANCE:nlsq.least_squares:Timer: optimization elapsed=0.025534s


INFO:nlsq.least_squares:Convergence reason=`ftol` termination condition is satisfied. | iterations=4 | final_cost=0.8741 | elapsed=0.026s | final_gradient_norm=3.1673e-07


PERFORMANCE:nlsq.curve_fit:Timer: curve_fit elapsed=0.076039s




Benchmark Results:
N Points     Mean Time (ms)       Throughput (fits/s)
------------------------------------------------------------
50             134.69 ± 6.43              7.4
100             99.61 ± 1.98             10.0
200             87.73 ± 7.48             11.4



Interpretation:
  - Nearly flat scaling: Well-optimized (GPU benefits)
  - Linear scaling: Expected for iterative optimization
  - Superlinear scaling: May indicate memory issues or poor caching
