# Comprehensive Comparison and Real-World Applications
This notebook compares all of the spline methods side by side and then compares them with the scipy/statsmodels implementations.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline
import sys
sys.path.append('../src')

from splines import RegressionSpline, NaturalCubicSpline, SmoothingSpline
from utils import (
    generate_sinusoidal_data,
    mean_squared_error,
    r_squared
)

# IPython magic (notebook-only syntax, not plain Python): render matplotlib
# figures inline in the notebook output.
%matplotlib inline
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to the figures below.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Side-by-Side Comparison: All Methods

In [None]:
# Generate data: training samples (noise_std=0.25) on [0, 10] plus a dense,
# noise-free evaluation grid of the reference sine curve.
np.random.seed(42)
x_train, y_train = generate_sinusoidal_data(n_samples=50, noise_std=0.25, x_range=(0, 10))
x_test = np.linspace(0, 10, 500)
y_true = np.sin(2*np.pi*x_test/10)

# Setup: six knots between 2 and 8 for the regression spline; the natural
# spline additionally receives the two ends of the x-range (0 and 10).
manual_knots = np.linspace(2, 8, 6)
all_knots = np.sort(np.concatenate([[0, 10], manual_knots]))

# 1. Regular regression spline
reg_model = RegressionSpline(degree=3)
reg_model.fit(x_train, y_train, manual_knots)

# 2. Natural cubic spline
nat_model = NaturalCubicSpline()
nat_model.fit(x_train, y_train, all_knots)

# 3. Smoothing spline (lambda picked by 5-fold CV over a log-spaced grid)
lambdas = np.logspace(-3, 1, 20)
best_lam, _ = SmoothingSpline().cross_validate(x_train, y_train, lambdas, cv_folds=5)
smooth_model = SmoothingSpline(lambda_=best_lam)
smooth_model.fit(x_train, y_train)

# 4. Global polynomial (for comparison)
poly_deg = 9
poly_coeffs = np.polyfit(x_train, y_train, poly_deg)

# One mapping from display name to a "predict at x" callable. Both the test
# predictions and the training metrics below are driven by this mapping, so a
# renamed or added method can never be scored with another model's predictions
# (the previous name-based if/elif chain fell through to the polynomial).
predictors = {
    'Regular Spline': reg_model.predict,
    'Natural Spline': nat_model.predict,
    'Smoothing Spline': smooth_model.predict,
    f'Polynomial (deg {poly_deg})': lambda x: np.polyval(poly_coeffs, x),
}

# Predictions of every method on the dense grid, keyed by display name.
models = {name: predict(x_test) for name, predict in predictors.items()}

# Plot all
fig, ax = plt.subplots(figsize=(14, 7))

ax.scatter(x_train, y_train, alpha=0.4, s=30, label='Data', color='gray', zorder=3)
ax.plot(x_test, y_true, 'k--', linewidth=2.5, alpha=0.5, label='True function', zorder=1)

# One colour / line style per method, in the insertion order of `models`.
colors = ['blue', 'purple', 'red', 'orange']
linestyles = ['-', '--', '-.', ':']
for (name, y_pred), color, ls in zip(models.items(), colors, linestyles):
    ax.plot(x_test, y_pred, color=color, linestyle=ls, linewidth=2, label=name, alpha=0.8)

ax.set_xlabel('x', fontsize=13)
ax.set_ylabel('y', fontsize=13)
ax.set_title('Comparison of All Spline Methods', fontsize=15)
ax.legend(fontsize=11, loc='upper right')
ax.grid(True, alpha=0.3)
plt.show()

# Compute metrics: training error against the sampled y_train, test error
# against the noise-free curve y_true on the dense grid.
print("\n" + "="*80)
print(f"{'Method':<25} {'Training MSE':<15} {'Training R²':<15} {'Test MSE':<15}")
print("="*80)
for name, predict in predictors.items():
    y_pred_train = predict(x_train)

    train_mse = mean_squared_error(y_train, y_pred_train)
    train_r2 = r_squared(y_train, y_pred_train)
    test_mse = mean_squared_error(y_true, models[name])

    print(f"{name:<25} {train_mse:<15.4f} {train_r2:<15.4f} {test_mse:<15.4f}")
print("="*80)

## 2. Comparison with SciPy UnivariateSpline

In [None]:
# This project's smoothing spline with a fixed penalty weight.
cur_model = SmoothingSpline(lambda_=0.1)
cur_model.fit(x_train, y_train)
y_cur = cur_model.predict(x_test)

# SciPy reference fit (cubic, k=3; B-spline based).
# SciPy's `s` also controls smoothing, but a smaller `s` means a closer fit
# (less smoothing); it is parameterized differently from `lambda_`.
scipy_model = UnivariateSpline(x_train, y_train, s=5.0, k=3)
y_scipy = scipy_model(x_test)

# Plot: raw data first, then each curve as (values, format, width, label, alpha).
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(x_train, y_train, alpha=0.5, s=30, label='Data', color='gray', zorder=3)
curves = [
    (y_cur, 'b-', 2.5, 'Current Smoothing Spline', 0.8),
    (y_scipy, 'r--', 2.5, 'SciPy UnivariateSpline', 0.8),
    (y_true, 'g:', 2, 'True function', 0.5),
]
for y_vals, fmt, width, label, opacity in curves:
    ax.plot(x_test, y_vals, fmt, linewidth=width, label=label, alpha=opacity)

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title('Current Implementation vs SciPy', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.show()

# Pointwise gap between the two fits on the evaluation grid.
diff = np.abs(y_cur - y_scipy)
print(f"\nMax absolute difference: {diff.max():.4f}")
print(f"Mean absolute difference: {diff.mean():.4f}")
print(
    "\nNote: Some difference expected due to:",
    "  - SciPy uses B-spline basis (more numerically stable)",
    "  - Different parameterizations of smoothing parameter",
    sep="\n",
)

## 3. Performance on Different Data Characteristics

Test the smoothing spline and the regular regression spline on different types of functions (smooth, polynomial, and step).

In [None]:
# NOTE: generate_discontinuous_data is imported but not used in this cell.
from utils import generate_polynomial_data, generate_step_data, generate_discontinuous_data

np.random.seed(42)

# Different data types. Each entry is (title, x, y): the generator results are
# star-unpacked, so every generator is expected to return an (x, y) pair.
datasets = [
    ('Smooth (Sine)', *generate_sinusoidal_data(50, 0.2, (0, 10))),
    ('Polynomial', *generate_polynomial_data(50, 4, 0.3, (-1, 1))),
    ('Step Function', *generate_step_data(50, 4, 0.2, (0, 10))),
]

# squeeze=False makes subplots always return a 2-D array of Axes; flattening
# it keeps the loop below working even when `datasets` holds a single entry
# (the default would hand back a bare Axes, which cannot be zipped).
fig, axes = plt.subplots(len(datasets), 1, figsize=(12, 4*len(datasets)), squeeze=False)
axes = axes.ravel()

# NOTE: this loop rebinds x_train, y_train, x_test, smooth_model and reg_model,
# replacing the values created in section 1. Re-run section 1 before
# re-running section 2, which reads x_train, y_train, x_test and y_true.
for ax, (name, x_train, y_train) in zip(axes, datasets):
    # Dense evaluation grid spanning exactly the observed x-range.
    x_test = np.linspace(np.min(x_train), np.max(x_train), 500)

    # Fit models
    # Smoothing spline (same fixed penalty for every dataset)
    smooth_model = SmoothingSpline(lambda_=0.1)
    smooth_model.fit(x_train, y_train)
    y_smooth = smooth_model.predict(x_test)

    # Regular spline with fixed knots: evenly spaced over the data range with
    # the two end points dropped, so only interior knots are passed to fit().
    n_knots = min(6, len(x_train) // 8)
    knots = np.linspace(np.min(x_train), np.max(x_train), n_knots)[1:-1]
    if len(knots) > 0:
        reg_model = RegressionSpline(degree=3)
        reg_model.fit(x_train, y_train, knots)
        y_reg = reg_model.predict(x_test)
    else:
        # Too few points for any interior knot: fall back to a plain cubic fit.
        y_reg = np.polyval(np.polyfit(x_train, y_train, 3), x_test)

    # Plot
    ax.scatter(x_train, y_train, alpha=0.5, s=30, color='gray', zorder=3, label='Data')
    ax.plot(x_test, y_smooth, 'b-', linewidth=2, label='Smoothing Spline', alpha=0.8)
    ax.plot(x_test, y_reg, 'r--', linewidth=2, label='Regular Spline', alpha=0.8)

    ax.set_xlabel('x', fontsize=11)
    ax.set_ylabel('y', fontsize=11)
    ax.set_title(name, fontsize=13)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nObservations:")
print("  - Smoothing splines handle ALL data types well")
print("  - Regular splines sensitive to knot placement")
print("  - Step functions challenge smooth methods (expected)")

## 4. Computational Performance Comparison

In [None]:
import time

# Test on different dataset sizes
sizes = [20, 50, 100, 200]
# Largest n for which this project's smoothing spline is timed; larger sizes
# are recorded as NaN and left out of the plot.
max_smooth_n = 100

times_regular = []
times_smooth = []
times_scipy = []

# Timings use time.perf_counter(): it is the high-resolution clock intended
# for measuring short durations. time.time() is a wall clock whose resolution
# can be coarse enough that a fast fit measures as 0.0, which cannot be drawn
# on the log-scaled axis used below.
for n in sizes:
    x_train, y_train = generate_sinusoidal_data(n, 0.2, (0, 10), random_state=42)

    # Regular spline (knot count grows with n, capped at 6); the timed region
    # covers construction, fit and prediction on the training points.
    knots = np.linspace(2, 8, min(6, n//10))
    start = time.perf_counter()
    reg_model = RegressionSpline(degree=3)
    reg_model.fit(x_train, y_train, knots)
    reg_model.predict(x_train)
    times_regular.append(time.perf_counter() - start)

    if n <= max_smooth_n:  # Only test on smaller datasets
        start = time.perf_counter()
        smooth_model = SmoothingSpline(lambda_=0.1)
        smooth_model.fit(x_train, y_train)
        smooth_model.predict(x_train)
        times_smooth.append(time.perf_counter() - start)
    else:
        times_smooth.append(np.nan)

    # SciPy (fast, uses B-splines)
    start = time.perf_counter()
    scipy_model = UnivariateSpline(x_train, y_train, s=5.0, k=3)
    scipy_model(x_train)
    times_scipy.append(time.perf_counter() - start)

# Pair each measured smoothing-spline time with its own dataset size via a
# boolean mask, so the pairing stays correct wherever the NaN entries fall.
sizes_arr = np.asarray(sizes)
smooth_arr = np.asarray(times_smooth, dtype=float)
measured = ~np.isnan(smooth_arr)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sizes, times_regular, 'o-', linewidth=2, markersize=8, label='Regular Spline')
ax.plot(sizes_arr[measured], smooth_arr[measured],
        's-', linewidth=2, markersize=8, label='Current Smoothing Spline')
ax.plot(sizes, times_scipy, '^-', linewidth=2, markersize=8, label='SciPy UnivariateSpline')

ax.set_xlabel('Dataset size (n)', fontsize=12)
ax.set_ylabel('Time (seconds)', fontsize=12)
ax.set_title('Computational Performance Comparison', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_yscale('log')
plt.show()

print("\nPerformance notes:")
print("  - Regular splines: Fast (simple least squares)")
print("  - Current smoothing spline: Slow (numerical integration for penalty matrix)")
print("  - SciPy: Very fast (optimized C code, B-spline basis)")
print("\n  → For production use, prefer SciPy or implement B-spline basis!")

## 5. Summary and Recommendations

### Method Comparison Summary

| Method | Pros | Cons | When to Use |
|--------|------|------|-------------|
| **Regular Regression Spline** | Simple, fast, interpretable | Requires manual knot selection, boundary variance | Known functional form, few knots needed |
| **Natural Cubic Spline** | Reduced boundary variance, fewer parameters | Still requires knot selection | Extrapolation important, sparse boundaries |
| **Smoothing Spline** | Automatic knots, single tuning parameter | Computationally expensive (with truncated basis) | Unknown functional form, automatic tuning desired |
| **Global Polynomial** | Very simple | Poor local control, numerical instability | Very smooth functions, low degree |

1. **Truncated power basis**:
   - Natural parametrization for degree $k$ with knots $t_1, \dots, t_p$: $g_1(x)=1, g_2(x)=x, \dots, g_{k+1}(x)=x^k, g_{k+1+j}(x)=(x-t_j)_+^k$ for $j=1,\dots,p$
   - Numerically unstable for large datasets
   
2. **B-spline basis** (not implemented here):
   - "A much better computational choice, both for speed and numerical accuracy"
   - "Pretty much the standard in software"
   - Use `scipy.interpolate` for production code

3. **Smoothing splines**:
   - Motivated from a functional perspective: $\min_f \sum_{i=1}^{n} (y_i-f(x_i))^2 + \lambda\int (f''(x))^2\,dx$
   - "Often deliver similar fits to kernel regression...in a sense simpler"
   - Only one tuning parameter (λ) vs kernel regression (bandwidth h + kernel choice)


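**Recommendation:** for production use, prefer `scipy.interpolate.UnivariateSpline` or `statsmodels` over the truncated-power-basis implementations in this notebook.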