# Natural Splines vs Regular Splines

**Natural splines** address the boundary variance problem by:
- Using degree $k$ polynomials between interior knots
- Using degree $(k-1)/2$ polynomials beyond the boundary knots (this requires $k$ to be odd)
- For cubic natural splines ($k=3$): **linear beyond the boundary knots**

A way to remedy this problem is to force the piecewise polynomial function to have a lower degree to the left of the leftmost knot, and to the right of the rightmost knot.

**Key advantage**: With $m$ knots, a natural spline needs only $m$ basis functions (vs. $m+k+1$ for a regular spline of degree $k$ on the same knots).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('../src')

from splines import RegressionSpline, NaturalCubicSpline
from utils import (
    generate_sinusoidal_data,
    plot_spline_fit,
    mean_squared_error,
    r_squared
)

%matplotlib inline
# Select the 'seaborn-v0_8-darkgrid' style sheet for the figures created below.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Direct Comparison: Regular vs Natural Cubic Splines

In [None]:
# --- Training data ---------------------------------------------------------
# 35 evenly spaced points on [2, 8] plus three points in each tail, so the
# regions near x = 0 and x = 10 are sparsely sampled.
np.random.seed(42)
x_train = np.hstack((
    [0.0, 0.5, 1.0],
    np.linspace(2, 8, 35),
    [9.0, 9.5, 10.0],
))
# One period of a sine over [0, 10] plus Gaussian noise (sd = 0.2).
y_train = np.sin(2*np.pi*x_train/10) + np.random.normal(loc=0, scale=0.2, size=x_train.size)

# --- Knots -----------------------------------------------------------------
# The regular spline is given the six interior knots only; the natural spline
# is given those plus the two boundary knots at 0 and 10.
boundary_knots = np.array([0.0, 10.0])
interior_knots = np.linspace(2, 8, 6)
all_knots = np.sort(np.hstack((boundary_knots, interior_knots)))

# --- Fit both models on the same data ----------------------------------------
regular_model = RegressionSpline(degree=3)
regular_model.fit(x_train, y_train, interior_knots)

natural_model = NaturalCubicSpline()
natural_model.fit(x_train, y_train, all_knots)

# --- Evaluate on a dense grid, alongside the noise-free target ---------------
x_test = np.linspace(0, 10, 500)
y_true = np.sin(2*np.pi*x_test/10)
y_regular = regular_model.predict(x_test)
y_natural = natural_model.predict(x_test)

# Two stacked panels (regular on top, natural below). Both show the same
# training points and true curve; only the fitted curve, the knot set drawn,
# the shading colour/label and the title differ, so those go in a spec table.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))

panel_specs = [
    {
        'fit': y_regular,
        'fmt': 'b-',
        'fit_label': 'Regular cubic spline',
        'knots': interior_knots,
        'shade': 'orange',
        'shade_label': 'Boundary regions',
        'title': 'Regular Cubic Spline (may have high variance at boundaries)',
    },
    {
        'fit': y_natural,
        'fmt': 'purple',
        'fit_label': 'Natural cubic spline',
        'knots': all_knots,
        'shade': 'lightblue',
        'shade_label': 'Linear beyond boundaries',
        'title': 'Natural Cubic Spline (linear beyond boundaries → stable)',
    },
]
last_row = len(panel_specs) - 1

for row, (ax, spec) in enumerate(zip(axes, panel_specs)):
    ax.scatter(x_train, y_train, alpha=0.5, s=40, label='Training data', color='gray', zorder=3)
    ax.plot(x_test, spec['fit'], spec['fmt'], linewidth=2.5, label=spec['fit_label'])
    ax.plot(x_test, y_true, 'g--', alpha=0.5, linewidth=2, label='True function')

    # Faint vertical guide at every knot handed to this panel's model.
    for knot in spec['knots']:
        ax.axvline(knot, color='r', linestyle='--', alpha=0.2)

    # Shaded bands over the two tails; only the first band carries a legend entry.
    ax.axvspan(-0.5, 1.5, alpha=0.1, color=spec['shade'], label=spec['shade_label'])
    ax.axvspan(8.5, 10.5, alpha=0.1, color=spec['shade'])

    # Only the bottom panel gets an x-axis label.
    if row == last_row:
        ax.set_xlabel('x', fontsize=12)
    ax.set_ylabel('y', fontsize=12)
    ax.set_title(spec['title'], fontsize=13)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

# Report parameter counts read from the fitted models themselves, so the
# printed numbers reflect what was actually estimated.
# Regular cubic spline: the left-hand number is the expected count,
# (#interior knots) + degree + 1; the right-hand number is the fitted count.
print(f"Regular spline: {len(interior_knots) + 3 + 1} = {len(regular_model.coefficients)} parameters")
# Natural spline: read the count from the model as well, rather than assuming
# it equals the number of knots passed to fit().
print(f"Natural spline: {len(natural_model.coefficients)} parameters (fewer!)")

## 2. Zoom In on Boundary Behavior

Let's examine the boundary regions more closely.

In [None]:
# Side-by-side close-ups of the two ends of the fitted curves: the left panel
# keeps x <= 2.5, the right panel keeps x >= 7.5.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Selections on the dense prediction grid.
mask_left = x_test <= 2.5
mask_right = x_test >= 7.5

# (grid selection, training-point selection, x of the boundary marker, title)
zoom_specs = (
    (mask_left, x_train <= 2.5, 0, 'Left Boundary (x ∈ [0, 2.5])'),
    (mask_right, x_train >= 7.5, 10, 'Right Boundary (x ∈ [7.5, 10])'),
)

for ax, (grid_sel, train_sel, edge_x, title) in zip(axes, zoom_specs):
    ax.scatter(x_train[train_sel], y_train[train_sel],
               alpha=0.7, s=60, color='gray', zorder=3, label='Data')

    x_zoom = x_test[grid_sel]
    ax.plot(x_zoom, y_regular[grid_sel], 'b-', linewidth=3, label='Regular spline')
    ax.plot(x_zoom, y_natural[grid_sel], 'purple', linewidth=3, label='Natural spline', linestyle='--')
    ax.plot(x_zoom, y_true[grid_sel], 'g:', linewidth=2, label='True', alpha=0.7)

    # Dotted red line at the boundary knot for this side.
    ax.axvline(edge_x, color='red', linestyle=':', alpha=0.5, label='Boundary')

    ax.set_xlabel('x', fontsize=12)
    ax.set_ylabel('y', fontsize=12)
    ax.set_title(title, fontsize=13)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Variance Reduction at Boundaries

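Demonstrate variance reduction by refitting both models on many simulated datasets: the x values stay fixed and fresh noise is drawn each round (a simulation analogue of bootstrap resampling).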

In [None]:
# Estimate the pointwise variance of each model's prediction by refitting on
# many noisy realisations of the same design.
np.random.seed(42)
n_samples = 40  # NOTE(review): never read in this cell; x_base below has 3 + 30 + 3 = 36 points
x_base = np.concatenate([
    np.array([0.0, 0.5, 1.0]),
    np.linspace(2, 8, 30),
    np.array([9.0, 9.5, 10.0])
])


def true_func(x):
    """Return the noise-free target: one sine period over [0, 10]."""
    return np.sin(2*np.pi*x/10)


# Pin the knot sets inside this cell instead of relying on whatever another
# cell last assigned: the loop in section 4 rebinds `interior_knots` and
# `all_knots`, so re-running this cell afterwards would otherwise use a
# different knot set. The values match the ones defined in section 1.
interior_knots = np.linspace(2, 8, 6)
all_knots = np.sort(np.concatenate([[0.0, 10.0], interior_knots]))

n_bootstrap = 50  # number of simulated datasets (fresh noise each time, x fixed)
x_test = np.linspace(0, 10, 500)

y_regular_samples = []
y_natural_samples = []

for _ in range(n_bootstrap):
    # New noise draw on the fixed design; both models see the same y_train.
    y_train = true_func(x_base) + np.random.normal(0, 0.2, len(x_base))

    regular_model = RegressionSpline(degree=3)
    regular_model.fit(x_base, y_train, interior_knots)
    y_regular_samples.append(regular_model.predict(x_test))

    natural_model = NaturalCubicSpline()
    natural_model.fit(x_base, y_train, all_knots)
    y_natural_samples.append(natural_model.predict(x_test))

# Stack to (n_bootstrap, len(x_test)) so axis 0 indexes the simulated datasets.
y_regular_samples = np.array(y_regular_samples)
y_natural_samples = np.array(y_natural_samples)

# Variance across datasets at each x_test location (np.var default, ddof=0).
var_regular = np.var(y_regular_samples, axis=0)
var_natural = np.var(y_natural_samples, axis=0)

# Pointwise prediction variance of both models on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))

variance_curves = (
    (var_regular, 'b-', 'Regular spline variance'),
    (var_natural, 'purple', 'Natural spline variance'),
)
for curve, fmt, curve_label in variance_curves:
    ax.plot(x_test, curve, fmt, linewidth=2, label=curve_label)

# Shade the two tail regions, [0, 2] and [8, 10].
for band_start, band_end in ((0, 2), (8, 10)):
    ax.axvspan(band_start, band_end, alpha=0.1, color='orange')

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('Variance', fontsize=12)
ax.set_title(f'Prediction Variance from {n_bootstrap} Bootstrap Samples', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
plt.show()

# Split the prediction grid into the tails (x < 2 or x > 8) and the
# interior (2 <= x <= 8), then summarise the average variance in each.
boundary_mask = np.logical_or(x_test < 2, x_test > 8)
interior_mask = np.logical_and(x_test >= 2, x_test <= 8)

boundary_regular = var_regular[boundary_mask].mean()
boundary_natural = var_natural[boundary_mask].mean()
interior_regular = var_regular[interior_mask].mean()
interior_natural = var_natural[interior_mask].mean()

print("\nAverage variance at BOUNDARIES:")
print(f"  Regular: {boundary_regular:.4f}")
print(f"  Natural: {boundary_natural:.4f}")
# Relative drop in tail variance when switching from regular to natural.
print(f"  Reduction: {(1 - boundary_natural/boundary_regular)*100:.1f}%")

print("\nAverage variance in INTERIOR:")
print(f"  Regular: {interior_regular:.4f}")
print(f"  Natural: {interior_natural:.4f}")

## 4. Parameter Efficiency

Natural splines use fewer parameters while maintaining good fit.

In [None]:
# Sweep the number of interior knots and record, for each model, how many
# coefficients it fitted and its mean squared error on the training data.
np.random.seed(123)
x_train, y_train = generate_sinusoidal_data(n_samples=50, noise_std=0.2, x_range=(0, 10))
x_test = np.linspace(0, 10, 500)  # dense grid; not read again in this cell

n_knots_list = [4, 6, 8, 10]

results = []
for n_knots in n_knots_list:
    # Interior knots evenly spread over [2, 8]; the natural spline also gets
    # the boundary knots 0 and 10.
    interior_knots = np.linspace(2, 8, n_knots)
    all_knots = np.sort(np.concatenate(([0, 10], interior_knots)))

    # Regular cubic regression spline on the interior knots.
    reg_model = RegressionSpline(degree=3)
    reg_model.fit(x_train, y_train, interior_knots)
    reg_train_pred = reg_model.predict(x_train)
    mse_reg = mean_squared_error(y_train, reg_train_pred)

    # Natural cubic spline on interior + boundary knots.
    nat_model = NaturalCubicSpline()
    nat_model.fit(x_train, y_train, all_knots)
    nat_train_pred = nat_model.predict(x_train)
    mse_nat = mean_squared_error(y_train, nat_train_pred)

    row = {
        'n_knots': n_knots,
        'regular_params': len(reg_model.coefficients),
        'natural_params': len(nat_model.coefficients),
        'regular_mse': mse_reg,
        'natural_mse': mse_nat
    }
    results.append(row)

# Summary table of the knot sweep. The header and the data rows use the same
# layout (10 | 7 + " | " + 10 | 7 + " | " + 10 | 30) so every column, including
# the "Params | MSE" sub-columns, starts at the same character position.
print("\n" + "="*80)
print(f"{'Knots':<10} {'Regular':<20} {'Natural':<20} {'MSE Comparison':<30}")
print(f"{'(m)':<10} "
      f"{'Params':<7} | {'MSE':<10} "
      f"{'Params':<7} | {'MSE':<10} "
      f"{'Regular vs Natural':<30}")
print("="*80)
for r in results:
    # Last column is the ratio regular_mse / natural_mse.
    print(f"{r['n_knots']:<10} "
          f"{r['regular_params']:<7} | {r['regular_mse']:<10.4f} "
          f"{r['natural_params']:<7} | {r['natural_mse']:<10.4f} "
          f"{r['regular_mse']/r['natural_mse']:<30.2f}")

## Key Takeaways

1. **Natural splines** are linear beyond boundary knots → reduced variance at boundaries
2. With $m$ knots, they use only $m$ basis functions (vs. $m+k+1$ for regular splines of degree $k$) → more parameter efficient
3. Provide more **stable extrapolation** beyond the data range
4. Particularly useful when data is sparse near boundaries
5. For cubic natural splines: linear extrapolation is often more reasonable than cubic