# Regression Splines

## Mathematical Background

Given samples $(x_i, y_i)$ for $i=1,\dots,n$, we estimate the regression function $r(x) = E(Y|X=x)$ by fitting a $k$-th order spline with knots at prespecified locations $t_1, \dots, t_m$.

We minimize:
$$\sum_{i=1}^n \left(y_i - \sum_{j=1}^{m+k+1} \beta_j g_j(x_i)\right)^2 = \|y - G\beta\|_2^2$$

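Here $g_1, \dots, g_{m+k+1}$ are the spline basis functions and $G \in \mathbb{R}^{n \times (m+k+1)}$ is the basis matrix with entries $G_{ij} = g_j(x_i)$.

Solution (when $G$ has full column rank): $\hat{\beta} = (G^T G)^{-1} G^T y$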

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('../src')

from splines import RegressionSpline
from utils import (
    generate_sinusoidal_data, 
    generate_polynomial_data,
    plot_spline_fit, 
    mean_squared_error, 
    r_squared
)

%matplotlib inline
# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to the figures drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 — confirm.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Fit Cubic Regression Spline to Sinusoidal Data

In [None]:
# Fit one cubic regression spline to a noisy sinusoid, plot it, and report
# training metrics.

# Seed NumPy's global RNG so the noisy sample is reproducible
np.random.seed(42)
x_train, y_train = generate_sinusoidal_data(n_samples=50, noise_std=0.2,
                                            x_range=(0, 10), frequency=1.0)

# Five interior knots, spaced more densely around the middle of the x-range
knots = np.array([2.0, 4.0, 5.0, 6.0, 8.0])

# Spline degree k, named once so the fit and the parameter count printed
# at the end of the cell cannot drift apart
degree = 3
model = RegressionSpline(degree=degree)
model.fit(x_train, y_train, knots)

# Predict on a fine grid so the plotted curve looks smooth
x_test = np.linspace(0, 10, 500)
y_pred = model.predict(x_test)


def y_true_func(x):
    """Return the noise-free reference curve sin(2*pi*x/10) evaluated at x.

    NOTE(review): assumes generate_sinusoidal_data(frequency=1.0,
    x_range=(0, 10)) samples around this same curve -- confirm against utils.
    """
    return np.sin(2 * np.pi * x / 10)


# Plot data, fitted spline, knot locations and the reference curve
fig = plot_spline_fit(x_train, y_train, x_test, y_pred, knots,
                      y_true_func=y_true_func,
                      title="Cubic Regression Spline: Sinusoidal Data")
plt.show()

# Training-set metrics (evaluated on the same points used for fitting)
y_pred_train = model.predict(x_train)
mse = mean_squared_error(y_train, y_pred_train)
r2 = r_squared(y_train, y_pred_train)

print(f"Training MSE: {mse:.4f}")
print(f"Training R²: {r2:.4f}")
print(f"Number of knots: {len(knots)}")
# Expected count m + k + 1 on the left, the model's actual coefficient count on the right
print(f"Number of parameters: {len(knots) + degree + 1} = {len(model.coefficients)}")

## 2. Effect of Number of Knots

More knots → more flexibility → better training fit (but higher risk of overfitting)

In [None]:
# Refit a cubic spline to the same noisy sample with 1, 3, 5 and 10 knots
# and show the four fits side by side in a 2x2 grid.
np.random.seed(42)
x_train, y_train = generate_sinusoidal_data(
    n_samples=50, noise_std=0.2, x_range=(0, 10)
)
x_test = np.linspace(0, 10, 500)

# Reference curve drawn in every panel; it does not depend on the knots
y_true = np.sin(2*np.pi*x_test/10)

# (knot locations, panel label) pairs, ordered from least to most flexible
knot_configs = [
    (np.array([5.0]), "1 knot"),
    (np.linspace(2, 8, 3), "3 knots"),
    (np.linspace(2, 8, 5), "5 knots"),
    (np.linspace(1, 9, 10), "10 knots"),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the 2x2 grid so it pairs up with knot_configs

for ax, (knots, label) in zip(axes, knot_configs):
    # One cubic fit per knot configuration
    model = RegressionSpline(degree=3)
    model.fit(x_train, y_train, knots)
    y_pred = model.predict(x_test)

    # Training-set metrics shown in the panel title
    y_pred_train = model.predict(x_train)
    mse = mean_squared_error(y_train, y_pred_train)
    r2 = r_squared(y_train, y_pred_train)
    panel_title = f'{label}\nMSE={mse:.4f}, R²={r2:.4f}'

    # Data, fitted spline, then the reference curve
    ax.scatter(x_train, y_train, alpha=0.5, s=30, label='Data', color='gray')
    ax.plot(x_test, y_pred, 'b-', linewidth=2, label='Spline')
    ax.plot(x_test, y_true, 'g--', alpha=0.5, label='True')

    # Faint vertical line at each knot location
    for knot in knots:
        ax.axvline(knot, color='r', linestyle='--', alpha=0.3)

    ax.set_title(panel_title, fontsize=11)
    ax.set(xlabel='x', ylabel='y')
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Effect of Polynomial Degree

Compare linear (degree=1), quadratic (degree=2), and cubic (degree=3) splines.

In [None]:
# Fit linear, quadratic and cubic splines to the same sample with a fixed
# knot set and compare them in a 1x3 grid.
np.random.seed(42)
x_train, y_train = generate_polynomial_data(n_samples=50, degree=3, noise_std=0.3)
x_test = np.linspace(-1, 1, 500)

# Same four knots for every degree, so only the degree varies between panels
knots = np.linspace(-0.6, 0.6, 4)

# Spline degrees to compare, with their display names.
# The name table is loop-invariant, so it is built once here rather than
# on every iteration.
degrees = [1, 2, 3]
degree_names = {1: 'Linear', 2: 'Quadratic', 3: 'Cubic'}

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, degree in zip(axes, degrees):
    model = RegressionSpline(degree=degree)
    model.fit(x_train, y_train, knots)
    y_pred = model.predict(x_test)

    # Training-set MSE shown in the panel title
    y_pred_train = model.predict(x_train)
    mse = mean_squared_error(y_train, y_pred_train)

    # Data and fitted spline
    ax.scatter(x_train, y_train, alpha=0.5, s=30, color='gray')
    ax.plot(x_test, y_pred, 'b-', linewidth=2)

    # Faint vertical line at each knot location
    for knot in knots:
        ax.axvline(knot, color='r', linestyle='--', alpha=0.3)

    ax.set_title(f'{degree_names[degree]} Spline (k={degree})\nMSE={mse:.4f}', fontsize=12)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Boundary Behavior Problem

One problem with regression splines is that the estimates tend to display erratic behavior, i.e., they have high variance, at the boundaries.

Let's demonstrate this issue.

In [None]:
# Show how a cubic regression spline behaves where training data are sparse:
# few samples near both ends of [0, 10], many in the interior.
np.random.seed(42)

# Sample layout: 5 points on each edge, 40 points across the middle
left_edge = np.linspace(0, 1, 5)
interior = np.linspace(1.5, 8.5, 40)
right_edge = np.linspace(9, 10, 5)
x_train = np.concatenate([left_edge, interior, right_edge])

# Sinusoid plus Gaussian noise (std 0.2) drawn from the seeded global RNG
noise = np.random.normal(0, 0.2, len(x_train))
y_train = np.sin(2*np.pi*x_train/10) + noise

# Eight evenly spaced knots on [2, 8]; none lie inside the shaded boundary
# regions, so the fit there is governed by the outermost polynomial pieces
knots = np.linspace(2, 8, 8)

model = RegressionSpline(degree=3)
model.fit(x_train, y_train, knots)

# Fine grid for a smooth curve, plus the noise-free reference on the same grid
x_test = np.linspace(0, 10, 500)
y_pred = model.predict(x_test)
y_true = np.sin(2*np.pi*x_test/10)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(
    x_train, y_train,
    alpha=0.6, s=40, label='Training data', color='gray', zorder=3,
)
ax.plot(x_test, y_pred, 'b-', linewidth=2, label='Regression spline')
ax.plot(x_test, y_true, 'g--', alpha=0.5, linewidth=2, label='True function')

# Faint vertical line at each knot location
for knot in knots:
    ax.axvline(knot, color='r', linestyle='--', alpha=0.2)

# Shade the sparsely sampled ends; only the first span carries a legend entry
ax.axvspan(0, 1.5, alpha=0.1, color='red', label='Boundary regions')
ax.axvspan(8.5, 10, alpha=0.1, color='red')

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title('Boundary Variance Problem in Regression Splines', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.show()

print("Notice: The spline may exhibit erratic behavior (high variance) at the boundaries!")
print("This motivates the use of NATURAL SPLINES (next notebook).")

## 5. Comparison with Polynomial Regression

Splines are more flexible than global polynomials.

In [None]:
# Compare a cubic regression spline with a single global polynomial that
# has the same number of coefficients.
np.random.seed(123)
x_train, y_train = generate_sinusoidal_data(n_samples=50, noise_std=0.25, x_range=(0, 10))
x_test = np.linspace(0, 10, 500)

# Fit the regression spline; the degree is named once so the parameter
# count below stays in sync with the model
spline_degree = 3
knots = np.linspace(2, 8, 5)
spline_model = RegressionSpline(degree=spline_degree)
spline_model.fit(x_train, y_train, knots)
y_spline = spline_model.predict(x_test)

# Fit a global polynomial with the same number of coefficients as the spline:
# m + k + 1 = 5 + 3 + 1 = 9 parameters, i.e. a degree-8 polynomial
n_params_spline = len(knots) + spline_degree + 1  # m + k + 1
poly_degree = n_params_spline - 1
# NOTE(review): np.polyfit on raw x in [0, 10] at this degree may be poorly
# conditioned; NumPy's docs recommend np.polynomial.Polynomial.fit for new
# code -- consider switching if a conditioning warning appears.
poly_coeffs = np.polyfit(x_train, y_train, poly_degree)
y_poly = np.polyval(poly_coeffs, x_test)

# Plot both fits against the data and the noise-free reference curve
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(x_train, y_train, alpha=0.5, s=30, label='Data', color='gray', zorder=3)
ax.plot(x_test, y_spline, 'b-', linewidth=2, label=f'Regression spline ({n_params_spline} params)')
ax.plot(x_test, y_poly, 'r-', linewidth=2, label=f'Global polynomial (degree {poly_degree})', alpha=0.7)
ax.plot(x_test, np.sin(2*np.pi*x_test/10), 'g--', alpha=0.5, linewidth=2, label='True function')

# Faint vertical line at each spline knot
for knot in knots:
    ax.axvline(knot, color='b', linestyle='--', alpha=0.2)

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title('Regression Spline vs Global Polynomial', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
# Fixed y-limits keep any large polynomial excursions from dominating the axes
ax.set_ylim(-2, 2)
plt.show()

# Compare training errors of the two fits
mse_spline = mean_squared_error(y_train, spline_model.predict(x_train))
mse_poly = mean_squared_error(y_train, np.polyval(poly_coeffs, x_train))

print("\nTraining MSE:")
print(f"  Spline: {mse_spline:.4f}")
print(f"  Polynomial: {mse_poly:.4f}")
print("\nSplines provide LOCAL flexibility, polynomials are GLOBAL.")

## Key Takeaways

1. **Regression splines** fit splines to data via least squares: $\hat{\beta} = (G^T G)^{-1} G^T y$
2. More knots → more flexibility → better training fit (but potential overfitting)
3. Higher polynomial degree → smoother fit (a degree-$k$ spline has $k-1$ continuous derivatives at the knots) but more parameters
4. **Main limitation**: High variance at boundaries (solved by natural splines)
5. **Splines vs polynomials**: Splines offer local control, polynomials are global