# Parametric and non-parametric methods

Main takeaways:

1. Parametric regression works with smaller datasets, whereas nonparametric regression need larger datasets to capture patterns
2. In both cases there is some point at which increasing the level of complexity in the model leads to overfitting
3. Other things equal, nonparametric methods allow for more complexity before overfitting the larger the dataset  
4. The flexibility of a learner (its ability to capture different functional forms) is related to its functional class $f$: more flexible models are more complex models

We start by loading libraries and generating a synthetic dataset:

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Build a small synthetic sample: a noisy sine wave observed on [-3, 3]
np.random.seed(42)
# Add a trailing axis so the feature is a 2D array with a single column, shape (50, 1)
X = np.linspace(-3, 3, 50)[:, np.newaxis]
# True function sin(x) plus Gaussian noise (sd 0.8); ravel() flattens the (50, 1) array to 1D
y = np.sin(X).ravel() + np.random.normal(loc=0, scale=0.8, size=X.shape[0])
# Equivalent alternative: select the single column instead of flattening, i.e. np.sin(X[:, 0])

# Hold out 30% of the observations as a test set; the fixed random_state
# makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


## 1. Parametric regression

In [None]:
# ---- PARAMETRIC ESTIMATION ----
# Fit polynomial regressions of increasing degree on the training split and
# compare each fitted curve with the true function sin(x), one panel per degree.
plt.figure(figsize=(12, 5))
degrees = [1, 3, 15]  # Polynomial degrees, from low to very high complexity

for i, d in enumerate(degrees, 1):
    # Polynomial feature expansion up to degree d, fitted on the training split only
    poly = PolynomialFeatures(degree=d)
    X_poly = poly.fit_transform(X_train)

    # Linear regression on the expanded features
    model = LinearRegression()
    model.fit(X_poly, y_train)

    # Evaluate the fitted polynomial on the full grid X (train and test
    # locations) so the curve is drawn at every sample location instead of
    # being linearly interpolated between the scattered test points.
    y_pred = model.predict(poly.transform(X))

    # One column per degree, so the layout follows the `degrees` list
    plt.subplot(1, len(degrees), i)
    plt.scatter(X_train, y_train, color='gray', label='Train Data')
    plt.scatter(X_test, y_test, color='red', label='Test Data')
    # X was built with np.linspace, so it is already in ascending order for line plots
    plt.plot(X, y_pred, label=f'Degree {d}', lw=2)
    plt.plot(X, np.sin(X), label='True Function', linestyle='dashed', color='black')
    plt.title(f'Polynomial Regression (Degree {d})')
    plt.legend()

plt.tight_layout()
plt.show()

## 2. Non-parametric: regression tree

In [None]:
# ---- NONPARAMETRIC: Regression Tree ----
# Fit regression trees of increasing depth on the training split and compare
# each fitted step function with the true function sin(x), one panel per depth.
plt.figure(figsize=(12, 5))
max_depths = [1, 3, 5]  # Tree depths; reused by the later sample-size cells

for i, depth in enumerate(max_depths, 1):
    tree = DecisionTreeRegressor(max_depth=depth)
    tree.fit(X_train, y_train)

    # Predict over the full X grid for visualization, so the fit is drawn at
    # every sample location rather than interpolated between test points.
    y_pred = tree.predict(X)

    # One column per depth, so the layout follows the `max_depths` list
    plt.subplot(1, len(max_depths), i)
    plt.scatter(X_train, y_train, color='gray', label='Train Data')
    plt.scatter(X_test, y_test, color='red', label='Test Data')
    plt.plot(X, np.sin(X), label='True Function', linestyle='dashed', color='black')
    # X was built with np.linspace, so it is already in ascending order for line plots
    plt.plot(X, y_pred, label=f'Regression Tree (depth={depth})', lw=2)
    plt.title(f'Regression Tree with depth={depth}')
    plt.legend()

plt.tight_layout()
plt.show()

## 3. Sample size in non-parametric regression

In [None]:
# Generate a bigger synthetic dataset (n=500) from the same noisy sine process
np.random.seed(42)
X = np.linspace(-3, 3, 500).reshape(-1,1)  # Feature as a 2D array with one column
y = np.sin(X).ravel() + np.random.normal(0, .8, X.shape[0])  # True function + noise

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

plt.figure(figsize=(12, 5))

# `max_depths` is reused from the earlier regression-tree cell
for i, depth in enumerate(max_depths, 1):
    tree = DecisionTreeRegressor(max_depth=depth)
    tree.fit(X_train, y_train)

    # Predict over the full X grid for visualization, so the fit is drawn at
    # every sample location rather than interpolated between test points.
    y_pred = tree.predict(X)

    # One column per depth, so the layout follows the `max_depths` list
    plt.subplot(1, len(max_depths), i)
    # Low alpha keeps the dense point clouds from hiding the curves
    plt.scatter(X_train, y_train, color='gray', label='Train Data',alpha=0.2)
    plt.scatter(X_test, y_test, color='red', label='Test Data',alpha=0.2)
    plt.plot(X, np.sin(X), label='True Function', linestyle='dashed', color='black')
    # X is built with np.linspace above, so it is already in ascending order
    plt.plot(X, y_pred, label=f'Regression Tree (depth={depth})', lw=2)
    plt.title(f'Regression Tree with depth={depth}')
    plt.legend()

plt.tight_layout()
plt.show()

Even bigger dataset (n=50000)

In [None]:
# Generate an even bigger synthetic dataset (n=50000) from the same noisy sine process
np.random.seed(42)
X = np.linspace(-3, 3, 50000).reshape(-1,1)  # Feature as a 2D array with one column
y = np.sin(X).ravel() + np.random.normal(0, .8, X.shape[0])  # True function + noise

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

plt.figure(figsize=(12, 5))

# `max_depths` is reused from the earlier regression-tree cell
for i, depth in enumerate(max_depths, 1):
    tree = DecisionTreeRegressor(max_depth=depth)
    tree.fit(X_train, y_train)

    # Predict over the full X grid for visualization, so the fit is drawn at
    # every sample location rather than interpolated between test points.
    y_pred = tree.predict(X)

    # One column per depth, so the layout follows the `max_depths` list
    plt.subplot(1, len(max_depths), i)
    # Very low alpha keeps the dense point clouds from hiding the curves
    plt.scatter(X_train, y_train, color='gray', label='Train Data',alpha=0.05)
    plt.scatter(X_test, y_test, color='red', label='Test Data',alpha=0.05)
    plt.plot(X, np.sin(X), label='True Function', linestyle='dashed', color='black')
    # X is built with np.linspace above, so it is already in ascending order
    plt.plot(X, y_pred, label=f'Regression Tree (depth={depth})', lw=2)
    plt.title(f'Regression Tree with depth={depth}')
    plt.legend()

plt.tight_layout()
plt.show()