In [1]:
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing

import nonlinear_regression as nlr

# 3. Nonlinear Regression
We now take a look at a regression problem where a linear function is not a good model of the data.

## 3.1 Load and Visualize the Dataset

In [None]:
# Load the dataset through the course helper module, passing noise=0.1.
# NOTE(review): the meaning of `noise` is defined in nonlinear_regression -- confirm there.
X, y = nlr.load_nonlinear_cos_dataset(noise=0.1)
# Visualize the loaded samples (presumably a scatter plot of y against X).
nlr.scatter_data(X, y)

## 3.2 Fitting a Polynomial to the Data
The data appear to follow a nonlinear function. Let's try a polynomial feature transformation again. Try changing the parameter `degree` from 1 to 30 and see what happens.

In [None]:
# Polynomial degree of the feature transformation -- vary this between 1 and 30.
degree = 1

# Model: polynomial feature expansion followed by a linear regression.
poly = sklearn.preprocessing.PolynomialFeatures(degree=degree)
lr = sklearn.linear_model.LinearRegression()
pipe = sklearn.pipeline.make_pipeline(poly, lr)
pipe.fit(X, y)

nlr.plot_prediction(X, y, pipe)

# The error is measured on the same data the model was fitted on (no held-out set here).
mae = sklearn.metrics.mean_absolute_error(y, pipe.predict(X))
# Report the degree with the error so runs with different degrees can be told apart.
print(f"The MAE of the LR(degree={poly.degree}) model is {mae}")

### Q3.2
1. For which polynomial order (parameter `degree`) do you measure the smallest Mean Absolute Error (MAE)?
2. Which polynomial order do you think fits the data the best?
3. What happens if `degree` is large, e.g., larger than 25?
4. What happens if `degree` is small, e.g., one, two, or three?

## 3.3 Train-Test Split
To detect under- and overfitting with metrics, we need to split our entire dataset into two parts: a training set and a test set. The training set is used to train our model. The test set is used to measure how well the model performs on data it has not seen during training. Let's repeat the process from above, except that this time we use a training set and a test set.

In [None]:
# Draw the training and test samples with different sizes and seeds.
X_train, y_train = nlr.load_nonlinear_pol_dataset(n=30, seed=1)
X_test, y_test = nlr.load_nonlinear_pol_dataset(n=100, seed=2)

# Side-by-side panels: training data on the left, test data on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
train_ax, test_ax = axes
train_ax.set_title("Train data")
test_ax.set_title("Test data")
nlr.scatter_data(X_train, y_train, train_ax)
nlr.scatter_data(X_test, y_test, test_ax)

In [None]:
# Build the model: polynomial feature expansion feeding a linear regression.
poly = sklearn.preprocessing.PolynomialFeatures(degree=1)
lr = sklearn.linear_model.LinearRegression()

# Fit on the training split only; the test split stays untouched until evaluation.
pipe = sklearn.pipeline.make_pipeline(poly, lr).fit(X_train, y_train)

# Show the fitted model against the test samples.
nlr.plot_prediction(X_test, y_test, pipe)

# Score both splits so the training error and the test error can be compared.
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
mae_train = sklearn.metrics.mean_absolute_error(y_train, y_train_pred)
mae_test = sklearn.metrics.mean_absolute_error(y_test, y_test_pred)
print(f"The MAE of the LR(degree={poly.degree}) model on the train set is {mae_train}")
print(f"The MAE of the LR(degree={poly.degree}) model on the test set is {mae_test}")

### Q3.3
1. Why should you split your dataset into a train and test set?
2. Which polynomial degree gives the lowest error on the test set? Is it lower or higher than the degree that gives the lowest error on the train set?
3. How can you detect overfitting / underfitting with error metrics?