In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Task 1: Cross-Validation with Ridge Regression

You are given data from the following function:
$f(x) = -0.1x^3 + x^2 - 0.1x + 1$.

However, your data set is highly noisy and unfortunately also seems to be quite unbalanced and small.

In this exercise your task is to apply ridge regression to approximate the data distribution. In ridge regression the regularization term $\alpha \sum_{j=1}^p \beta_j^2$ is added to the error function.
The main goal of this task is to optimize the hyperparameter $\alpha$ using k-fold cross validation.

First we load the data. You do not have to change anything here.

In [None]:
def f(X, noisy=False):
  """
  Evaluates the polynomial 1 - 0.1*x + x^2 - 0.1*x^3 element-wise
  @param X: numpy array of x-values (any shape)
  @param noisy: If True, noise drawn from a normal distribution with
                mean 0 and standard deviation 1 (global numpy RNG) is added
  @return: array of function values with the same shape as X
  """
  # Coefficients in ascending order of power: entry k multiplies X**k
  coefficients = (1, -.1, 1, -.1)

  values = np.zeros(X.shape)
  for power, coefficient in enumerate(coefficients):
    values += coefficient * X**power

  if not noisy:
    return values

  # One noise sample per entry of X
  return values + np.random.normal(0, 1, X.shape)

In [None]:
# Seed the global numpy RNG so that the generated data set is reproducible.
# All random draws below consume this one stream, so their order matters.
np.random.seed(2)
# Base size used to scale the per-interval sample counts below.
# NOTE: the concatenated X holds 75 + 2 + 37 + 37 + 3 + 75 = 229 samples,
# i.e. more than data_points.
data_points = 150
# Number of randomly selected outlier indices
n_outliers = 50

# Generating synthetic data: x-values are sampled interval by interval.
# Many samples near the borders, only 2 samples in [-5, -4) and 3 samples in
# [1.5, 3.9), and no samples at all in [-1, 0) and [3.9, 4).
X = np.concatenate((np.random.uniform(-6, -5, int(data_points/2)),
                    np.random.uniform(-5, -4, 2),
                    np.random.uniform(-4, -1, int(data_points/4)),
                    np.random.uniform(0, 1.5, int(data_points/4)),
                    np.random.uniform(1.5, 3.9, 3),
                    np.random.uniform(4, 6, int(data_points/2))))

# Shuffle in place so that the sample order is random, then evaluate the noisy
# function on the 1-D array (y is therefore 1-D) and turn X into a column vector.
np.random.shuffle(X)
y = f(X, noisy=True)
X = X[:, np.newaxis]

# Introduce outliers
# Random outliers: indices are drawn from range(data_points), i.e. only from
# the first data_points entries of the (shuffled) sample array.
outlier_indices = np.random.choice(data_points, n_outliers, replace=False)
# Additionally every sample with 1 < x < 4 or -5 < x < -4 becomes an outlier.
# X is 2-D here, so np.where returns (row, column) indices; [0] keeps the rows.
outlier_indices = np.concatenate((outlier_indices,
                                  np.where(np.logical_and(X < 4, X > 1))[0],
                                  np.where(np.logical_and(X > -5, X < -4))[0]))
# Add normal noise with standard deviation 10 to the selected labels.
# An index that occurs more than once in outlier_indices is perturbed only once.
y[outlier_indices] += np.random.normal(0, 10, outlier_indices.shape[0])

# generate test/holdout data set to compare cross validation
# and train-test-split validation at the end
# (100 x-values drawn uniformly from [-6, 6), stored as a column vector;
# y_holdout has the same (100, 1) shape because f keeps the shape of its input)
X_holdout = np.random.uniform(-6, 6, 100)[:, np.newaxis]
y_holdout = f(X_holdout, noisy=True)

# Degree of the polynomial feature expansion used by the regression models.
# With the bias column this gives degree + 1 features for the single input.
degree = 10
poly_features = PolynomialFeatures(degree=degree, include_bias=True)

# number of folds
num_folds = 5

# plot data
# Generate a dense set of X values for smooth plotting
X_dense = np.linspace(X.min(), X.max(), 500)[:, np.newaxis]
# Noise-free function values on the dense grid
y_dense = f(X_dense)

plt.figure()
plt.scatter(X, y, color='black', label='Training data')
plt.scatter(X_holdout, y_holdout, color='blue', alpha=0.4, label='Holdout data')
plt.plot(X_dense, y_dense, color='red', label='True function')
plt.legend()
plt.title('Training Data and True Function')
plt.xlabel('X')
plt.ylabel('y')
plt.show()

The following function will be used later to run your implementation of k-fold cross validation with different values of alpha. You do not have to change anything in this function.

In [None]:
def test_alphas(cross_validation=True, verbose=False):
  """
  Scores ridge regression models for a grid of 200 alphas, logarithmically
  spaced between 1e-5 and 1e4, and reports the alpha with the lowest loss
  @param cross_validation: Whether to use cross validation or one test and one train split
  @param verbose: If True, the loss of every tested alpha is printed
  @return: alphas, losses, best_alpha, best_loss
  """
  # Fix the global numpy seed so that every call starts from the same random state
  np.random.seed(1)
  num_alphas = 200
  alphas = np.logspace(-5, 4, num_alphas)

  losses = []
  best_alpha, min_loss = None, np.inf

  for alpha in alphas:
    # fresh pipeline (polynomial features + ridge) for the current alpha
    model = make_pipeline(poly_features, Ridge(alpha=alpha, solver='svd'))

    # k-fold cross validation or a single train-test-split, depending on the flag
    score = (k_fold_cross_validation(X, y, num_folds, model)
             if cross_validation
             else train_test_split_validation(X, y, model))

    if verbose:
      print(f"alpha: {alpha}, mse loss: {score}")
    losses.append(score)

    # strict comparison: on ties the first (smallest) alpha is kept
    if score < min_loss:
      best_alpha, min_loss = alpha, score

  print("-------------------------------------------------------------------------------------")
  print(f"Best alpha = {best_alpha} (log(alpha) = {np.log10(best_alpha)}), mse loss {min_loss}")

  return alphas, losses, best_alpha, min_loss


a) Complete the function below so that it implements k-fold cross validation on the data set (X, y) for the given model. You are free to implement the ridge regression yourself. However, as our main goal is to implement cross-validation and not ridge regression, you can also use the class `sklearn.linear_model.Ridge`. As the score we will use the mean squared error. You can implement this yourself or use the function `mean_squared_error` from scikit-learn.

In [None]:
from logging import error
def k_fold_cross_validation(X, y, k, model):
  """
  Performs k-fold cross validation on the model

  The samples are split, in their given order, into k contiguous folds of
  (almost) equal size. Every fold is used once as validation set while the
  model is refitted on the remaining k-1 folds. No shuffling happens here, so
  every call uses exactly the same folds and the scores of different models
  are directly comparable. Shuffle the data beforehand if its order is not
  random.

  @param X: Data x-values, the first axis indexes the samples
  @param y: Data true Labels, same number of samples as X
  @param k: Number of folds, 2 <= k <= number of samples
  @param model: Model to train, has to provide fit(X, y) and predict(X).
                It is refitted in place for every fold.
  @return: Mean  mse scores over the k validations
  @raise ValueError: if X and y differ in their number of samples or
                     k is outside of [2, number of samples]
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n_samples = X.shape[0]

  if y.shape[0] != n_samples:
    raise ValueError(f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}")
  if not 2 <= k <= n_samples:
    raise ValueError(f"k must be between 2 and the number of samples ({n_samples}), got {k}")

  # array_split also handles n_samples not divisible by k:
  # the first n_samples % k folds get one additional sample
  folds = np.array_split(np.arange(n_samples), k)

  fold_scores = []
  for val_idx in folds:
    # everything outside of the current validation fold is training data
    train_mask = np.ones(n_samples, dtype=bool)
    train_mask[val_idx] = False

    model.fit(X[train_mask], y[train_mask])
    y_val = y[val_idx]
    y_pred = np.asarray(model.predict(X[val_idx]))

    # bring predictions to the label shape, so that e.g. (n,) vs. (n, 1)
    # cannot silently broadcast to an (n, n) matrix
    residuals = y_pred.reshape(y_val.shape) - y_val
    fold_scores.append(np.mean(residuals ** 2))

  return float(np.mean(fold_scores))


- use this cell to test your implementation

In [None]:
# Score every alpha of the grid with k-fold cross validation
alphas_cross, losses_cross, best_alpha_cross, min_loss_cross = test_alphas(verbose=True)

# initialize model with the alpha that returned the lowest average loss
# and fit it on the complete data set (X, y)
model_cross = make_pipeline(poly_features, Ridge(best_alpha_cross, solver='svd'))
model_cross.fit(X, y)

# generate a dense set of X values for smooth plotting
X_dense = np.linspace(X.min(), X.max(), 500)[:, np.newaxis]
y_pred_cross = model_cross.predict(X_dense)

# plot
plt.figure()
# (X, y) is the data the model was fitted on, hence it is labelled as training data
plt.scatter(X, y, color='black', label='Training data')
plt.plot(X_dense, y_pred_cross, color='blue', label=f'model approximation using alpha {best_alpha_cross:.5f}')
plt.plot(X_dense, f(X_dense), color='black', linestyle='--', label='true function')
plt.title('Fit with Best Alpha')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# Loss curve of the cross-validation sweep: one average loss per tested alpha,
# drawn over the base-10 logarithm of alpha
fig, ax = plt.subplots()
ax.plot(np.log10(alphas_cross), losses_cross, color='blue')
ax.set_xlabel('log(Alpha)')
ax.set_ylabel('Average MSE Loss')
ax.set_title('Average Loss vs. Alpha')
plt.show()

b) We want to compare k-fold cross validation to a simpler validation scheme in which the data is split only once into one train and one test set: the model is trained once on the train set and then evaluated once on the test set. Complete the function below so that it implements this single train-test-split validation.

In [None]:
def train_test_split_validation(X, y, model, test_size=0.2):
  """
  Performs validation on the model using one test and one train set

  The split is deterministic: the trailing test_size fraction of the samples
  (in their given order) forms the test set, everything before it the train
  set. Every call therefore uses the same split, which keeps the scores of
  different models comparable. Shuffle the data beforehand if its order is
  not random.

  @param X: Data x-values, the first axis indexes the samples
  @param y: Data true Labels, same number of samples as X
  @param model: Model to train, has to provide fit(X, y) and predict(X)
  @param test_size: Fraction of the samples used for testing, 0 < test_size < 1
  @return: mse score from testing
  @raise ValueError: if X and y differ in their number of samples, fewer than
                     two samples are given or test_size is not in (0, 1)
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n_samples = X.shape[0]

  if y.shape[0] != n_samples:
    raise ValueError(f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}")
  if n_samples < 2:
    raise ValueError("at least two samples are required for a train-test-split")
  if not 0 < test_size < 1:
    raise ValueError(f"test_size must be in the open interval (0, 1), got {test_size}")

  # split the data into test and training data,
  # keeping at least one sample on each side
  n_test = min(max(int(round(n_samples * test_size)), 1), n_samples - 1)
  n_train = n_samples - n_test
  X_train, X_test = X[:n_train], X[n_train:]
  y_train, y_test = y[:n_train], y[n_train:]

  # train the model on the training data
  model.fit(X_train, y_train)

  # use the trained model to make predictions on the test data
  y_pred = np.asarray(model.predict(X_test))

  # calculate the mse loss of the predictions on the test data; predictions
  # are brought to the label shape, so that e.g. (n,) vs. (n, 1) cannot
  # silently broadcast to an (n, n) matrix
  residuals = y_pred.reshape(y_test.shape) - y_test
  return float(np.mean(residuals ** 2))

In [None]:
# Score every alpha of the grid with a single train-test-split
alphas_split, losses_split, best_alpha_split, min_loss_split = test_alphas(cross_validation=False, verbose=True)

# initialize model with the alpha that returned the lowest average loss
# and fit it on the complete data set (X, y)
model_split = make_pipeline(poly_features, Ridge(best_alpha_split, solver='svd'))
model_split.fit(X, y)

#print("------------------------------------------------------------")
#print(f"Train-test-split weights: \n{model_split['ridge'].coef_}")
#print(f"Cross validation weights: \n{model_cross['ridge'].coef_}")

# Generate a dense set of X values for smooth plotting
X_dense = np.linspace(X.min(), X.max(), 500)[:, np.newaxis]
y_pred_split = model_split.predict(X_dense)

# Compare both fitted models with the true function
plt.figure()
# (X, y) is the data both models were fitted on, hence it is labelled as training data
plt.scatter(X, y, color='black', label='Training data')
plt.plot(X_dense, y_pred_split, color='red', label=f'train-test-split alpha {best_alpha_split:.5f}')
plt.plot(X_dense, y_pred_cross, color='blue', label=f'cross validation alpha {best_alpha_cross:.5f}')
plt.plot(X_dense, f(X_dense), color='black', linestyle='--', label='true function')
plt.title('Fit with Best Alpha')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

c) Compare the performance of the model using the alpha optimized with cross validation with that of the model using the alpha optimized with one train-test-split. For this, use the test data (holdout data) stored in the variables `X_holdout` and `y_holdout`. This data has not been used for testing before in this task.

In [None]:
# Mean squared error of both final models on the holdout data.
# Labels and predictions are flattened first, so that both always have the
# same 1-D shape regardless of how the model returns its predictions.
y_holdout_flat = np.ravel(y_holdout)
loss_cross = mean_squared_error(y_holdout_flat, np.ravel(model_cross.predict(X_holdout)))
loss_split = mean_squared_error(y_holdout_flat, np.ravel(model_split.predict(X_holdout)))

print(f"loss on holdout data from cross validation = {loss_cross}, with alpha = {best_alpha_cross}")
print(f"loss on holdout data from train-test-split validation = {loss_split}, with alpha = {best_alpha_split}")