# Active learning with Gaussian Process regression (GPR) model

In this exercise we will build on what we learned in the previous exercise by applying the Gaussian process regression (GPR) model in an active learning setting.

In active learning we iteratively improve a model by using the model to query for additional training data. To select the data that can improve the model the most, we utilize uncertainty about the predictions. The idea is that by selecting the data the model is most uncertain about and adding it to the training dataset, we can reduce uncertainty and thus improve the overall performance of the model. This is especially useful when labeling data is expensive and we want to be smart about what data we choose to label.

## Dependencies

First we import the dependencies.

If you are in Colab, you need to install the [GPyTorch](https://gpytorch.ai/) package by uncommenting and running the line `!pip3 install gpytorch` below before proceeding.

In [None]:
# install dependencies
# !pip3 install gpytorch


In [None]:
from matplotlib import pyplot as plt
import torch
import gpytorch


# Data

Again we consider synthetic data generated by the Schwefel function.

We first visualize the function on a grid of input points and then we sample the initial training dataset with a small amount of additive observation noise. 

In [None]:
def schwefel(x):
    """Evaluate the Schwefel test function, a surface with many local optima.

    The last dimension of ``x`` holds the input coordinates; it is summed
    out, so the result has the shape of ``x`` without its last dimension.
    """
    n_dims = x.shape[-1]
    wave = x * x.abs().sqrt().sin()
    return 418.9829 * n_dims - wave.sum(dim=-1)

def noisy_schwefel(x, noise_std=1.0):
    """Evaluate the Schwefel function with additive Gaussian observation noise.

    Args:
        x: Tensor whose last dimension holds the input coordinates.
        noise_std: Standard deviation of the zero-mean Gaussian noise.

    Returns:
        Tensor of noisy function values, one per input point.
    """
    y = schwefel(x)
    # Shape the noise like the function values instead of like x.shape[0]:
    # a single unbatched point or extra batch dimensions then get exactly one
    # independent noise sample per function value rather than broadcast noise.
    return y + noise_std * torch.randn_like(y)

def standardize(y):
    """Shift and scale a vector to zero mean and unit standard deviation."""
    centered = y - y.mean()
    scale = y.std()
    return centered / scale

# Grid resolution per axis and number of contour levels used in the plots
n_grid = 100
levels = 30
# Bounds of the Schwefel input domain; unit-square inputs are rescaled to it
x_min = torch.tensor([0, 0])
x_max = torch.tensor([430, 430])

# Regular grid over the unit square, flattened into a list of 2D points
x0 = torch.linspace(0, 1, n_grid)
x1 = torch.linspace(0, 1, n_grid)
g0, g1 = torch.meshgrid(x0, x1, indexing="xy")
x_grid = torch.stack((g0.flatten(), g1.flatten()), dim=1)

# Noise-free function values on the rescaled grid
y_grid = schwefel(x_min + x_grid * (x_max - x_min))

# Color limits reused by the plots of observed function values
vmin = y_grid.min()
vmax = y_grid.max()

plt.figure(figsize=(5, 4))
plt.title("Schwefel function")
plt.contourf(
    x0.numpy(),
    x1.numpy(),
    y_grid.reshape(n_grid, n_grid).numpy(),
    levels=levels,
    vmin=vmin,
    vmax=vmax,
)
plt.colorbar()
plt.show()


In [None]:
# Draw the initial training inputs uniformly from the unit square
n_train = 50

torch.manual_seed(0)  # fixed seed so the sampled dataset is reproducible
x_train = torch.rand(n_train, 2)
# Noisy labels, evaluated on the inputs rescaled to the Schwefel domain
y_train = noisy_schwefel(x_min + x_train * (x_max - x_min))

# Show the sampled points, colored by their observed function value
plt.figure(figsize=(5, 4))
plt.title('Training data')
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, vmin=vmin, vmax=vmax)
plt.colorbar()
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()


# Model

We can use the same simple GPyTorch GPR model we used in the previous exercise. 

In [None]:
# We will use the simplest form of GP model
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP regression model with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal built from the mean and kernel at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )


Since we need to train the model multiple times, we define a helper function that takes data as input and returns a trained model. 
We likewise define a function to make predictions with a trained model. 
Finally we define functions to compute error metrics to evaluate the model performance. 

In [None]:
def train_gpr(x_train, y_train, training_iter=100):
    """Train a Gaussian process regression model on standardized targets.

    Args:
        x_train: Training inputs.
        y_train: Training targets. They are standardized before fitting, so
            the returned model predicts in standardized units.
        training_iter: Number of Adam optimization steps.

    Returns:
        Tuple ``(model, likelihood, losses)`` where ``losses`` holds the
        negative marginal log likelihood recorded at every step.
    """
    # Standardize once: the targets do not change during optimization, and
    # the model and the loss should see exactly the same values.
    y_standardized = standardize(y_train)
    # Initialize likelihood and model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(x_train, y_standardized, likelihood)
    # Training mode
    model.train()
    likelihood.train()
    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters
    # Loss function - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    # Training loop
    losses = []
    for _ in range(training_iter):
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(x_train)
        # Calc loss and backprop gradients
        loss = -mll(output, y_standardized)
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
    return model, likelihood, losses


def predict(model, likelihood, x):
    """Return the predictive distribution of a trained GPR model at ``x``.

    Both modules are switched to evaluation mode and the computation runs
    without gradient tracking, using GPyTorch's fast predictive variances.
    """
    for module in (model, likelihood):
        module.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        # The likelihood turns the model's posterior into the posterior
        # predictive distribution.
        return likelihood(model(x))


def mse(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions."""
    residual = y_true - y_pred
    return (residual ** 2).mean()


def r2(y_true, y_pred):
    """Return the coefficient of determination of the predictions."""
    residual = y_true - y_pred
    deviation = y_true - y_true.mean()
    # One minus the ratio of residual to total sum of squares
    return 1 - (residual ** 2).sum() / (deviation ** 2).sum()


Before we proceed to improve the model with active learning, let us see how the model performs on the initial training data. 

In [None]:
# Fit the model on the initial training data and predict on the full grid
model, likelihood, losses = train_gpr(x_train, y_train)
y_pred = predict(model, likelihood, x_grid)
y_pred_mean = y_pred.mean
y_pred_var = y_pred.variance

# plt.figure(figsize=(6,3))
# plt.title("Training loss")
# plt.plot(losses)
# plt.xlabel("Iteration")
# plt.ylabel("Loss")
# plt.show()

plt.figure(figsize=(5,4))
plt.title("Prediction mean")
mean_contour = plt.contourf(x0.numpy(), x1.numpy(), y_pred_mean.reshape(n_grid, n_grid).numpy(), levels=levels)
# Plot the training data on the contour's color scale (shared norm and
# colormap) so points and surface are comparable; the black edge keeps the
# markers visible where their color matches the prediction.
plt.scatter(x_train[:,0], x_train[:,1], c=standardize(y_train), cmap=mean_contour.cmap, norm=mean_contour.norm, edgecolors="black")
# Pass the contour set explicitly: a bare plt.colorbar() would bind to the
# most recently created mappable, i.e. the scatter.
plt.colorbar(mean_contour)
plt.show()

plt.figure(figsize=(5,4))
plt.title("Prediction uncertainty")
std_contour = plt.contourf(x0.numpy(), x1.numpy(), torch.sqrt(y_pred_var).reshape(n_grid, n_grid).numpy(), levels=levels)
plt.scatter(x_train[:,0], x_train[:,1], c="black")  # plot training data
# The black markers carry no data values, so the colorbar must describe the
# contour set rather than the scatter.
plt.colorbar(std_contour)
plt.show()


Now let us see if we can improve the model. The code below runs an active learning loop where in each iteration we:
* Train a model on the current training dataset. 
* Use the trained model to make predictions on a pool of data (here we use the grid data).
* Select the data point with the highest predicted uncertainty.
* Label the new data point with the Schwefel function.
* Add the new data point to the training dataset.
* Evaluate the model on a test dataset and compute error metrics.
* Repeat. 

In [None]:
# Active learning loop

def run_active_learning_loop(x_data, y_data, x_pool, x_test, y_test, n_steps=10):
    """Run uncertainty-based active learning with a GPR model.

    In every step a model is trained on the current dataset, the pool point
    with the largest predictive variance is labeled with ``noisy_schwefel``
    and appended to the dataset, and the model is scored on the test set.

    Args:
        x_data: Initial training inputs in the unit square.
        y_data: Initial training targets (unstandardized).
        x_pool: Candidate inputs from which new points are selected.
        x_test: Test inputs used to score the model.
        y_test: Test targets (unstandardized).
        n_steps: Number of active learning iterations.

    Returns:
        Tuple ``(x_data, y_data, mse_list, r2_list)`` with the extended
        dataset and the per-step test metrics in standardized units.
    """
    mse_list, r2_list = [], []
    for i in range(n_steps):
        print(f"Step: {i+1}/{n_steps}")
        # Train GPR model
        model, likelihood, _ = train_gpr(x_data, y_data)
        # The model predicts in units standardized with the statistics of the
        # data it was trained on, so the test targets must be scaled with the
        # same statistics (captured before the dataset grows below) rather
        # than with their own mean and standard deviation.
        y_test_scaled = (y_test - y_data.mean()) / y_data.std()
        # Predict on new data
        y_pred = predict(model, likelihood, x_pool)
        # Select most uncertain data point
        index = y_pred.variance.argmax()
        x_new = x_pool[index].unsqueeze(0)
        # Label the new data point
        y_new = noisy_schwefel(x_new * (x_max - x_min) + x_min)
        # Add the new data point to the dataset
        x_data = torch.cat([x_data, x_new])
        y_data = torch.cat([y_data, y_new])
        # Evaluate the model on the test dataset and save the results
        y_pred = predict(model, likelihood, x_test)
        mse_list.append(mse(y_test_scaled, y_pred.mean))
        r2_list.append(r2(y_test_scaled, y_pred.mean))
    return x_data, y_data, mse_list, r2_list


# Run the active learning loop. The grid serves both as the pool of
# candidate points and, with its noise-free values, as the test set.
active_learning_steps = 20
x_data, y_data, mse_list, r2_list = run_active_learning_loop(
    x_data=x_train.clone(),
    y_data=y_train.clone(),
    x_pool=x_grid,
    x_test=x_grid,
    y_test=y_grid,
    n_steps=active_learning_steps,
)


To see if the model improved, we can plot the error metrics from each iteration. 

In [None]:
# Plot each error metric against the active learning iteration

for metric_name, metric_values in (("MSE", mse_list), ("R^2", r2_list)):
    plt.figure(figsize=(6, 3))
    plt.title(metric_name)
    plt.plot(metric_values)
    plt.xlabel("Iteration")
    plt.ylabel(metric_name)
    plt.show()


We can also take a look at the final dataset and the model predictions. What do you observe? Has the model improved? Can we improve it even more?

In [None]:
print("Data size:", x_data.shape, y_data.shape)

# Refit on the dataset extended by active learning and predict on the grid
model, likelihood, losses = train_gpr(x_data, y_data)
y_pred = predict(model, likelihood, x_grid)
y_pred_mean = y_pred.mean
y_pred_var = y_pred.variance

# plt.figure(figsize=(6,3))
# plt.title("Training loss")
# plt.plot(losses)
# plt.xlabel("Iteration")
# plt.ylabel("Loss")
# plt.show()

plt.figure(figsize=(5,4))
plt.title('Data')
plt.scatter(x_data[:,0], x_data[:,1], c=y_data, vmin=vmin, vmax=vmax)
plt.colorbar()
plt.xlim(0, 1); plt.ylim(0, 1)
plt.show()

plt.figure(figsize=(5,4))
plt.title("Prediction mean")
mean_contour = plt.contourf(x0.numpy(), x1.numpy(), y_pred_mean.reshape(n_grid, n_grid).numpy(), levels=levels)
# Plot the training data on the contour's color scale (shared norm and
# colormap) so points and surface are comparable; the black edge keeps the
# markers visible where their color matches the prediction.
plt.scatter(x_data[:,0], x_data[:,1], c=standardize(y_data), cmap=mean_contour.cmap, norm=mean_contour.norm, edgecolors="black")
# Pass the contour set explicitly: a bare plt.colorbar() would bind to the
# most recently created mappable, i.e. the scatter.
plt.colorbar(mean_contour)
plt.show()

plt.figure(figsize=(5,4))
plt.title("Prediction uncertainty")
std_contour = plt.contourf(x0.numpy(), x1.numpy(), torch.sqrt(y_pred_var).reshape(n_grid, n_grid).numpy(), levels=levels)
plt.scatter(x_data[:,0], x_data[:,1], c="black")  # plot training data
# The black markers carry no data values, so the colorbar must describe the
# contour set rather than the scatter.
plt.colorbar(std_contour)
plt.show()


## Additional Exercises:

* Try to increase the number of active learning iterations. Can you further reduce the error and uncertainty?
* Try to reduce the number of initial training data points. How does it affect the final dataset and results?
* Try to change the input range by changing `x_min` and `x_max` to create a more complicated function.
* How would you create an algorithm that selects a batch of diverse new training points (instead of just a single point) in each iteration?