In [35]:
#Import important packages

import os
import typing
from sklearn.gaussian_process.kernels import *
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
import matplotlib.pyplot as plt
from matplotlib import cm
import gpytorch
from matplotlib import rcParams
import torch
import pandas as pd
from data import Data
from sklearn.model_selection import train_test_split

In [36]:
# Set `EXTENDED_EVALUATION` to `True` in order to visualize your predictions.
EXTENDED_EVALUATION = True
EVALUATION_GRID_POINTS = 300  # Number of grid points used in extended evaluation
EVALUATION_GRID_POINTS_3D = 50  # Number of points displayed in 3D during evaluation


# Cost function constants
COST_W_UNDERPREDICT = 25.0
COST_W_NORMAL = 1.0
COST_W_OVERPREDICT = 10.0

In [37]:
class Model(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(Model, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(
                ard_num_dims=train_x.shape[1]
            )
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    

def make_predictions(self, test_features: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Predict the pollution concentration for a given set of locations.
    :param test_features: Locations as a 2d NumPy float array of shape (NUM_SAMPLES, 2)
    :return:
        Tuple of three 1d NumPy float arrays, each of shape (NUM_SAMPLES,),
        containing your predictions, the GP posterior mean, and the GP posterior stddev (in that order)
    """

    # TODO: Use your GP to estimate the posterior mean and stddev for each location here
    gp_mean = np.zeros(test_features.shape[0], dtype=float)
    gp_std = np.zeros(test_features.shape[0], dtype=float)

    return predictions, gp_mean, gp_std

def fitting_model(model, train_GT: np.ndarray,train_features: np.ndarray):
    """
    Fit your model on the given training data.
    :param train_features: Training features as a 2d NumPy float array of shape (NUM_SAMPLES, 2)
    :param train_GT: Training pollution concentrations as a 1d NumPy float array of shape (NUM_SAMPLES,)
    """

    # TODO: Fit your model here


In [38]:
def cost_function(ground_truth: np.ndarray, predictions: np.ndarray) -> float:
    """
    Calculates the cost of a set of predictions.

    :param ground_truth: Ground truth pollution levels as a 1d NumPy float array
    :param predictions: Predicted pollution levels as a 1d NumPy float array
    :return: Total cost of all predictions as a single float
    """
    assert ground_truth.ndim == 1 and predictions.ndim == 1 and ground_truth.shape == predictions.shape

    # Unweighted cost
    cost = (ground_truth - predictions) ** 2
    weights = np.ones_like(cost) * COST_W_NORMAL

    # Case i): underprediction
    mask_1 = predictions < ground_truth
    weights[mask_1] = COST_W_UNDERPREDICT

    # Case ii): significant overprediction
    mask_2 = (predictions >= 1.2*ground_truth)
    weights[mask_2] = COST_W_OVERPREDICT

    # Weigh the cost and return the average
    return np.mean(cost * weights)


In [39]:
def perform_extended_evaluation(model: Model, output_dir: str = '/results'):
    """
    Visualizes the predictions of a fitted model.
    :param model: Fitted model to be visualized
    :param output_dir: Directory in which the visualizations will be stored
    """
    print('Performing extended evaluation')
    fig = plt.figure(figsize=(30, 10))
    fig.suptitle('Extended visualization of task 1')

    # Visualize on a uniform grid over the entire coordinate system
    grid_lat, grid_lon = np.meshgrid(
        np.linspace(0, EVALUATION_GRID_POINTS - 1, num=EVALUATION_GRID_POINTS) / EVALUATION_GRID_POINTS,
        np.linspace(0, EVALUATION_GRID_POINTS - 1, num=EVALUATION_GRID_POINTS) / EVALUATION_GRID_POINTS,
    )
    visualization_xs = np.stack((grid_lon.flatten(), grid_lat.flatten()), axis=1)

    # Obtain predictions, means, and stddevs over the entire map
    predictions, gp_mean, gp_stddev = model.make_predictions(visualization_xs)
    predictions = np.reshape(predictions, (EVALUATION_GRID_POINTS, EVALUATION_GRID_POINTS))
    gp_mean = np.reshape(gp_mean, (EVALUATION_GRID_POINTS, EVALUATION_GRID_POINTS))
    gp_stddev = np.reshape(gp_stddev, (EVALUATION_GRID_POINTS, EVALUATION_GRID_POINTS))

    vmin, vmax = 0.0, 65.0
    vmax_stddev = 35.5

    # Plot the actual predictions
    ax_predictions = fig.add_subplot(1, 3, 1)
    predictions_plot = ax_predictions.imshow(predictions, vmin=vmin, vmax=vmax)
    ax_predictions.set_title('Predictions')
    fig.colorbar(predictions_plot)

    # Plot the raw GP predictions with their stddeviations
    ax_gp = fig.add_subplot(1, 3, 2, projection='3d')
    ax_gp.plot_surface(
        X=grid_lon,
        Y=grid_lat,
        Z=gp_mean,
        facecolors=cm.get_cmap()(gp_stddev / vmax_stddev),
        rcount=EVALUATION_GRID_POINTS_3D,
        ccount=EVALUATION_GRID_POINTS_3D,
        linewidth=0,
        antialiased=False
    )
    ax_gp.set_zlim(vmin, vmax)
    ax_gp.set_title('GP means, colors are GP stddev')

    # Plot the standard deviations
    ax_stddev = fig.add_subplot(1, 3, 3)
    stddev_plot = ax_stddev.imshow(gp_stddev, vmin=vmin, vmax=vmax_stddev)
    ax_stddev.set_title('GP estimated stddev')
    fig.colorbar(stddev_plot)

    # Save figure to pdf
    figure_path = os.path.join(output_dir, 'extended_evaluation.pdf')
    fig.savefig(figure_path)
    print(f'Saved extended evaluation to {figure_path}')

    plt.show()


In [40]:
def main():
    # Load the training dateset and test features
    train_features = np.loadtxt('train_x.csv', delimiter=',', skiprows=1)
    train_GT = np.loadtxt('train_y.csv', delimiter=',', skiprows=1)
    test_features = np.loadtxt('test_x.csv', delimiter=',', skiprows=1)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    
    train_X, val_X, train_Y, val_Y = train_test_split(train_features,train_GT, 
                                                      train_size=0.8,test_size=0.2,
                                                      random_state=0,shuffle=True)
    
    train_X = torch.Tensor(train_X)
    train_Y = torch.Tensor(train_Y).squeeze()

    val_X = torch.Tensor(val_X)
    val_Y = torch.Tensor(val_Y).squeeze()
    
    print(f"train_X.shape: {train_X.shape}")
    print(f"train_y.shape: {train_Y.shape}")
    print(f"val_X.shape: {val_X.shape}")
    print(f"val_y.shape: {val_Y.shape}")
    
    # Fit the model
    print('Fitting model')
    
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = Model(train_X, train_Y, likelihood)
    
    model.train()
    likelihood.train()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.5)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    
    iterations = 80
    for i in range(iterations):
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(train_X)
        # Calc loss and backprop gradients
        loss = -mll(output, train_Y)
        loss.backward()
        print('Iteration %d/%d - Loss: %.3f' , (i, iterations, loss.item()))
        optimizer.step()
    
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        f_x = model(val_X)
        mean = f_x.mean
        f_x_lower, f_x_upper = f_x.confidence_region()
        y = likelihood(f_x)
        y_lower, y_upper = y.confidence_region()   
    cost = cost_function(val_Y.cpu().numpy(), mean.cpu().numpy())
    print("Cost of evaluation set: %.3f",cost)

    #model = Model(train_features,train_GT,likelihood)
    #fitting_model(model,train_GT,train_features)

    # Predict on the test features
    #print('Predicting on test features')
    #predictions, gp_mean, gp_std = model.make_predictions(test_features)
    
    #print(predictions)
    
    #cost_function(train_GT, predictions)

    #if EXTENDED_EVALUATION:
    #    perform_extended_evaluation(model, output_dir='.')


if __name__ == "__main__":
    main()


train_X.shape: torch.Size([12151, 2])
train_y.shape: torch.Size([12151])
val_X.shape: torch.Size([3038, 2])
val_y.shape: torch.Size([3038])
Fitting model
Iteration %d/%d - Loss: %.3f (0, 80, 171.55404663085938)
Iteration %d/%d - Loss: %.3f (1, 80, 112.38656616210938)
Iteration %d/%d - Loss: %.3f (2, 80, 77.14680480957031)
Iteration %d/%d - Loss: %.3f (3, 80, 52.21416091918945)
Iteration %d/%d - Loss: %.3f (4, 80, 32.83803176879883)
Iteration %d/%d - Loss: %.3f (5, 80, 19.757619857788086)
Iteration %d/%d - Loss: %.3f (6, 80, 11.305923461914062)
Iteration %d/%d - Loss: %.3f (7, 80, 6.902866363525391)
Iteration %d/%d - Loss: %.3f (8, 80, 6.283727169036865)
Iteration %d/%d - Loss: %.3f (9, 80, 7.857424736022949)
Iteration %d/%d - Loss: %.3f (10, 80, 10.279191970825195)
Iteration %d/%d - Loss: %.3f (11, 80, 12.319568634033203)
Iteration %d/%d - Loss: %.3f (12, 80, 13.10788631439209)
Iteration %d/%d - Loss: %.3f (13, 80, 12.533940315246582)
Iteration %d/%d - Loss: %.3f (14, 80, 11.0410985946