# GP emulation

The simulated data include several outputs (e.g. soluble Aitken mode, soluble accumulation mode) and many inputs. In this Notebook, we implement the methods needed to use a Gaussian process to emulate the simulated data. We also include a means of retrieving the learned kernel parameters.

In [None]:
def train_GP_pipe(X_train, y_train, nu=0.5, length=1.0, coefficient=1.0, random_state=0):
    """
    Using the C3 server, train a GP regression pipe on the supplied training data.

    The kernel is a Matern kernel, by default with shape parameter nu=0.5. It is given one initial length scale per
    column of X_train, i.e. it is anisotropic.

    Arguments

    X_train, y_train : pandas DataFrames
        The training inputs (one column per input) and the corresponding training outputs.
    nu : float
        The shape parameter for a Gaussian Process model with Matern kernel
    length : float
        The initial length scale, repeated once for every column of X_train. The fitted model is expected to update
        these length scales from the data according to a maximum likelihood principle.
    coefficient : float
        The kernel function K may be written like

            K(x, x') = c * k(||x - x'||),

        where c := coefficient is the variance term (i.e. the value when x = x') and k is some other function.
    random_state : integer
        Forwarded to the C3 regression technique as its randomState.

    Value

    GPReg_pipe
        A c3 type, containing a serialized sklearn GP regression model after it has been trained
    """
    # One length scale per input column (shape[1] is the number of columns, not rows)
    n_inputs = X_train.shape[1]
    initial_length_scales = [length] * n_inputs

    # Describe the Matern kernel, then build it
    kernel_spec = c3.SklearnGPRKernelMatern(lengthScale=initial_length_scales,
                                            nu=nu,
                                            coefficient=coefficient)
    matern_kernel = kernel_spec.build().kernel

    # Wrap the kernel in a regression technique and the technique in a pipe
    technique = c3.GaussianProcessRegressionTechnique(randomState=random_state,
                                                      kernel=matern_kernel)
    untrained_pipe = c3.GaussianProcessRegressionPipe(technique=technique)

    # Convert the pandas data to C3 datasets and train
    train_inputs = c3.Dataset.fromPython(X_train)
    train_targets = c3.Dataset.fromPython(y_train)

    return untrained_pipe.train(input=train_inputs, targetOutput=train_targets)


def kernel_GP_pipe(trained_pipe):
    """
    Extract the learned kernel parameters from a trained GP regression pipe.

    Arguments

    trained_pipe : GPReg_pipe
        A c3 type, containing a serialized sklearn GP regression model after it has been trained

    Value

    dictionary
        The learned parameters of the kernel function for the fitted model
    """
    # The pipe stores the fitted model in serialized form; turn it back into a Python object first
    serialized_model = trained_pipe.trainedModel.model
    fitted_model = c3.PythonSerialization.deserialize(serialized=serialized_model)

    # kernel_ is the kernel attached to the fitted model (as opposed to the one it was configured with)
    learned_kernel = fitted_model.kernel_

    return learned_kernel.get_params()

# Diagnostics

In [2]:
def make_test_data_for_viewing_GP(X_train, input_to_view, granularity=100):
    """
    This function is for diagnostic purposes. It makes up a bunch of points at which the model may predict so that a
    smooth curve can be drawn.

    Arguments

    X_train : pandas DataFrame
        The training inputs. It is not modified.
    input_to_view : column label
        The column of X_train whose values are replaced by a dense grid.
    granularity : integer
        The number of evenly spaced grid values, running from the minimum to the maximum of the viewed column.

    Value

    pandas DataFrame
        `granularity` copies of X_train stacked on top of each other (so the original index labels repeat). In the
        k-th copy, every row has input_to_view set to the k-th grid value; all other columns are unchanged. An empty
        DataFrame is returned when granularity is 0.
    """

    import pandas as pd
    import numpy as np

    # Levels of the input_to_view, dense between its observed minimum and maximum
    observed_values = X_train[input_to_view]
    grid = np.linspace(np.min(observed_values), np.max(observed_values), num=granularity)

    # One copy of the training inputs per grid level; the other inputs stay the same as in the training data points
    pieces = []
    for level in grid:
        piece = X_train.copy()
        piece[input_to_view] = level
        pieces.append(piece)

    # pd.concat refuses an empty list, so keep the empty result for a zero-sized grid
    if not pieces:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop recopies all accumulated rows on every iteration,
    # and starting from an empty DataFrame triggers pandas' empty-entry concat deprecation warning
    return pd.concat(pieces, axis=0)


def view_GP_curve(X_train, y_train, inputs_to_train, input_to_view, granularity=100, nu=0.5):
    """
    This function is for diagnostic purposes. It plots the marginal response curve for a trained model. That is, it
    computes a bunch of predictions and then plots the interpolated curve which is the model output averaged over the
    empirical distributions of all but one input.

    Arguments

    X_train, y_train : pandas DataFrames
        The training inputs and outputs.
    inputs_to_train : list of column labels
        The columns of X_train used to train the model and to predict.
    input_to_view : column label
        The column of X_train along which the marginal response is drawn.
    granularity : integer
        The number of grid values of input_to_view at which the marginal response is evaluated.
    nu : float
        The shape parameter of the Matern kernel, passed on to train_GP_pipe.

    Value

    tuple
        (grid values of input_to_view, all individual predictions, mean prediction per grid value)
    """

    import numpy as np
    import matplotlib.pyplot as plt

    X_test = make_test_data_for_viewing_GP(X_train, input_to_view, granularity)

    trained_pipe = train_GP_pipe(X_train[inputs_to_train],
                                 y_train,
                                 nu=nu)

    predictions_c3 = trained_pipe.process(c3.Dataset.fromPython(X_test[inputs_to_train]))
    # Keep the first entry of every prediction row
    predictions = [row[0] for row in c3.Dataset.toNumpy(predictions_c3)]

    # X_test is made of `granularity` consecutive blocks of nrows rows, one block per grid value
    nrows = X_train.shape[0]
    block_starts = range(0, nrows * granularity, nrows)
    mean_predictions = [np.mean(predictions[start:start + nrows]) for start in block_starts]

    # Take one grid value per block rather than the unique values of the column: when the viewed input is constant
    # in the training data all grid values coincide, and unique() would then return fewer x values than there are
    # mean predictions, which makes the plot call fail
    grid_values = X_test[input_to_view].values[::nrows]

    plt.scatter(X_train[input_to_view], y_train, label="Training observations")
    plt.plot(grid_values, mean_predictions, label="Mean prediction")

    plt.xlabel(input_to_view)
    plt.ylabel("Marginal response")

    plt.legend()
    plt.show()

    return grid_values, predictions, mean_predictions