# Variable selection

How do we decide which inputs are most important for a GP? We fetch the kernel parameters from the fitted GP and rank the inputs by their fitted length scale: the inputs with the smallest length scales are taken to be the most important. (We will go on to use the length scale parameter vector of each GP fit as the representative of its GeoSurfaceTimePoint, and then cluster these points in time and space based on which inputs are most important.)

In [None]:
def ard(fitted_gpr,
        parameter_names):
    """
    For automatic relevance determination. Obtain the fitted length scale parameters for the fitted_gpr object.

    Parameters

    fitted_gpr : Scikit-learn GaussianProcessRegressor
        a fitted GP
    parameter_names : list
        names of the inputs considered when fitting the GP; they are paired by position with the fitted
        length scales

    Value

    numpy structured array
        one record per input, with fields 'input' and 'length_scale', sorted by ascending length scale
        (ties broken by input name) so that the inputs with the smallest length scales come first
    """

    # Obtain the length scales from the 'fitted_gpr' parameter GP
    variable_length_scales = kernel_GP_pipe(fitted_gpr)['k2__length_scale']

    variable_names = np.array(parameter_names)

    # Keep the usual 35-character name field, but widen it when a name is longer: a truncated name
    # would no longer match its column in the data set.
    name_width = max([35] + [len(str(name)) for name in variable_names])
    dtype = [('input', 'U%d' % name_width), ('length_scale', float)]

    # We tabulate the variables and their length scales...
    length_scales_and_names = [
        (name, variable_length_scales[position])
        for position, name in enumerate(variable_names)
    ]

    # And then order them by length scale (smallest first)
    scored_inputs = np.array(length_scales_and_names, dtype=dtype)
    return np.sort(scored_inputs, order=['length_scale', 'input'])


def cv(df,
       inputs,
       output,
       folds=10,
       nu=0.5,
       mean_zero=False):
    """
    Estimate the cross-validation error of a model

    Each row of df is randomly assigned to one of <folds> folds of (nearly) equal size. For every fold, a GP is
    trained with train_GP_pipe on the remaining rows and the mean squared prediction error on the held-out rows
    is recorded. The caller's DataFrame is not modified.

    Parameters

    df : Pandas DataFrame
        full data set
    inputs : list
        parameters considered from the dataset for fitting the GP
    output : str
        output of interest
    folds : integer
        number of folds for cross-validation
    nu : numeric
        shape parameter for Matern kernel, ideally an integer multiple of 0.5.
        NOTE: accepted but not used here; train_GP_pipe is called with the training data only.
    mean_zero : logical
        intended to control whether a mean field is fitted and added back after predicting.
        NOTE: accepted but not used here; train_GP_pipe is called with the training data only.

    Value

    list
        The list of validation error on each of the k=1,2,...,<folds> folds.
    """
    import random

    n_samples = df.shape[0]

    # Assign folds to the samples: a random permutation of the row positions taken modulo <folds>.
    # The assignment lives in a local array so that no helper column is written into the caller's df.
    fold_ids = np.array(
        [position % folds for position in random.sample(range(n_samples), n_samples)],
        dtype=int,
    )

    # Extract the design matrix and the response once; every fold is a row subset of these
    features = np.array(df[inputs])
    response = np.array(df[output])

    # To record the test error on each fold...
    cv_error = []

    for fold in range(folds):

        # Train / test split
        held_out = fold_ids == fold
        X_train = features[~held_out]
        y_train = response[~held_out]
        X_test = features[held_out]
        y_test = response[held_out]

        # Fit GP
        my_model = train_GP_pipe(X_train, y_train)

        # Obtain predictions
        # NOTE(review): assumes m_data has the same shape as y_test; a column vector would broadcast
        # against y_test in the subtraction below -- TODO confirm
        y_predictions = np.array(my_model.process(c3.Dataset.fromPython(X_test)).m_data)

        # Compute and record error (mean squared error on the held-out fold)
        cv_error.append(np.mean((y_predictions - y_test) ** 2))

    return cv_error

# Diagnostics

In [None]:
def select_vars(df,
                ard_results,
                output,
                max_num_vars=10,
                folds=10,
                nu=0.5,
                mean_zero=False):
    """
    Diagnostic tool (for now): how does the cross validation error change with the number of inputs used to
    fit a GP? This is not meant to be run on every GP we fit. The plan is to first cluster the
    GeoSurfaceTimePoints by some appropriate method, and then select variables within each cluster.

    Parameters

    df : Pandas DataFrame
        full data set, passed through to cv
    ard_results : numpy structured array
        its "input" field is read; the first max_num_vars entries are used in the order given
    output : str
        output of interest, passed through to cv
    max_num_vars : integer
        largest number of leading inputs to try
    folds, nu, mean_zero
        passed through to cv unchanged

    Value

    Pandas DataFrame
        one row per k = 1, 2, ..., with columns 'num_vars' (k), 'next_var' (the k-th input), and 'mean' and
        'std' (mean and standard deviation of the per-fold errors returned by cv for the first k inputs)
    """

    import numpy as np

    top_inputs = ard_results["input"][0:max_num_vars]
    subset_sizes = list(range(1, len(top_inputs) + 1))

    # Run one cross-validation per nested subset: first input, first two inputs, ...
    per_size_errors = [
        cv(df=df,
           inputs=top_inputs[0:size],
           output=output,
           folds=folds,
           nu=nu,
           mean_zero=mean_zero)
        for size in subset_sizes
    ]

    summary = {
        'num_vars': subset_sizes,
        'next_var': top_inputs,
        'mean': [np.mean(errors) for errors in per_size_errors],
        'std': [np.std(errors) for errors in per_size_errors],
    }
    return pd.DataFrame(summary)


def plot_cv_curve(selection_results):
    """
    Plot the results of the select_vars diagnostic.

    Draws the 'mean' column against 'num_vars' and shades the band mean +/- 1.96 * 'std' around it, then
    shows the figure.

    Parameters

    selection_results : Pandas DataFrame
        table with columns 'num_vars', 'mean' and 'std', as returned by select_vars
    """

    import matplotlib.pyplot as plt
    import numpy as np

    # Series.ravel() is deprecated in recent pandas; go through numpy to get the same flat array
    num_vars = np.asarray(selection_results['num_vars']).ravel()
    mean_error = selection_results['mean']
    half_width = 1.96 * selection_results['std']

    plt.plot(selection_results['num_vars'], mean_error)
    plt.fill_between(
        num_vars,
        mean_error - half_width,
        mean_error + half_width,
        alpha=0.5
    )

    _ = plt.title("CV error as a function of $k$ the number of top inputs")
    plt.show()