# GP Regression on Molecules #

An example notebook for basic GP regression on a molecular dataset using a random walk graph kernel and the Photoswitch Dataset:

Paper: https://arxiv.org/abs/2008.03226

Code: https://github.com/Ryan-Rhys/The-Photoswitch-Dataset



In [None]:
%%capture
# Imports

# To import from the gprotorch package
import sys
sys.path.append('..')
sys.path.append('../benchmarks/')

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch

from gprotorch.dataloader import DataLoaderMP
from gprotorch.dataloader.data_utils import transform_data
from gpytorch_metrics import negative_log_predictive_density, mean_standardized_log_loss, quantile_coverage_error
from gprotorch.kernels.graph_kernels.graph_kernel_utils import \
    get_label_adj_mats, adj_mat_preprocessing

import gpytorch
from botorch import fit_gpytorch_model
from rdkit.Chem import MolFromSmiles
from grakel import Graph

import scipy.sparse as sp
from functools import lru_cache
import warnings

# Keep the notebook output readable: ignore UserWarning and gpytorch's
# NumericalWarning when they are raised from gpytorch modules.
warnings.filterwarnings(action='ignore', category=UserWarning, module=r'gpytorch')
warnings.filterwarnings(action='ignore', category=gpytorch.utils.warnings.NumericalWarning, module=r'gpytorch')

We define our model. See

https://docs.gpytorch.ai/en/latest/examples/01_Exact_GPs/Simple_GP_Regression.html

for further examples!


In [None]:
# We define our GP model using the RandomWalk graph kernel

from gprotorch.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel
from gprotorch import SIGP, Inputs
from gprotorch.kernels.graph_kernels import RandomWalk

class GraphGP(SIGP):
    """GP model with a constant mean and a RandomWalk graph kernel.

    The model is built on the project's ``SIGP`` base class and returns a
    ``MultivariateNormal`` from ``forward``.
    """

    def __init__(self, train_x, train_y, likelihood):
        """Create the mean and covariance modules.

        Args:
            train_x: training inputs, passed unchanged to ``SIGP.__init__``.
            train_y: training targets, passed unchanged to ``SIGP.__init__``.
            likelihood: likelihood object, passed unchanged to ``SIGP.__init__``.
        """
        super().__init__(train_x, train_y, likelihood)
        self.mean = gpytorch.means.ConstantMean()
        self.covariance = RandomWalk()
        # Hard-coded kernel weight. NOTE(review): the origin of 1/17 is not
        # visible in this notebook -- confirm before changing it.
        self.covariance.weight = 1/17

    def forward(self, x):
        """Return the GP distribution evaluated at the inputs ``x``.

        Args:
            x: input container exposing a ``data`` attribute with one entry
                per data point (``len(x.data)`` fixes the output size).

        Returns:
            ``gpytorch.distributions.MultivariateNormal`` built from the mean
            vector and the jittered covariance matrix.
        """
        # The mean module is fed a zero tensor with one row per data point.
        mean = self.mean(torch.zeros(len(x.data), 1)).float()
        covariance = self.covariance(x)

        # Jitter on the diagonal: 1e-4 times the average diagonal entry,
        # with 1e-4 as the lower bound.
        average_diagonal = covariance.diag().mean().detach().item()
        jitter = max(average_diagonal*1e-4, 1e-4)
        covariance += torch.eye(len(x.data))*jitter

        return gpytorch.distributions.MultivariateNormal(mean, covariance)

We define our experiment parameters. In this case we are reproducing the results of the E isomer transition wavelength prediction task from https://arxiv.org/abs/2008.03226, which uses 20 random splits in the ratio 80/20. Note that the cell below sets `n_trials = 1` (a single split); set it to 20 to run all 20 splits.

In [None]:
# Regression experiment parameters: number of random splits and test split size

# Number of random train/test splits to run. The trial index is also used as
# the random seed, so the seeds are 0 .. n_trials - 1.
n_trials = 1
# Fraction of the dataset held out as the test set in every split.
test_set_size = 0.2

Load the Photoswitch Dataset via the DataLoaderMP class which contains several molecular property prediction benchmark datasets!

In [None]:
# Maps the numeric keys 1.0 / 1.5 / 2.0 / 3.0 to single-letter labels
# (presumably bond orders to bond-type codes -- TODO confirm).
# NOTE(review): not referenced anywhere else in the visible code.
bond_types = {1.0: 'S', 1.5: 'A', 2.0: 'D', 3.0: 'O'}

def get_adjacencies(smiles):
    """Turn SMILES strings into dense labelled adjacency tensors.

    Adapted from Leo's example.

    Args:
        smiles: iterable of SMILES strings.

    Returns:
        A list with one dense tensor per molecule, cropped to ``[:n, :n, :]``
        where ``n`` is the second element returned by ``get_label_adj_mats``
        for that molecule (presumably its number of atoms -- TODO confirm).
    """
    molecules = [MolFromSmiles(smile) for smile in smiles]
    labelled = [
        get_label_adj_mats(molecule, adj_mat_format="torch_sparse")
        for molecule in molecules
    ]

    # Remember the per-molecule size before preprocessing replaces the list.
    sizes = [entry[1] for entry in labelled]

    # Only the processed matrices are needed; the other two return values
    # are discarded.
    processed, _, _ = adj_mat_preprocessing(labelled)

    cropped = []
    for matrix, size in zip(processed, sizes):
        cropped.append(matrix.to_dense()[:size, :size, :])
    return cropped

The training/evaluation loop

In [None]:
# Benchmark datasets to run: name -> path of the CSV file.
# Uncomment an entry to include that dataset in the run.
datasets = {'Photoswitch': '../data/property_prediction/photoswitches.csv',
            # 'ESOL': '../data/property_prediction/ESOL.csv',
            # 'FreeSolv': '../data/property_prediction/FreeSolv.csv',
            # 'Lipophilicity': '../data/property_prediction/Lipophilicity.csv'
}

# Adam settings used to fit the GP hyperparameters in every trial.
learning_rate = 0.5
n_training_steps = 150


def _mean_and_stderr(values):
    """Return ``(mean, standard error)`` of a sequence of scalar metrics.

    Every value is converted with ``float``, so Python floats, NumPy scalars
    and single-element tensors are all accepted. The standard error uses the
    population standard deviation (NumPy's default), which keeps all six
    reported metrics consistent and gives 0 rather than NaN for one trial.
    An empty sequence yields ``(nan, nan)``.
    """
    arr = np.array([float(value) for value in values], dtype=float)
    if arr.size == 0:
        return float('nan'), float('nan')
    return arr.mean(), arr.std() / np.sqrt(arr.size)


for dataset_name, data_loc in datasets.items():
    loader = DataLoaderMP()
    loader.load_benchmark(dataset_name, data_loc)

    # Graph inputs are built once per dataset and re-split in every trial.
    X = get_adjacencies(loader.features)
    y = loader.labels

    print(dataset_name)
    print('\n' + '-'*50)
    r2_list, rmse_list, mae_list = [], [], []
    nlpd_list, msll_list, qce_list = [], [], []

    for i in range(n_trials):
        # The trial index doubles as the seed so every split is reproducible.
        np.random.seed(i)
        torch.manual_seed(i)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        # Standardise the outputs only. Zero-filled placeholders are passed
        # where transform_data expects the inputs, and the transformed
        # placeholders are discarded.
        # NOTE (original author): this seems to introduce numerical instabilities.
        _, y_train, _, y_test, y_scaler = transform_data(
            np.zeros_like(y_train), y_train, np.zeros_like(y_test), y_test)

        # Convert numpy arrays to PyTorch tensors and flatten the label vectors
        y_train = torch.tensor(y_train).flatten().float()
        y_test = torch.tensor(y_test).flatten().float()

        X_train = Inputs(X_train)
        X_test = Inputs(X_test)

        # Initialise GP likelihood and model
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = GraphGP(X_train, y_train, likelihood)

        # Training mode for hyperparameter optimisation
        model.train()
        likelihood.train()

        # "Loss" for GPs - the marginal log likelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        # The BoTorch utility fit_gpytorch_model(mll) (L-BFGS-B) is left
        # disabled here; Adam is run for a fixed number of steps instead.
        # model.parameters() includes the GaussianLikelihood parameters.
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        for _ in range(n_training_steps):
            optimizer.zero_grad()
            output = model(X_train)
            loss = -mll(output, y_train)
            loss.backward()
            optimizer.step()

        # Get into evaluation (predictive posterior) mode
        model.eval()
        likelihood.eval()

        # Evaluate the posterior on the test set once, without building an
        # autograd graph, and reuse it for every metric below.
        with torch.no_grad():
            # Latent GP posterior (used for the point predictions)
            f_pred = model(X_test)
            # Full GP predictive distribution (adds the likelihood noise)
            trained_pred_dist = likelihood(f_pred)

            # Uncertainty metrics are computed in the standardised space
            nlpd = negative_log_predictive_density(trained_pred_dist, y_test)
            msll = mean_standardized_log_loss(trained_pred_dist, y_test)
            qce = quantile_coverage_error(trained_pred_dist, y_test, quantile=95)

        y_pred = f_pred.mean

        # Transform back to real data space to compute metrics and detach gradients
        y_pred = y_scaler.inverse_transform(y_pred.detach().unsqueeze(dim=1))
        y_test = y_scaler.inverse_transform(y_test.detach().unsqueeze(dim=1))

        # Compute R^2, RMSE and MAE on Test set
        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        nlpd_list.append(nlpd)
        msll_list.append(msll)
        qce_list.append(qce)

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    # Report mean +- standard error over the trials for every metric
    print("\nmean NLPD: {:.4f} +- {:.4f}".format(*_mean_and_stderr(nlpd_list)))
    print("mean MSLL: {:.4f} +- {:.4f}".format(*_mean_and_stderr(msll_list)))
    print("mean QCE: {:.4f} +- {:.4f}".format(*_mean_and_stderr(qce_list)))

    print("mean R^2: {:.4f} +- {:.4f}".format(*_mean_and_stderr(r2_list)))
    print("mean RMSE: {:.4f} +- {:.4f}".format(*_mean_and_stderr(rmse_list)))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(*_mean_and_stderr(mae_list)))
