# GP Regression on Molecules #

An example notebook for basic GP regression on a molecular dataset using graph kernels (from GraKeL) on molecular graphs and the Photoswitch Dataset:

Paper: https://arxiv.org/abs/2008.03226

Code: https://github.com/Ryan-Rhys/The-Photoswitch-Dataset



In [1]:
%%capture
# Imports

# To import from the gprotorch package
import sys
sys.path.append('..')

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch

from gprotorch.dataloader import DataLoaderMP
from gprotorch.dataloader.data_utils import transform_data

import gpytorch
from botorch import fit_gpytorch_model
from rdkit.Chem import MolFromSmiles
from grakel import Graph
from grakel.kernels import NeighborhoodSubgraphPairwiseDistance, \
    ShortestPath, RandomWalk, WeisfeilerLehman, GraphletSampling, PyramidMatch, \
    NeighborhoodHash, VertexHistogram, EdgeHistogram, WeisfeilerLehmanOptimalAssignment

import scipy.sparse as sp
from functools import lru_cache
import warnings

warnings.filterwarnings(
    action='ignore',
    category=UserWarning,
    module=r'gpytorch'
)

We define our model. See

https://docs.gpytorch.ai/en/latest/examples/01_Exact_GPs/Simple_GP_Regression.html

for further examples!


In [2]:
# We define our GP model, which wraps a graph kernel (e.g. one of the GraKeL kernels imported above)

from gprotorch.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel
from gprotorch import SIGP, Inputs, Kernel

class GraphGP(SIGP):
    """GP model with a constant mean and a graph-kernel covariance.

    Args:
        train_x: training inputs, forwarded unchanged to ``SIGP.__init__``.
        train_y: training targets, forwarded unchanged to ``SIGP.__init__``.
        likelihood: likelihood object, forwarded unchanged to ``SIGP.__init__``.
        kernel: graph kernel object that is wrapped in a ``GraphKernel``.
        jitter: fraction of the mean diagonal of the covariance that is added
            to the diagonal for numerical stability. Defaults to ``1e-3``,
            the value that was previously hard-coded.
    """

    def __init__(self, train_x, train_y, likelihood, kernel, jitter=1e-3):
        super().__init__(train_x, train_y, likelihood)
        self.mean = gpytorch.means.ConstantMean()
        self.covariance = GraphKernel(kernel)
        self.jitter = jitter

    def forward(self, x):
        """Return the multivariate normal over the ``len(x.data)`` inputs in ``x``."""
        n_points = len(x.data)
        # The constant mean only needs a placeholder input with one row per point.
        mean = self.mean(torch.zeros(n_points, 1)).float()
        covariance = self.covariance(x)

        # Diagonal regularisation proportional to the (detached) mean diagonal.
        # The sum is built out-of-place so that the object returned by the
        # kernel is never mutated, which matters if it aliases a memoised matrix.
        identity = torch.eye(n_points, dtype=covariance.dtype, device=covariance.device)
        covariance = covariance + identity * covariance.diag().mean().detach() * self.jitter
        return gpytorch.distributions.MultivariateNormal(mean, covariance)

class GraphKernel(Kernel):
    """Wraps a graph kernel object exposing ``fit_transform`` as a ``Kernel``.

    Args:
        graph_kernel: object whose ``fit_transform`` is called on ``X.data``
            to produce the kernel matrix.
        *args, **kwargs: forwarded unchanged to ``Kernel.__init__``.
    """

    def __init__(self, graph_kernel, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.kernel = graph_kernel
        # Memoise per instance. Decorating the method itself with lru_cache
        # creates one cache shared by every instance and keyed on (self, X),
        # which keeps old models and their inputs alive after they are dropped.
        self._cached_kern = lru_cache(maxsize=3)(self._compute_kern)

    def _compute_kern(self, X):
        """Evaluate the wrapped kernel on ``X.data`` and return a float32 tensor."""
        return torch.tensor(self.kernel.fit_transform(X.data)).float()

    def kern(self, X):
        """Return the kernel matrix for ``X``, reusing a cached result when available.

        ``X`` must be hashable; up to three distinct inputs are kept per instance.
        """
        return self._cached_kern(X)

    def forward(self, X):
        """Return ``self.scale`` applied to the (cached) kernel matrix of ``X``."""
        # `scale` is not defined in this class; presumably provided by `Kernel`.
        return self.scale(self.kern(X))

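We define our experiment parameters. In this case we are reproducing the E isomer transition wavelength prediction task from https://arxiv.org/abs/2008.03226, which uses 20 random splits in the ratio 80/20. Note that this notebook sets `n_trials = 10`, so only 10 random 80/20 splits are run per kernel; set `n_trials = 20` to match the protocol described above.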

In [3]:
# Settings for the regression experiments.

# Number of random train/test splits evaluated for each kernel.
n_trials: int = 10
# Fraction of the data held out as the test set in every split.
test_set_size: float = 0.2

Load the Photoswitch Dataset via the DataLoaderMP class which contains several molecular property prediction benchmark datasets!

In [4]:
# Read the Photoswitch benchmark through the DataLoaderMP class.
loader = DataLoaderMP()
loader.load_benchmark(
    "Photoswitch",
    "../data/property_prediction/photoswitches.csv",
)

# Edge-label lookup keyed by the numeric bond order that
# `bond.GetBondTypeAsDouble()` returns inside `to_graph`.
bond_types = {
    1.0: 'S',
    1.5: 'A',
    2.0: 'D',
    3.0: 'O',
}

# Featurise the molecules.
# Each SMILES string is parsed with RDKit and converted into a GraKeL `Graph`
# whose nodes are labelled with atom symbols and whose edges are labelled with bond types.

def to_graph(mol):
    """Convert an RDKit molecule into a labelled GraKeL ``Graph``.

    Nodes are atom indices labelled with the atom symbol. Every bond adds two
    directed edges (one per direction), both labelled via the module-level
    ``bond_types`` lookup keyed by ``bond.GetBondTypeAsDouble()``.

    Args:
        mol: RDKit molecule, e.g. the result of ``MolFromSmiles``.

    Returns:
        A ``Graph`` built with ``graph_format='adjacency'``.

    Raises:
        ValueError: if ``mol`` is ``None``.
        KeyError: if a bond order is missing from ``bond_types``.
    """
    if mol is None:
        # Fail early with a readable message instead of an AttributeError below.
        raise ValueError(
            "to_graph received None instead of a molecule "
            "(RDKit returns None when a molecule cannot be parsed)"
        )

    node_labels = {atom.GetIdx(): atom.GetSymbol() for atom in mol.GetAtoms()}

    edges = {}
    for bond in mol.GetBonds():
        start_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        bond_order = bond.GetBondTypeAsDouble()
        try:
            label = bond_types[bond_order]
        except KeyError:
            raise KeyError(
                "unsupported bond order {!r} between atoms {} and {}; known orders: {}".format(
                    bond_order, start_idx, end_idx, sorted(bond_types))
            ) from None

        # Store both directions so the edge set is symmetric.
        edges[(start_idx, end_idx)] = label
        edges[(end_idx, start_idx)] = label

    # Dict keys are unique by construction, so no duplicate-edge check is needed.
    graph = Graph(
        list(edges),
        node_labels=node_labels,
        edge_labels=edges,
        graph_format='adjacency',
    )
    return graph

# Parse each SMILES string held by the loader and turn the molecule into a graph.
X = [to_graph(MolFromSmiles(smiles)) for smiles in loader.features]
# Regression targets as provided by the loader.
y = loader.labels

The training/evaluation loop

In [None]:
# Graph kernels to benchmark, one GP experiment per kernel. RandomWalk is left disabled.
kernels = [
    WeisfeilerLehman(),
    GraphletSampling(),
    NeighborhoodSubgraphPairwiseDistance(r=3, d=2),
    ShortestPath(),
    PyramidMatch(),
    NeighborhoodHash(),
    VertexHistogram(),
    EdgeHistogram(),
    WeisfeilerLehmanOptimalAssignment(),
    # RandomWalk()
]

# Adam settings used to optimise the GP hyperparameters in every trial.
n_training_steps = 100
learning_rate = 0.1
# Set to True to print per-trial train/test diagnostics.
verbose = False

for kernel in kernels:
    print(kernel)
    r2_list = []; rmse_list = []; mae_list = []
    for trial in range(0, n_trials):
        # The trial index doubles as the random seed of the split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=trial)

        # We standardise the outputs but leave the inputs unchanged:
        # zero arrays are passed in place of the features.
        _, y_train, _, y_test, y_scaler = transform_data(
            np.zeros_like(y_train), y_train, np.zeros_like(y_test), y_test)

        # Convert numpy arrays to PyTorch tensors and flatten the label vectors
        y_train = torch.tensor(y_train).flatten().float()
        y_test = torch.tensor(y_test).flatten().float()

        X_train = Inputs(X_train)
        X_test = Inputs(X_test)

        # initialise GP likelihood and model
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        model = GraphGP(X_train, y_train, likelihood, kernel)

        # Find optimal model hyperparameters
        model.train()
        likelihood.train()

        # "Loss" for GPs - the marginal log likelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        # The BoTorch fitting utility (fit_gpytorch_model) is not used here;
        # the hyperparameters are optimised with Adam instead.
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Includes GaussianLikelihood parameters

        # A dedicated loop variable keeps the trial index above from being overwritten.
        for step in range(n_training_steps):
            optimizer.zero_grad()
            output = model(X_train)
            loss = -mll(output, y_train)
            loss.backward()
            optimizer.step()

        # Get into evaluation (predictive posterior) mode
        model.eval()
        likelihood.eval()

        # No gradients are needed for prediction, so skip building the autograd graph.
        with torch.no_grad():
            # mean and variance GP prediction
            f_pred = model(X_test)
            y_pred = f_pred.mean
            y_var = f_pred.variance

            # Predictions on the training inputs, used for the train-set diagnostics
            y_pred_train = model(X_train).mean.detach()

        # Transform back to real data space to compute metrics and detach gradients. Must unsqueeze dimension
        # to make compatible with inverse_transform in scikit-learn version > 1
        y_pred = y_scaler.inverse_transform(y_pred.detach().unsqueeze(dim=1))
        y_test = y_scaler.inverse_transform(y_test.detach().unsqueeze(dim=1))

        # Standardised RMSE and RMSE on Train Set
        y_train = y_train.detach()
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(
            y_scaler.inverse_transform(y_train.unsqueeze(dim=1)),
            y_scaler.inverse_transform(y_pred_train.unsqueeze(dim=1))))

        # Compute R^2, RMSE and MAE on Test set
        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        if verbose:
            print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
            print("Train RMSE: {:.3f}".format(train_rmse))
            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
            print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Report mean +- standard error (np.std divided by sqrt of the number of trials).
    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))

WeisfeilerLehman()

mean R^2: 0.8578 +- 0.0111
mean RMSE: 24.6764 +- 0.9513
mean MAE: 15.4473 +- 0.4898

GraphletSampling()

mean R^2: 0.2613 +- 0.0511
mean RMSE: 56.7347 +- 2.9410
mean MAE: 42.2645 +- 1.2055

NeighborhoodSubgraphPairwiseDistance(d=2)
