# GP Regression on Molecules #

An example notebook for basic GP regression on a molecular dataset, using a graph kernel over molecular graphs (GraKel's `NeighborhoodSubgraphPairwiseDistance`) and the Photoswitch Dataset:

Paper: https://arxiv.org/abs/2008.03226

Code: https://github.com/Ryan-Rhys/The-Photoswitch-Dataset



In [1]:
# Imports

# To import from the gprotorch package
import sys
sys.path.append('..')

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch

from gprotorch.dataloader import DataLoaderMP
from gprotorch.dataloader.data_utils import transform_data

import gpytorch
from botorch import fit_gpytorch_model
from rdkit.Chem import MolFromSmiles
from grakel import Graph
from grakel.kernels import ShortestPath, NeighborhoodSubgraphPairwiseDistance
import scipy.sparse as sp

  from .autonotebook import tqdm as notebook_tqdm
To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
DEBUG:matplotlib:matplotlib data path: /Users/adityaravuri/miniconda3/envs/gauche/lib/python3.8/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/Users/adityaravuri/.matplotlib
DEBUG:matplotlib:matplotlib version 3.4.3
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is darwin


DEBUG:matplotlib:CACHEDIR=/Users/adityaravuri/.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /Users/adityaravuri/.matplotlib/fontlist-v330.json
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.
To do so, use the following command: conda install -c pytorch3d pytorch3d
To do so, use the following command: conda install -c pytorch3d pytorch3d
INFO:rdkit:Enabling RDKit 2021.03.5 jupyter extensions


We define our model. See

https://docs.gpytorch.ai/en/latest/examples/01_Exact_GPs/Simple_GP_Regression.html

for further examples!


In [2]:
# We define our GP model around a precomputed graph kernel (the TanimotoKernel imported below is not used by this model)

from gprotorch.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel
from gprotorch import SIGP, Inputs

class ExactGPModel(SIGP):
    """GP model whose covariance is a scaled, precomputed graph-kernel Gram matrix.

    The graph kernel is evaluated outside of autograd: ``graph_kernel`` is asked
    for a Gram matrix over the graphs held in ``x.data`` and the result is turned
    into a float tensor. The only trainable covariance parameter comes from
    ``self.scale``; the mean is a learnable constant.

    Args:
        train_x: training inputs; must expose the graphs as ``train_x.data``
            (in this notebook an ``Inputs`` wrapper around a list of graphs).
        train_y: training targets.
        likelihood: the GPyTorch likelihood passed on to ``SIGP``.
        graph_kernel: object providing ``fit_transform(graphs)`` that returns a
            square Gram matrix (in this notebook a GraKel kernel).
    """

    def __init__(self, train_x, train_y, likelihood, graph_kernel):
        super().__init__(train_x, train_y, likelihood)
        self.mean = gpytorch.means.ConstantMean()
        # A linear kernel evaluated on constant inputs (see ``forward``) yields a
        # constant matrix, so it acts as a learnable scale on the Gram matrix.
        self.scale = gpytorch.kernels.LinearKernel()
        self.graph_kernel = graph_kernel
        # Cache for the training Gram matrix; filled on the first training pass.
        self.kernel = None

    def _gram(self, graphs):
        """Return the graph-kernel Gram matrix of ``graphs`` as a float tensor."""
        return torch.tensor(self.graph_kernel.fit_transform(graphs)).float()

    def forward(self, x):
        """Return the multivariate normal over the graphs in ``x.data``.

        In training mode the Gram matrix is computed once and cached, since the
        graph kernel has no trainable parameters. In eval mode the cached matrix
        is reused when ``x`` compares equal to the training inputs; otherwise a
        fresh Gram matrix is computed over ``x.data``.
        """
        n = len(x.data)
        zeros = torch.zeros(n, 1)
        if self.training:
            if self.kernel is None:
                self.kernel = self._gram(x.data)
            # Always use the cached tensor, so the first pass and later passes
            # multiply by the same type (previously the first pass used the raw
            # NumPy array returned by the graph kernel).
            kernel = self.kernel
        elif self.kernel is not None and x == self.train_inputs[0]:
            kernel = self.kernel
        else:
            # Also reached when no training pass has filled the cache yet, which
            # previously left ``kernel`` as ``None``.
            kernel = self._gram(x.data)
        mean_x = self.mean(zeros).float()
        covar_x = self.scale(zeros + 1) * kernel
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

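We define our experiment parameters. In this case we follow the setup of the E isomer transition wavelength prediction task from https://arxiv.org/abs/2008.03226, using random train/test splits in the ratio 80/20. The text originally referred to 20 random splits; this notebook sets `n_trials = 3` below, and the results recorded at the end are for 3 splits.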

In [3]:
# Regression experiments parameters, number of random splits and split size

# Number of train/test splits to run; the loop index of each trial is also used
# as the `random_state` of that trial's split.
n_trials = 3
# Passed as `test_size` to train_test_split (0.2 gives an 80/20 split).
test_set_size = 0.2

Load the Photoswitch Dataset via the DataLoaderMP class which contains several molecular property prediction benchmark datasets!

In [4]:
# Load the Photoswitch dataset

# The CSV path is relative; the cells below read `loader.features` (passed to
# RDKit's MolFromSmiles) and `loader.labels` (the regression targets).
loader = DataLoaderMP()
loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")

# Featurise the molecules.
# Each SMILES string is parsed with RDKit and converted into a GraKel graph built from the molecule's bonds (see to_graph below)

def get_label_adj_mats(mol):
    """Build one sparse adjacency matrix per bond label of a molecule.

    A bond's label is the hash of the frozenset made of its two atom symbols
    and its bond type as a double, so the label does not depend on the
    direction of the bond. Every bond is entered in both directions, which
    makes each matrix symmetric.

    Args:
        mol: molecule exposing ``GetNumAtoms()`` and ``GetBonds()``, where each
            bond provides the begin/end atom, their indices and
            ``GetBondTypeAsDouble()``.

    Returns:
        A tuple ``(adj_mats, num_atoms)`` where ``adj_mats`` maps each bond
        label to a CSR matrix of shape ``(num_atoms, num_atoms)``.
    """
    from collections import defaultdict

    atom_count = mol.GetNumAtoms()
    # label -> [row indices, column indices]
    index_pairs = defaultdict(lambda: [[], []])

    for bond in mol.GetBonds():
        first_idx = bond.GetBeginAtomIdx()
        second_idx = bond.GetEndAtomIdx()
        label_parts = [
            bond.GetBeginAtom().GetSymbol(),
            bond.GetEndAtom().GetSymbol(),
            bond.GetBondTypeAsDouble(),
        ]
        key = hash(frozenset(label_parts))

        row_idx, col_idx = index_pairs[key]
        row_idx += [first_idx, second_idx]
        col_idx += [second_idx, first_idx]

    def as_csr(index_pair):
        # One unit entry per (row, col) pair.
        values = np.ones(len(index_pair[0]))
        coo = sp.coo_matrix(
            (values, np.array(index_pair)),
            (atom_count, atom_count),
        )
        return coo.tocsr()

    matrices = {}
    for key, index_pair in index_pairs.items():
        matrices[key] = as_csr(index_pair)

    return matrices, atom_count

def to_graph(x):
    """Convert a molecule into a GraKel ``Graph`` with uniform labels.

    The per-bond-label adjacency matrices from ``get_label_adj_mats`` are
    flattened into a single edge list; the bond label itself is not carried
    over, and every node and every edge receives the label ``1``.

    Args:
        x: molecule accepted by ``get_label_adj_mats``.

    Returns:
        A ``Graph`` built from the directed edge pairs of all bonds, with
        ``graph_format='adjacency'``.
    """
    adjs, n_atm = get_label_adj_mats(x)
    edges = []
    for g in adjs.values():
        # Compute the non-zero coordinates once per matrix rather than once
        # per entry.
        rows, cols = g.nonzero()
        for row, col in zip(rows, cols):
            edges.append((row, col))
            edges.append((col, row))
    # The matrices are already symmetric, so each pair was added twice.
    # De-duplicate while keeping first-seen order (the original
    # ``list(set(edges))`` discarded its result).
    edges = list(dict.fromkeys(edges))
    graph = Graph(
        edges,
        node_labels={i: 1 for i in range(n_atm)},
        edge_labels={edge: 1 for edge in edges},
        graph_format='adjacency',
    )
    return graph

# Parse each entry of the loader's features with RDKit and turn the resulting
# molecule straight into a GraKel graph; the targets are used as loaded.
X = [to_graph(MolFromSmiles(smiles)) for smiles in loader.features]
y = loader.labels

# initialise performance metric lists

r2_list, rmse_list, mae_list = [], [], []

The training/evaluation loop

In [5]:
# Training/evaluation loop: for each random split, fit a GP on the training
# graphs and report train RMSE plus test R^2 / RMSE / MAE in the original units.

# Start from empty metric lists so this cell can be re-run on its own: the lists
# are rebound to NumPy arrays after the loop, and arrays have no ``append``.
r2_list = []
rmse_list = []
mae_list = []

for i in range(0, n_trials):

    # The trial index doubles as the seed of the split, so splits are reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

    #  We standardise the outputs but leave the inputs unchanged
    #  (zero arrays are passed in place of the inputs, which are graphs)
    _, y_train, _, y_test, y_scaler = transform_data(
        np.zeros_like(y_train), y_train, np.zeros_like(y_test), y_test)

    # Convert numpy arrays to PyTorch tensors and flatten the label vectors
    y_train = torch.tensor(y_train).flatten().float()
    y_test = torch.tensor(y_test).flatten().float()

    # Wrap the graph lists so the model can read them through `.data`
    X_train = Inputs(X_train)
    X_test = Inputs(X_test)

    # initialise GP likelihood and model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    kernel = NeighborhoodSubgraphPairwiseDistance(r=3, d=2)
    model = ExactGPModel(X_train, y_train, likelihood, kernel)

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Use the BoTorch utility for fitting GPs in order to use the LBFGS-B optimiser (recommended)
    fit_gpytorch_model(mll)
    print('Training successful')

    # Get into evaluation (predictive posterior) mode
    model.eval()
    likelihood.eval()

    # mean and variance GP prediction
    f_pred = model(X_test)

    y_pred = f_pred.mean
    y_var = f_pred.variance

    # Transform back to real data space to compute metrics and detach gradients. Must unsqueeze dimension
    # to make compatible with inverse_transform in scikit-learn version > 1
    y_pred = y_scaler.inverse_transform(y_pred.detach().unsqueeze(dim=1))
    y_test = y_scaler.inverse_transform(y_test.detach().unsqueeze(dim=1))

    # Output Standardised RMSE and RMSE on Train Set
    y_train = y_train.detach()
    y_pred_train = model(X_train).mean.detach()
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train.unsqueeze(dim=1)), y_scaler.inverse_transform(y_pred_train.unsqueeze(dim=1))))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    # Compute R^2, RMSE and MAE on Test set
    score = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    print("\nR^2: {:.3f}".format(score))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))

    r2_list.append(score)
    rmse_list.append(rmse)
    mae_list.append(mae)

r2_list = np.array(r2_list)
rmse_list = np.array(rmse_list)
mae_list = np.array(mae_list)

# Report the mean over trials +- the standard error of the mean (std / sqrt(n))
print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))

Training successful

Standardised Train RMSE: 0.144
Train RMSE: 9.607

R^2: 0.707
RMSE: 33.645
MAE: 22.516
Training successful

Standardised Train RMSE: 0.177
Train RMSE: 11.577

R^2: 0.702
RMSE: 37.156
MAE: 26.420
Training successful

Standardised Train RMSE: 0.133
Train RMSE: 8.849

R^2: 0.646
RMSE: 38.355
MAE: 28.438

mean R^2: 0.6852 +- 0.0160
mean RMSE: 36.3856 +- 1.1539
mean MAE: 25.7914 +- 1.4192



The mean RMSE should be ca. 36.4 +- 1.2 nanometres.