# Inference using the SLAP models 

Predict reaction outcome for morpholine/piperazine products.

For most requirements, using the command-line interface (`inference.py`) will be faster/simpler than using this notebook. But if you want to change things or have a look under the hood, feel free to use this notebook.

The input to this are SMILES of the desired product(s).
Inputs can be supplied directly as a csv file with one column named "smiles" and arbitrary additional columns.

The output is:
- all (one or two) reactionSMILES that lead to this product using the SLAP platform
- for each reaction, a classification of whether the reaction is expected to work
- for each reaction, a rough estimate of the confidence for this prediction

The output is written to a new csv file containing all columns from the input file, and six new columns: `rxn1_smiles`, `rxn1_prediction`, `rxn1_confidence`, `rxn2_smiles`, `rxn2_prediction`, `rxn2_confidence`.

Columns `rxn2_*` may have empty fields.

Predictions are given as `0` (meaning no reaction expected) or `1` (meaning successful reaction expected). Only if the reaction is known, instead of the prediction, the mean of the known reaction outcome(s) is returned.
Confidence is given as an integer in the range `0-4`, with `0` indicating the highest confidence.
Confidence is determined based on the complexity of the prediction problem using the following heuristic:
- `0`: known reactions
- `1`: both reactants known in other reactions
- `2`: exactly one reactant known in other reactions
- `3`: unknown reactants, similar to training data
- `4`: unknown reactants, dissimilar to training data


In [1]:
import pathlib
import statistics
import sys
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

from src.model.classifier import load_trained_model
from inference import import_valid_smiles_from_vl
with warnings.catch_warnings(record=True):  
    # ignore the descriptastorus package warning about missing normalizations
    from src.data.dataloader import SLAPProductDataset, collate_fn



In [2]:
    use_validation_data = True  # <-- toggle this to change the models used
    
    # paths to the best models
    if use_validation_data:
        # the next three are trained with full data, including validation plate data
        model_0D = TRAINED_MODEL_DIR / "2023-03-06-105610_484882" / "best.ckpt"  # FFN
        model_1D = TRAINED_MODEL_DIR / "2023-03-06-112027_188465" / "best.ckpt"  # D-MPNN
        model_2D = TRAINED_MODEL_DIR / "2023-03-06-112721_778803" / "best.ckpt"  # D-MPNN
        # path to the OneHotEncoder state for model_0D
        ohe_state_dict = LOG_DIR / "OHE_state_dict_bhTczANzKRRqIgUR.json"  # with validation plate data
        
    else:
        # the next three are trained without using validation plate data
        model_0D = TRAINED_MODEL_DIR / "2022-12-16-144509_863758" / "best.ckpt"  # FFN
        model_1D = TRAINED_MODEL_DIR / "2022-12-16-145840_448790" / "best.ckpt"  # D-MPNN
        model_2D = TRAINED_MODEL_DIR / "2022-12-06-115456_273019" / "best.ckpt"  # D-MPNN
        # path to the OneHotEncoder state for model_0D
        ohe_state_dict = LOG_DIR / "OHE_state_dict_FqIDTIsCHoURGQcv.json"  # without validation plate data

To use the notebook on your products, change `raw_dir` to the directory that your CSV file containing SMILES is in. Then change `filename_base` to the filename of your csv file without the `.csv` suffix. If you do not want to use all the SMILES in your file (e.g. because some are not valid SLAP products), suppy a `valid_idx_file`. You can set the value to `None` if you want to use all SMILES.

In [3]:
# Import product SMILES and generate reactionSMILES. This will take some time.
# This will throw warnings if any reactions cannot be generated, 
# e.g. if there are two morpholines in the same product.
raw_dir = pathlib.Path("../data/VL")  # <-- change me
filename_base = "VL_smiles_chunk_00961"  # <-- change me
valid_idx_file = raw_dir / f"{filename_base}_valid.csv"  # <-- change me or set me to None
df = import_valid_smiles_from_vl(raw_dir, filename_base, valid_idx_file=valid_idx_file)
data = SLAPProductDataset(smiles=df["smiles"].values.tolist())

Original error message: More than one reaction found for SLAP reagent 'CC(CC(N)COC[Si](C)(C)C)C[Si](C)(C)C' and aldehyde 'O=CC1=NO[C@@H]2CCCC[C@H]12'.
Reactions:
C[Si](C)(C)[CH2:8][CH:7]([CH2:6][CH:4]([CH2:3][O:17][CH2:18][Si:19]([CH3:20])([CH3:21])[CH3:22])[NH2:5])[CH3:23].O=[CH:2][C:1]1=[N:9][O:11][C@H:13]2[C@@H:10]1[CH2:12][CH2:14][CH2:16][CH2:15]2>>[C:1]1([CH:2]2[NH:5][CH:4]([CH2:3][O:17][CH2:18][Si:19]([CH3:20])([CH3:21])[CH3:22])[CH2:6][CH:7]([CH3:23])[CH2:8]2)=[N:9][O:11][C@H:13]2[C@@H:10]1[CH2:12][CH2:14][CH2:16][CH2:15]2
C[Si](C)(C)[CH2:8][O:7][CH2:6][CH:4]([CH2:3][CH:17]([CH3:18])[CH2:19][Si:20]([CH3:21])([CH3:22])[CH3:23])[NH2:5].O=[CH:2][C:1]1=[N:9][O:11][C@H:13]2[C@@H:10]1[CH2:12][CH2:14][CH2:16][CH2:15]2>>[C:1]1([CH:2]2[NH:5][CH:4]([CH2:3][CH:17]([CH3:18])[CH2:19][Si:20]([CH3:21])([CH3:22])[CH3:23])[CH2:6][O:7][CH2:8]2)=[N:9][O:11][C@H:13]2[C@@H:10]1[CH2:12][CH2:14][CH2:16][CH2:15]2
Original error message: More than one reaction found for SLAP reagent 'CC(OC[Si](C)(C)C)C(

In [4]:
# Process data. This includes generating reaction graphs and takes some time.

data.process({"dataset_0D": dict(
    reaction=True, 
    global_features=["OHE",],
    global_featurizer_state_dict_path=ohe_state_dict,
    graph_type="bond_edges", 
    featurizers="custom",
),
             "dataset_1D_slap": dict(
    reaction=True, 
    global_features=None, 
    graph_type="bond_nodes", 
    featurizers="custom",
),
              "dataset_1D_aldehyde": dict(
    reaction=True, 
    global_features=None, 
    graph_type="bond_nodes", 
    featurizers="custom",
),
              "dataset_2D": dict(
    reaction=True, 
    global_features=None, 
    graph_type="bond_nodes", 
    featurizers="custom",
),
            })

In [5]:
# run all the predictions

if data.dataset_0D:
    # load the trained model if it is not loaded
    if isinstance(model_0D, str):
        model_0D = load_trained_model("FFN", model_0D)
        model_0D.eval()
    trainer = pl.Trainer(accelerator="gpu", logger=False, max_epochs=-1)
    dl = DataLoader(data.dataset_0D, collate_fn=collate_fn)
    probabilities_0D = torch.concat(trainer.predict(model_0D, dl))
    predictions_0D = (probabilities_0D > 0.5).numpy().astype(float)
    

if data.dataset_1D_aldehyde:
    # load the trained model if it is not loaded
    if isinstance(model_1D, str):
        model_1D = load_trained_model("D-MPNN", model_1D)
        model_1D.eval()
    trainer = pl.Trainer(accelerator="gpu", logger=False, max_epochs=-1)
    dl = DataLoader(data.dataset_1D_aldehyde, collate_fn=collate_fn)
    probabilities_1D_aldehyde = torch.concat(trainer.predict(model_1D, dl))
    predictions_1D_aldehyde = (probabilities_1D_aldehyde > 0.5).numpy().astype(float)

if data.dataset_1D_slap:
    # load the trained model if it is not loaded
    if isinstance(model_1D, str):
        model_1D = load_trained_model("D-MPNN", model_1D)
        model_1D.eval()
    trainer = pl.Trainer(accelerator="gpu", logger=False, max_epochs=-1)
    dl = DataLoader(data.dataset_1D_slap, collate_fn=collate_fn)
    probabilities_1D_slap = torch.concat(trainer.predict(model_1D, dl))
    predictions_1D_slap = (probabilities_1D_slap > 0.5).numpy().astype(float)

if data.dataset_2D:
    # load the trained model if it is not loaded
    if isinstance(model_2D, str):
        model_2D = load_trained_model("D-MPNN", model_2D)
        model_2D.eval()
    trainer = pl.Trainer(accelerator="gpu", logger=False, max_epochs=-1)
    dl = DataLoader(data.dataset_2D, collate_fn=collate_fn)
    probabilities_2D = torch.concat(trainer.predict(model_2D, dl))
    predictions_2D = (probabilities_2D > 0.5).numpy().astype(float)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

  batched_global_features = torch.tensor(global_features, dtype=torch.float32)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [6]:
# assemble outputs
predictions = np.full(len(data.reactions), np.nan, dtype=float)

predictions[data.idx_known] = [statistics.mean(data.known_outcomes[i]) for i in data.idx_known]  # for known reaction we add the average reaction outcome
try:
    predictions[data.idx_0D] = predictions_0D
except NameError:
    pass
try:
    predictions[data.idx_1D_slap] = predictions_1D_slap
except NameError:
    pass
try:
    predictions[data.idx_1D_aldehyde] = predictions_1D_aldehyde
except NameError:
    pass
try:
    predictions[data.idx_2D] = predictions_2D
except NameError:
    pass


In [7]:
# check if we have not predicted for anything
# this should be only the reactions in data.invalid_idxs
rxn_idxs_no_pred = np.argwhere(np.isnan(predictions)).flatten()

rxn_idxs_invalid = [data.product_idxs.index(i) for i in data.invalid_idxs]

assert set(rxn_idxs_no_pred) == set(rxn_idxs_invalid)

In [8]:
# convert the 1D- product_idxs to the directionally reverse 2D indices
arr = np.full((len(data.smiles), 2), fill_value=-1)
last_idx = -1
for i, idx in enumerate(data.product_idxs):
    if idx == last_idx:
        arr[idx, 1] = i
    else:
        last_idx = idx
        arr[idx, 0] = i
     

In [9]:
confidence_dict = {
    "known": 0,
    "0D": 1,
    "1D_SLAP": 2,
    "1D_aldehyde": 2,
    "2D_similar": 3,
    "2D_dissimilar": 4,
}

In [10]:
# translate problem type to integer
rxn_problem_types = list(map(confidence_dict.get, data.problem_type))

In [11]:
# we add a nonsense value to the end of each of these lists so that indexing with -1 will return the nonsense value
reactions_augmented = data.reactions + [""]
predictions_augmented = list(predictions) + [np.nan]
rxn_problem_types_augmented = rxn_problem_types + [99]


In [12]:
# obtain individual new columns for output df
df["rxn1_smiles"] = [data.reactions[i] for i in arr[:,0]]

df["rxn1_predictions"] = [predictions[i] for i in arr[:,0]]

df["rxn1_confidence"] = [rxn_problem_types[i] for i in arr[:,0]]

df["rxn2_smiles"] = [reactions_augmented[i] for i in arr[:,1]]

df["rxn2_predictions"] = [predictions_augmented[i] for i in arr[:,1]]

df["rxn2_confidence"] = [rxn_problem_types_augmented[i] for i in arr[:,1]]

In [13]:
# write dataset statistics for control to log file (+ optionally print)
verbose = True
log_output = f"""\
{len(data.reactions)} reactions generated from {len(data.smiles)} input SMILES
Known reactions: {(sum(x is not None for x in data.known_outcomes))}
0D reactions: 0, thereof 0 predicted positive
1D reactions with unknown aldehyde: {len(data.dataset_1D_aldehyde)}, thereof {np.count_nonzero(predictions_1D_aldehyde)} predicted positive
1D reactions with unknown SLAP reagent: {len(data.dataset_1D_slap)}, thereof {np.count_nonzero(predictions_1D_slap)} predicted positive
2D reactions: {len(data.dataset_2D)}, thereof {np.count_nonzero(predictions_2D)} predicted positive
"""

with open(raw_dir / f"{filename_base}_reaction_prediction.log", "w") as file:
    file.write(log_output)
if verbose:
    print(log_output)

13660 reactions generated from 8733 input SMILES
Known reactions: 0
0D reactions: 0, thereof 0 predicted positive
1D reactions with unknown aldehyde: 52, thereof 29 predicted positive
1D reactions with unknown SLAP reagent: 245, thereof 104 predicted positive
2D reactions: 13352, thereof 4756 predicted positive



In [14]:
# write df to output file
df.to_csv(raw_dir / f"{filename_base}_reaction_prediction.csv", index=False)