# Inference using the Synthetic Fermentation models

Predict reaction outcome for Synthetic Fermentation products.

The input to this notebook is the SMILES of the desired product(s).
Inputs can be supplied directly as a csv file with one column named "smiles" and arbitrary additional columns.

The output is:
- the reactionSMILES that leads to this product using the SLAP platform
- a classification of whether the reaction is expected to work

The output is written to a new csv file containing all columns from the input file, and six new columns: `rxn1_smiles`, `rxn1_predictions`, `rxn1_confidence`, `rxn2_smiles`, `rxn2_predictions`, `rxn2_confidence`.

Predictions are given as `0` (meaning no reaction expected) or `1` (meaning successful reaction expected). 
If the reaction was in the acquired data set, the known outcome is returned instead.

Confidence is given as an integer in the range `0-4`, with `0` indicating the highest confidence.
Confidence is determined based on the complexity of the prediction problem using the following mapping:
- `0`: known reaction
- `1`: all three reactants known in other reactions
- `2`: exactly two reactants known in other reactions
- `3`: exactly one reactant known in other reactions
- `4`: none of the reactants known in other reactions


In [None]:
import pathlib
import statistics
import sys
sys.path.append(str(pathlib.Path().absolute().parent))

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

from src.util.definitions import TRAINED_MODEL_DIR, LOG_DIR, DATA_ROOT
from src.model.classifier import load_trained_model
from src.data.dataloader import SynFermDataset, collate_fn
from reaction_generator import SFReactionGenerator

In [None]:
def import_smiles(
    raw_dir: pathlib.Path, filename: str, valid_idx_file: pathlib.Path = None
):
    """Load a table of SMILES from a csv file, optionally keeping only selected rows.

    Args:
        raw_dir: Directory that contains the csv file.
        filename: Name of the csv file inside ``raw_dir``.
        valid_idx_file: Optional csv file with a column named "index". If given,
            only the rows whose index labels are listed in that column are
            returned, in the listed order. If None, all rows are returned.

    Returns:
        The DataFrame read from the csv file, restricted to the requested rows
        when ``valid_idx_file`` is given.
    """
    table = pd.read_csv(raw_dir / filename)
    if valid_idx_file is not None:
        # label-based selection: the listed values are matched against the DataFrame index
        wanted_rows = pd.read_csv(valid_idx_file)["index"].to_numpy()
        table = table.loc[wanted_rows]
    return table

In [None]:
# paths to the best models
# model_0D_name is the run identifier; it is reused further down to locate the
# decision-threshold file of the same run.
model_0D_name = "2023-11-20-175433_236136_fold0"
model_0D = LOG_DIR / "checkpoints" / model_0D_name / "last-epoch38-val_loss0.20.ckpt"  # FFN
# The 1D/2D/3D models are currently disabled; only the 0D model is used in this notebook.
#model_1D = TRAINED_MODEL_DIR / "2023-03-06-112027_188465" / "best.ckpt"  # D-MPNN
#model_2D = TRAINED_MODEL_DIR / "2023-03-06-112721_778803" / "best.ckpt"  # D-MPNN
#model_3D
# path to the OneHotEncoder state for model_0D
ohe_state_dict = LOG_DIR / "OHE_state_dict_KcEovvzIEafcIYUJ.json"

To use the notebook on your products, change `raw_dir` to the directory that your CSV file containing SMILES is in. Then change `filename` to the name of your CSV file, including the `.csv` suffix (`filename_base`, the name without the suffix, is derived from it automatically). If you do not want to use all the SMILES in your file (e.g. because some are not valid SLAP products), supply a `valid_idx_file`. You can set the value to `None` if you want to use all SMILES.

In [None]:
# Import the product SMILES from the input csv file, optionally restricted to selected rows.
raw_dir = DATA_ROOT  # <-- change me
filename = "synferm_dataset_2023-09-05_40018records.csv"  # <-- change me
# remove the .csv extension AND any other extensions behind it (e.g. remove .csv.bz2 or csv.gz)
# filename_base is used further down to name the output csv and log files
filename_base = filename.split(".csv")[0]
# csv file with an "index" column listing the rows of the input file to keep
valid_idx_file = "../data/splits/synferm_dataset_2023-09-05_0D_split_final-retrain/fold0_val.csv"  # <-- change me or set me to None
df = import_smiles(raw_dir, filename, valid_idx_file=valid_idx_file)
#data = SLAPProductDataset(smiles=df["smiles"].values.tolist())
# display the imported data
df

In [None]:
# save the selected rows to DATA_ROOT / "inference_test.csv"; the dataset cell below
# reads this file by name. index=True writes the DataFrame index to the file as well.
df.to_csv(DATA_ROOT / "inference_test.csv", index=True)

In [None]:
# for the moment we assume everything is 0D data
# Build the dataset from the csv file written in the previous cell.
# NOTE(review): the meaning of the individual arguments is defined by SynFermDataset
# (src.data.dataloader) and is not visible here — confirm against that class.
data = SynFermDataset(
    name="inference_test.csv",  # file name inside raw_dir
    raw_dir=DATA_ROOT,
    reaction=True,
    global_features=["OHE", ],
    global_featurizer_state_dict_path=ohe_state_dict,  # OneHotEncoder state saved for model_0D
    graph_type="bond_edges",
    featurizers="custom",
    smiles_columns=["reaction_smiles_atom_mapped"],  # this column must be present in the csv file
    label_columns=None,  # presumably: no labels are loaded for inference — TODO confirm
    task="multilabel"
)


In [None]:
# run the predictions

# Load the trained model. `model_0D` initially holds the checkpoint path and is
# rebound to the loaded model; the guard keeps this cell re-runnable (a second run
# would otherwise pass the model object as the checkpoint path).
if isinstance(model_0D, (str, pathlib.PurePath)):
    model_0D = load_trained_model("FFN", model_0D)
model_0D.eval()
# "auto" uses a GPU when one is available and falls back to CPU otherwise,
# so the notebook also runs on machines without a GPU
trainer = pl.Trainer(accelerator="auto", logger=False, max_epochs=-1)
# prepare data
dl = DataLoader(data, collate_fn=collate_fn, num_workers=0)
# predict: concatenate the per-batch model outputs and map them to probabilities with a sigmoid
probabilities_0D = torch.sigmoid(torch.concat(trainer.predict(model_0D, dl)))
    

In [None]:
# display the predicted probabilities
probabilities_0D

In [None]:
# load decision thresholds
# The file is named after the model run and holds one threshold per line.
# Blank lines (e.g. a trailing empty line) are skipped instead of crashing float().
with open(LOG_DIR / "thresholds" / f"{model_0D_name}.txt", "r", encoding="utf-8") as f:
    thresholds = [float(line) for line in f if line.strip()]
print(thresholds)

In [None]:
# apply the thresholds
# Each of the first three probability columns is compared against its own threshold;
# the boolean result is cast to int64, giving 1 above the threshold and 0 otherwise.
preds = torch.stack(
    [(probabilities_0D[:, col] > thresholds[col]).to(torch.int64) for col in range(3)],
    dim=1,
)

In [None]:
# combine with data
# Convert the tensors to NumPy arrays explicitly so that pandas stores plain numeric
# columns instead of relying on implicit tensor-to-array conversion.
df[["prob_A", "prob_B", "prob_C"]] = probabilities_0D.detach().cpu().numpy()
df[["pred_A", "pred_B", "pred_C"]] = preds.detach().cpu().numpy()

In [None]:
# display the data frame, which now includes the probability and prediction columns
df

In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, fbeta_score

In [None]:
# CONTROL: check that we still obtain the same metrics
# Only label A is evaluated here: the "binary_A" column of df is compared with
# the thresholded predictions in "pred_A".
# NOTE(review): "binary_A" is presumably the ground-truth label from the input file — confirm.
y_true = df["binary_A"]
y_pred = df["pred_A"]
acc = accuracy_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
# beta=0.5 weights precision more heavily than recall
fbeta = fbeta_score(y_true, y_pred, beta=0.5)

# report all metrics rounded to two decimals
print("accuracy:", f"{acc:.2f}")
print("balanced accuracy:", f"{bal_acc:.2f}")
print("recall:", f"{recall:.2f}")
print("precision:", f"{precision:.2f}")
print("f_0.5 score:", f"{fbeta:.2f}")

In [None]:
# assemble outputs
# One slot per reaction, initialised to NaN; slots that stay NaN mark reactions
# for which no prediction was made.
predictions = np.full(len(data.reactions), np.nan, dtype=float)

# for known reactions we add the average reaction outcome
predictions[data.idx_known] = [statistics.mean(data.known_outcomes[i]) for i in data.idx_known]
# Each block below fills in the predictions of one model. A NameError (the
# predictions_* variable does not exist because that model was not run) is
# deliberately ignored, leaving the corresponding slots at NaN.
# NOTE(review): none of predictions_0D, predictions_1D_slap, predictions_1D_aldehyde
# or predictions_2D is defined in the cells above (those produce `probabilities_0D`
# and `preds`), so all four assignments are currently skipped — confirm this is intended.
try:
    predictions[data.idx_0D] = predictions_0D
except NameError:
    pass
try:
    predictions[data.idx_1D_slap] = predictions_1D_slap
except NameError:
    pass
try:
    predictions[data.idx_1D_aldehyde] = predictions_1D_aldehyde
except NameError:
    pass
try:
    predictions[data.idx_2D] = predictions_2D
except NameError:
    pass


In [None]:
# check if we have not predicted for anything
# this should be only the reactions in data.invalid_idxs
# indices of all reactions whose prediction slot is still NaN
rxn_idxs_no_pred = np.argwhere(np.isnan(predictions)).flatten()

# translate each invalid product index into its position within data.product_idxs
# NOTE(review): assumes data.product_idxs is a list aligned with the reactions — confirm.
rxn_idxs_invalid = [data.product_idxs.index(i) for i in data.invalid_idxs]

# the two index sets must match exactly, otherwise a valid reaction was left without a prediction
assert set(rxn_idxs_no_pred) == set(rxn_idxs_invalid)

In [None]:
# obtain individual new columns for output df
# Column 0 of `arr` indexes the first reaction of each row, column 1 the second.
# NOTE(review): `arr`, `rxn_problem_types`, `reactions_augmented`,
# `predictions_augmented` and `rxn_problem_types_augmented` are not defined in the
# cells above; this cell raises NameError unless they are created elsewhere — confirm.
df["rxn1_smiles"] = [data.reactions[i] for i in arr[:,0]]

df["rxn1_predictions"] = [predictions[i] for i in arr[:,0]]

df["rxn1_confidence"] = [rxn_problem_types[i] for i in arr[:,0]]

df["rxn2_smiles"] = [reactions_augmented[i] for i in arr[:,1]]

df["rxn2_predictions"] = [predictions_augmented[i] for i in arr[:,1]]

df["rxn2_confidence"] = [rxn_problem_types_augmented[i] for i in arr[:,1]]

In [None]:
# write dataset statistics for control to log file (+ optionally print)
verbose = True
# NOTE(review): the "0D reactions" line is hard-coded to 0, and predictions_1D_aldehyde,
# predictions_1D_slap and predictions_2D are not defined in the cells above, so
# building this string raises NameError unless they are created elsewhere — confirm.
log_output = f"""\
{len(data.reactions)} reactions generated from {len(data.smiles)} input SMILES
Known reactions: {(sum(x is not None for x in data.known_outcomes))}
0D reactions: 0, thereof 0 predicted positive
1D reactions with unknown aldehyde: {len(data.dataset_1D_aldehyde)}, thereof {np.count_nonzero(predictions_1D_aldehyde)} predicted positive
1D reactions with unknown SLAP reagent: {len(data.dataset_1D_slap)}, thereof {np.count_nonzero(predictions_1D_slap)} predicted positive
2D reactions: {len(data.dataset_2D)}, thereof {np.count_nonzero(predictions_2D)} predicted positive
"""

# the log file is written next to the input file, named after the input file
with open(raw_dir / f"{filename_base}_reaction_prediction.log", "w") as file:
    file.write(log_output)
if verbose:
    print(log_output)

In [None]:
# write df to output file
# The file is written next to the input file as "<filename_base>_reaction_prediction.csv";
# index=False omits the DataFrame index from the file.
df.to_csv(raw_dir / f"{filename_base}_reaction_prediction.csv", index=False)