# Inference using the Synthetic Fermentation models

We export the validation plate data for predictions


In [None]:
import pathlib
import statistics
import sys
sys.path.append(str(pathlib.Path().absolute().parent))

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

from src.util.definitions import TRAINED_MODEL_DIR, LOG_DIR, DATA_ROOT
from src.model.classifier import load_trained_model
from src.data.dataloader import SynFermDataset, collate_fn
from reaction_generator import SFReactionGenerator

To use the notebook on your own products, change `raw_dir` to the directory that contains your CSV file of SMILES. Then change `filename` to the name of that CSV file (the cell below passes the full file name, including the `.csv` suffix). If you do not want to use all the SMILES in your file (e.g. because some are not valid SLAP products), supply a `valid_idx_file`. You can set the value to `None` if you want to use all SMILES.

In [None]:
# Import experiment data
# Directory that holds the input CSV, and the name of the file to read from it.
raw_dir = DATA_ROOT
filename = "validation-plate_candidates.csv"
# None: no index file is supplied, i.e. all SMILES in the file are used.
valid_idx_file = None
# NOTE(review): `import_smiles` is not among the imports visible at the top of
# this notebook -- confirm where it is defined/imported.
df = import_smiles(raw_dir, filename, valid_idx_file=valid_idx_file)
df.head()

In [None]:
# Sanity check: were all of these actually in the training data for the model we use?
# Gather the per-kind training tables stored next to the trained model into one frame.
model_dir = TRAINED_MODEL_DIR / model_0D_name
train_files = [
    model_dir / f"train_{kind}.csv"
    for kind in ("initiators", "monomers", "terminators")
]
dfs = [pd.read_csv(path) for path in train_files]
used_building_blocks = pd.concat(dfs)
used_building_blocks


In [None]:
# Spot check: does "BiAl005" occur anywhere in the building block table (any column)?
"BiAl005" in used_building_blocks.values

In [None]:
# Anything that shows up here will not be recognized by the one-hot encoder:
# rows where at least one building block named in `long_name` (names are
# separated by "+") is missing from the `long` column of the training tables.
# The lookup set is built once so that each membership test is a hash lookup
# instead of a scan over the whole column.
known_building_blocks = set(used_building_blocks["long"])
df.loc[
    ~df["long_name"]
    .str.split("+")
    .apply(lambda names: all(name.strip() in known_building_blocks for name in names))
]

In [None]:
# Build the reaction SMILES for every product SMILES. This takes a moment, but
# it only has to run once: the result is written to disk and can be reloaded later.
gen = SFReactionGenerator()
product_smiles = df["product_A_smiles"]
df["reaction_smiles_atom_mapped"] = product_smiles.apply(gen.get_reaction_smiles)
# persist the table (including the new column) for later runs
reaction_smiles_file = DATA_ROOT / "validation-plate_candidates_reactionSMILES.csv"
df.to_csv(reaction_smiles_file)

In [None]:
# reload the table with reaction SMILES from disk, so the slow generation step
# does not have to be repeated on later runs
# index_col=0: the first CSV column is used as the index
df = pd.read_csv(DATA_ROOT / "validation-plate_candidates_reactionSMILES.csv", index_col=0)
df.head()

In [None]:
# instantiate data set. This will also take a moment
# The dataset reads the CSV with reaction SMILES from DATA_ROOT and uses the
# `reaction_smiles_atom_mapped` column as its SMILES input.
# NOTE(review): `ohe_state_dict` is not defined in the cells visible here --
# confirm where it is set before running this cell.
data = SynFermDataset(
    name="validation-plate_candidates_reactionSMILES.csv",
    raw_dir=DATA_ROOT,
    reaction=True,
    global_features=["OHE", ],
    global_featurizer_state_dict_path=ohe_state_dict,
    graph_type="bond_edges",
    featurizers="custom",
    smiles_columns=["reaction_smiles_atom_mapped"],
    label_columns=None,  # presumably no labels are needed for inference -- TODO confirm
    task="multilabel"
)

In [None]:
# run the predictions

# load the trained model
# NOTE(review): `model_0D` is rebound here from the value passed to
# `load_trained_model` to the loaded model itself, so re-running this cell
# passes the model object back in -- reset `model_0D` before re-running.
model_0D = load_trained_model("FFN", model_0D)
model_0D.eval()
# "auto" picks an available accelerator and falls back to the CPU, so the
# notebook also runs on machines without a GPU (a hard-coded "gpu" raises there).
trainer = pl.Trainer(accelerator="auto", logger=False, max_epochs=-1)
# prepare data
dl = DataLoader(data, collate_fn=collate_fn, num_workers=0)
# predict: concatenate the outputs returned by `predict` and squash them to
# probabilities with a sigmoid
probabilities_0D = torch.sigmoid(torch.concat(trainer.predict(model_0D, dl)))
    

In [None]:
# inspect the predicted probabilities (the sigmoid outputs computed in the prediction cell)
probabilities_0D

In [None]:
# Read the decision thresholds stored for this model: one float per line.
thresholds_file = LOG_DIR / "thresholds" / f"{model_0D_name}.txt"
with open(thresholds_file, "r") as f:
    thresholds = list(map(float, f))
print(thresholds)

In [None]:
# Binarize the probabilities: each of the three output columns is compared
# against its own threshold, giving 1 where the probability exceeds it and 0 otherwise.
above_threshold = [probabilities_0D[:, i] > thresholds[i] for i in range(3)]
preds = torch.stack([torch.where(mask, 1, 0) for mask in above_threshold], dim=1)

In [None]:
# combine with data: one probability column and one binary prediction column
# per output (A, B, C)
# NOTE(review): the torch tensors are assigned to the DataFrame columns directly;
# this assumes they have one row per row of `df` and three columns -- confirm.
df[["prob_A", "prob_B", "prob_C"]] = probabilities_0D
df[["pred_A", "pred_B", "pred_C"]] = preds

In [None]:
# preview the table, now including the probability and prediction columns
df.head()

In [None]:
# summarize our predictions: how often each value occurs in `pred_A`
df["pred_A"].value_counts()

In [None]:
# distribution of the predicted probabilities in `prob_A` (histogram, 100 bins)
df["prob_A"].plot.hist(bins=100)

In [None]:
# boolean mask of the rows with a positive `pred_A` (only displayed, not stored)
(df["pred_A"] == 1)

In [None]:
# Save the table with probabilities and predictions; the index is not written.
predictions_file = DATA_ROOT / "validation-plate_candidates_predictions_2023-12-20.csv"
df.to_csv(predictions_file, index=False)

In [None]:
# Load the earlier predictions (from 2023-09-05) to compare against.
old_predictions_file = DATA_ROOT / "validation-plate_candidates_predictions.csv"
old_df = pd.read_csv(old_predictions_file)
old_df.head()

In [None]:
# both tables must list the same `vl_id` in every row, otherwise the row-wise
# comparisons of old and new predictions are meaningless
assert (df["vl_id"] == old_df["vl_id"]).all()

In [None]:
# most predictions are the same as with the last model
# True = `pred_A` is identical in both tables, False = the prediction changed
(df["pred_A"] == old_df["pred_A"]).value_counts()

In [None]:
# How many of the old model's positives are still positives with the new model?
# Restrict both tables to the rows the old model predicted as positive and
# compare the new predictions with the old ones on exactly those rows.
old_positive = old_df["pred_A"] == 1
new_pred_on_old_positives = df.loc[old_positive, "pred_A"]
old_pred_on_old_positives = old_df.loc[old_positive, "pred_A"]
(new_pred_on_old_positives == old_pred_on_old_positives).value_counts()

## 1D plates
We also need to predict for the 1D plates (exp101). Since we have by now prepared the `inference.py` script, we will use that instead of rewriting the steps here.