# Correct virtual library

We have found a problem with the products originating from TerTH010:
The E-configuration of the double bond in this terminator was not transferred to the products.
We will fix this here.

In [None]:
import pathlib
import sys

import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdchem import BondStereo

sys.path.append(str(pathlib.Path().resolve().parents[1]))
from src.util.db_utils import SynFermDatabaseConnection
from src.library_design.reaction_generator import SFReactionGenerator
from src.util.rdkit_util import desalt_building_block, remove_monomer_pg_chirality

In [None]:
# Column names of the result table; they mirror the SELECT list of the query below, in order.
header = ["id", "initiator_long", "monomer_long", "terminator_long", "type", "SMILES"]

# Fetch every virtual-library entry that was built with terminator TerTH010.
con = SynFermDatabaseConnection()
cursor = con.con.execute("SELECT id, initiator_long, monomer_long, terminator_long, type, SMILES FROM virtuallibrary WHERE terminator_long = 'TerTH010'")
res = cursor.fetchall()

df = pd.DataFrame(res, columns=header)
# preview the first rows
df.head()

In [None]:
# SMARTS pattern to identify the double bond in products originating from TerTH010.
# It matches four atoms, in this order:
#   0:    the carbon that used to be the thiohydrazide carbonyl,
#   1, 2: the two double bond atoms,
#   3:    the first atom of the phenyl ring on the other side.
# Note that on the thiohydrazide side the pattern is deliberately less specific to account
# for the non-aromatized products. In particular, the bond from the carbonyl-C to N is left
# unspecified (`~`) so that it matches either an aromatic or a double bond.
pat = Chem.MolFromSmarts("[$([#6]([#16])~[#7][#7])]-C=C-[$([cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1)]") 
# last expression of the cell: display the parsed pattern
pat

In [None]:
# Add the missing E double bond stereochemistry to every TerTH010 product except types F and G.
# `with_stereo` receives exactly one entry per selected dataframe row, in row order:
# the new SMILES, or None where the library holds no SMILES for that product.
with_stereo = []
non_fg = df.loc[~df.type.isin(["F", "G"]), ["SMILES", "type"]]
for smi, t in zip(non_fg["SMILES"], non_fg["type"]):
    if not smi:
        # for some product H, no SMILES exist, b/c it would be chemically invalid
        with_stereo.append(None)
        continue

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with context
        # instead of an AttributeError further down.
        raise ValueError(f"RDKit could not parse SMILES {smi!r} (product type {t!r})")

    atom_idx = mol.GetSubstructMatches(pat)

    # the dimer (type E) should have two matches of course, every other product exactly one
    expected_matches = 2 if t == "E" else 1
    if len(atom_idx) != expected_matches:
        raise ValueError(
            f"Expected {expected_matches} double bond match(es) for product type {t!r}, "
            f"found {len(atom_idx)} in {smi!r}"
        )

    for a_idx in atom_idx:
        bond = mol.GetBondBetweenAtoms(a_idx[1], a_idx[2])

        # SetStereoAtoms takes (neighbor of the bond's begin atom, neighbor of the bond's
        # end atom), so orient the two reference atoms by the bond's actual begin atom
        # rather than by comparing atom indices.
        if bond.GetBeginAtomIdx() == a_idx[1]:
            prec, subs = a_idx[0], a_idx[3]
        else:
            prec, subs = a_idx[3], a_idx[0]
        bond.SetStereoAtoms(prec, subs)

        # set stereo descriptor
        bond.SetStereo(BondStereo.STEREOE)

        # set direction of preceding and following bond
        mol.GetBondBetweenAtoms(a_idx[0], a_idx[1]).SetBondDir(Chem.rdchem.BondDir.ENDUPRIGHT)
        mol.GetBondBetweenAtoms(a_idx[2], a_idx[3]).SetBondDir(Chem.rdchem.BondDir.ENDUPRIGHT)

    Chem.SanitizeMol(mol)
    with_stereo.append(Chem.MolToSmiles(mol))

In [None]:
# add the new product SMILES back to dataframe
# (with_stereo holds one entry per row selected by this same mask, in the same order,
# so the list is assigned position by position)
df.loc[~df.type.isin(["F", "G"]), "SMILES"] = with_stereo

## Double-check the products with SFReactionGenerator
(we can only check type A)

In [None]:
# reaction generator used below to re-derive products from their building blocks
rxn_generator = SFReactionGenerator()

In [None]:
# Map every building block's long name to the SMILES of its desalted structure.
# Building blocks whose long name starts with "Mon", "Fused" or "Spiro" (presumably the
# monomers) are additionally passed through remove_monomer_pg_chirality.
building_blocks = {}
for bb_name, bb_smiles in con.building_blocks():
    bb_mol = desalt_building_block(bb_smiles)
    if bb_name.startswith(("Mon", "Fused", "Spiro")):
        bb_mol = remove_monomer_pg_chirality(bb_mol)
    building_blocks[bb_name] = Chem.MolToSmiles(bb_mol)

In [None]:
# One (initiator_long, monomer_long, terminator_long, type) tuple per type-A row, in row order.
type_a_rows = df.loc[df.type == "A", ["initiator_long", "monomer_long", "terminator_long", "type"]]
reactants = list(type_a_rows.itertuples(index=False, name=None))

In [None]:
# Re-generate the type-A products from their building blocks.
# `prods` collects the product SMILES (None where the generator raised RuntimeError);
# the reactant tuples that failed are kept in `failed_reactants` for inspection.
prods = []
failed_reactants = []
for reactant_row in reactants:
    if reactant_row[3] != "A":
        continue
    try:
        bb_mols = [Chem.MolFromSmiles(building_blocks[bb_name]) for bb_name in reactant_row[:3]]
        product = rxn_generator.generate_product(bb_mols)
        prods.append(Chem.MolToSmiles(product))
    except RuntimeError:
        failed_reactants.append(reactant_row)
        prods.append(None)


In [None]:
# compare SFReactionGeneratorProducts
# Print every pair that differs: the generated SMILES first, the dataframe SMILES second.
corrected_smiles = df.loc[df.type == "A", "SMILES"]
for generated, corrected in zip(prods, corrected_smiles):
    if generated == corrected:
        continue
    print(generated)
    print(corrected)

In [None]:
# show the reactant tuples for which product generation raised a RuntimeError
failed_reactants

### Conclusion
This has worked fine. Only for the problematic initiator 4-Pyrazole002 did the SFReactionGenerator find no product, which is expected. We write the new SMILES with added double bond stereochemistry back to the database.

In [None]:
# [SMILES, id] pairs, in the parameter order of the UPDATE statement that consumes them
data = df[["SMILES", "id"]].to_numpy().tolist()
# display for a visual check before writing
data

In [None]:
# write the SMILES back to the virtual library, one parameterized UPDATE per [SMILES, id] pair
# (the commit is issued separately in a later cell)
con.con.executemany("UPDATE virtuallibrary SET SMILES = ? WHERE id = ?", data)

In [None]:
# sanity check: change counter of the connection (presumably a sqlite3 connection, where this
# counts rows modified since the connection was opened) -- compare with len(df)
con.con.total_changes

In [None]:
# number of rows in the dataframe, i.e. the number of UPDATEs that were issued
len(df)

In [None]:
# make the updated SMILES permanent
con.con.commit()

## Some errors remain
As mentioned above, we could only check correctness with the SFReactionGenerator for product type A. Meanwhile, I have implemented and tested the other product types, and it seems that the VL contains incorrect Z-stereochemistry for some products D, E, and H containing TerTH010. Since we now have the SFReactionGenerator working for those as well, we can easily fix it.

In [None]:
# fresh reaction generator for re-deriving the D, E and H products
rxn_generator = SFReactionGenerator()

In [None]:
# Column names of the result table; they mirror the SELECT list of the query below, in order.
header = ["id", "initiator_long", "monomer_long", "terminator_long", "type", "SMILES"]

# Fetch the TerTH010 products of type D, E and H, leaving out initiator 4-Pyrazole002.
con = SynFermDatabaseConnection()
cursor = con.con.execute("SELECT id, initiator_long, monomer_long, terminator_long, type, SMILES FROM virtuallibrary WHERE terminator_long = 'TerTH010' AND type IN ('D', 'E', 'H') AND initiator_long != '4-Pyrazole002'")
res = cursor.fetchall()

df = pd.DataFrame(res, columns=header)
# preview the first rows
df.head()

In [None]:
# Map every building block's long name to the SMILES of its desalted structure.
# Building blocks whose long name starts with "Mon", "Fused" or "Spiro" (presumably the
# monomers) are additionally passed through remove_monomer_pg_chirality.
building_blocks = {}
for bb_name, bb_smiles in con.building_blocks():
    bb_mol = desalt_building_block(bb_smiles)
    if bb_name.startswith(("Mon", "Fused", "Spiro")):
        bb_mol = remove_monomer_pg_chirality(bb_mol)
    building_blocks[bb_name] = Chem.MolToSmiles(bb_mol)

In [None]:
# One (initiator_long, monomer_long, terminator_long, type) tuple per dataframe row, in row order.
reactant_columns = df[["initiator_long", "monomer_long", "terminator_long", "type"]]
reactants = list(reactant_columns.itertuples(index=False, name=None))

In [None]:
# Re-generate every product from its building blocks, using the product type stored in the row.
# `prods` gets one entry per reactant tuple: the product SMILES, or None when the generator
# returned nothing or raised RuntimeError (the latter are also kept in `failed_reactants`).
prods = []
failed_reactants = []
for reactant_row in reactants:
    try:
        bb_mols = [Chem.MolFromSmiles(building_blocks[bb_name]) for bb_name in reactant_row[:3]]
        product = rxn_generator.generate_product(bb_mols, product_type=reactant_row[3])
        prods.append(Chem.MolToSmiles(product) if product else None)
    except RuntimeError:
        failed_reactants.append(reactant_row)
        prods.append(None)


In [None]:
# number of generated entries; should equal the number of dataframe rows
len(prods)

In [None]:
# number of SMILES entries (rows) in the dataframe, for comparison with len(prods)
len(df["SMILES"])

In [None]:
# compare SFReactionGeneratorProducts
# Print every pair that differs: the generated SMILES first, the library SMILES second.
library_smiles = df["SMILES"].values.tolist()
for generated, from_library in zip(prods, library_smiles):
    if generated == from_library:
        continue
    print(generated)
    print(from_library)

### Conclusion
This has worked. The differences we see are solely double bond stereo and `prods` is in the correct E-configuration. We write the new SMILES with correct double bond stereochemistry back to the database.

In [None]:
# shape of the SMILES column (rows, 1) before it is overwritten
df[["SMILES"]].shape

In [None]:
# replace the library SMILES with the freshly generated ones (prods is aligned with df rows)
# NOTE(review): entries of prods that are None (generator returned nothing or raised) will be
# written to the database as NULL -- confirm this is intended for every such row.
df["SMILES"] = prods

In [None]:
# [SMILES, id] pairs, in the parameter order of the UPDATE statement that consumes them
data = df[["SMILES", "id"]].to_numpy().tolist()
# display for a visual check before writing
data

In [None]:
# write the SMILES back to the virtual library, one parameterized UPDATE per [SMILES, id] pair
# (the commit is issued separately in a later cell)
con.con.executemany("UPDATE virtuallibrary SET SMILES = ? WHERE id = ?", data)

In [None]:
# sanity check: change counter of the connection (presumably a sqlite3 connection, where this
# counts rows modified since the connection was opened) -- compare with the number of rows in df
con.con.total_changes

In [None]:
# make the updated SMILES permanent
con.con.commit()