# Add new products to VL database table

You will most likely need to run this after additions to the `building_blocks` table, e.g. through `add_new_buildingblocks_to_vl.ipynb`.

In [1]:
import pathlib
import sys
sys.path.append(str(pathlib.Path().resolve().parents[1]))

import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula, CalcExactMolWt

from src.util.db_utils import SynFermDatabaseConnection
from src.library_design.reaction_generator import SFReactionGenerator
from src.util.rdkit_util import desalt_building_block
from src.util.protecting_groups import pg_dict
from src.util.sumformula_manipulation import string_formula_substraction

In [2]:
con = SynFermDatabaseConnection()

## Determine missing building block combinations

In [3]:
# Determine all building block combinations already in the virtuallibrary table (* 8 different products)
vl = pd.DataFrame(con.con.execute("SELECT long_name, type FROM virtuallibrary").fetchall(), columns=["long_name", "type"])
vl_members = set([f"{l} - {t}" for _, (l, t) in vl.iterrows()])
len(vl_members)

2135936

In [4]:
# Determine all possible building block combinations (* 8 different products)
bbs = pd.DataFrame(con.con.execute("SELECT long, category FROM building_blocks").fetchall(), columns=["long", "category"])
bb_combinations = set([
    f"{i} + {m} + {t} - {product_type}" 
    for i in bbs.loc[bbs["category"] == "I", "long"] 
    for m in bbs.loc[bbs["category"] == "M", "long"]
    for t in bbs.loc[bbs["category"] == "T", "long"]
    for product_type in "ABCDEFGH"
])
len(bb_combinations)

2160208

In [5]:
# diagnostic only: Check if we have any combinations in the VL that are not in the building_blocks table
# should give an empty set
vl_members - bb_combinations

set()

In [6]:
# Determine all combinations of building blocks that are not in the VL
vl_missing = bb_combinations - vl_members
len(vl_missing)

24272

## Run enumeration of missing building block combinations

In [7]:
# first, import the reactants
reactants = set([n for name in vl_missing for n in name.split(" - ")[0].split(" + ")])
reactants = {long: Chem.Mol(desalt_building_block(con.get_mol(long=long))) for long in reactants}

In [8]:
# set up reaction generator
rxn_gen = SFReactionGenerator()

In [9]:
list(vl_missing)[:5]

['Ph043 + Spiro002 + TerABT011 - A',
 'Ph043 + Mon031 + TerABT018 - G',
 'Ph043 + Mon093 + TerTH016 - G',
 'Ph043 + Spiro014 + TerABT004 - D',
 'Ph043 + Mon100 + TerTH005 - E']

In [12]:
# for each combination, generate the product
prods = []
for item_missing in list(vl_missing):
    reac, product_type = item_missing.split(" - ")
    i, m, t = reac.split(" + ")
    prod = rxn_gen.generate_product([reactants[i], reactants[m], reactants[t]], product_type=product_type)
    prods.append(Chem.MolToSmiles(prod) if prod else None)



In [13]:
len(prods)

24272

## Add enumerated products to DB

In [14]:
def count_protecting_groups(initiator, monomer, terminator, product_type):
    i_boc, i_cbz, i_tbu, i_tms = range(4)
    if product_type in ['A', 'B', 'C']:
        boc = initiator[i_boc] + monomer[i_boc] + terminator[i_boc]
        cbz = initiator[i_cbz] + monomer[i_cbz] + terminator[i_cbz]
        tbu = initiator[i_tbu] + monomer[i_tbu] + terminator[i_tbu]
        tms = initiator[i_tms] + monomer[i_tms] + terminator[i_tms]
    elif product_type in ['D']:
        boc = initiator[i_boc] + terminator[i_boc]
        cbz = initiator[i_cbz] + terminator[i_cbz]
        tbu = initiator[i_tbu] + terminator[i_tbu]
        tms = initiator[i_tms] + terminator[i_tms]
    elif product_type in 'E':
        boc = 2 * terminator[i_boc]
        cbz = 2 * terminator[i_cbz]
        tbu = 2 * terminator[i_tbu]
        tms = 2 * terminator[i_tms]
    elif product_type in ['F', 'G']:
        boc = initiator[i_boc] + monomer[i_boc]
        cbz = initiator[i_cbz] + monomer[i_cbz]
        tbu = initiator[i_tbu] + monomer[i_tbu]
        tms = initiator[i_tms] + monomer[i_tms]
    elif product_type == 'H':
        boc = monomer[i_boc] + terminator[i_boc]
        cbz = monomer[i_cbz] + terminator[i_cbz]
        tbu = monomer[i_tbu] + terminator[i_tbu]
        tms = monomer[i_tms] + terminator[i_tms]
    else:
        raise ValueError(f'Invalid product type {product_type}')

    return boc, cbz, tbu, tms

pg_counts = []

for item_missing in list(vl_missing):
    reac, product_type = item_missing.split(" - ")
    i, m, t = reac.split(" + ")
    initiator = con.list_pg(long=i)
    monomer = con.list_pg(long=m)
    terminator = con.list_pg(long=t)
    pg_counts.append(count_protecting_groups(initiator, monomer, terminator, product_type))
    

In [15]:
len(pg_counts)

24272

In [16]:
lcms_props_1 = []
for smi in prods:
    if smi is not None:
        mol = Chem.MolFromSmiles(smi)
        lcms_props_1.append([CalcMolFormula(mol), CalcExactMolWt(mol)])
    else:
        lcms_props_1.append([None, None])

In [17]:
len(lcms_props_1)

24272

In [18]:
lcms_props_1[0]

['C30H33N3O10S2', 659.160736256]

In [19]:
# finally, we need the lcms formulae and masses
# do our magic with the protecting groups

lcms_formula_alt, lcms_mass_alt = [], []
for i, ((boc, cbz, tbu, tms), (molecular_formula_1, lcms_mass_1)) in enumerate(zip(pg_counts, lcms_props_1)):
    additional_formulae = []
    additional_masses = []
    if molecular_formula_1 is not None and lcms_mass_1 is not None:

        # generate all the combinations in that we need to leave PGs out
        combinations = [(i, j, k, l) for i in range(boc+1) for j in range(cbz+1) for k in range(tbu+1) for l in range(tms + 1) if sum([i,j,k,l]) > 0]
        pgname = ['boc', 'cbz', 'tbu', 'tms']
        for i in combinations:
            new_mass = lcms_mass_1 - i[0] * pg_dict['boc'][1] - i[1] * pg_dict['cbz'][1] - i[2] * pg_dict['tbu'][1] - i[3] * pg_dict['tms'][1]
            substrahend_formulae = [j * pg_dict[pgname[idx]][0] for idx, j in enumerate(i)]
            new_form = molecular_formula_1
            for s in substrahend_formulae:
                if s != '':
                    new_form = string_formula_substraction(new_form, s)
            additional_masses.append(new_mass)
            additional_formulae.append(new_form)
        additional_masses = [f'{i:.4f}' for i in additional_masses]
    lcms_formula_alt.append(additional_formulae)
    lcms_mass_alt.append(additional_masses)

In [20]:
len(lcms_formula_alt)

24272

In [21]:
len(lcms_mass_alt)

24272

In [22]:
lcms_formula_alt[:10]

[[], [], [], [], [], [], ['C20H26N2O7'], ['C17H20N2O7'], [], []]

In [23]:
# finally, we just assemble everything and write it to the table
len(vl_missing)

24272

In [24]:
data = []
for item, p, pg, lcms1, lcms_f_alt, lcms_m_alt in zip(vl_missing, prods, pg_counts, lcms_props_1, lcms_formula_alt, lcms_mass_alt):
    reac, product_type = item.split(" - ")
    i,m,t = reac.split(" + ")
    data.append({
        "initiator_long": i,
        "monomer_long": m,
        "terminators_long": t,
        "type": product_type,
        "SMILES": p,
        "boc": pg[0],
        "cbz": pg[1],
        "tbu": pg[2],
        "tms": pg[3],
        "comment": None,
        "molecular_formula_1": lcms1[0],
        "lcms_mass_1": lcms1[1],
        "molecular_formula_alt": repr(lcms_f_alt) if len(lcms_f_alt) > 0 else None,
        "lcms_mass_alt": repr(lcms_m_alt) if len(lcms_m_alt) > 0 else None,
        }
    )
data = [list(dic.values()) for dic in data]

In [25]:
data

[['Ph043',
  'Spiro002',
  'TerABT011',
  'A',
  'COC(=O)c1cc(C(=O)NCC2(c3nc4ccc(S(C)(=O)=O)cc4s3)CCN(C(=O)O[C@@H]3CCOC3)CC2)cc(C(=O)OC)c1',
  0,
  0,
  0,
  0,
  None,
  'C30H33N3O10S2',
  659.160736256,
  None,
  None],
 ['Ph043',
  'Mon031',
  'TerABT018',
  'G',
  'COC(=O)c1cc(C(=O)N[C@H](CC(=O)O)Cc2ccc(OCc3ccccc3)cc2)cc(C(=O)OC)c1',
  0,
  0,
  0,
  0,
  None,
  'C28H27NO8',
  505.17366682399995,
  None,
  None],
 ['Ph043',
  'Mon093',
  'TerTH016',
  'G',
  'COC(=O)c1cc(C(=O)N[C@H](CC(=O)O)c2ccc3ncccc3c2)cc(C(=O)OC)c1',
  0,
  0,
  0,
  0,
  None,
  'C23H20N2O7',
  436.12705098,
  None,
  None],
 ['Ph043',
  'Spiro014',
  'TerABT004',
  'D',
  'COC(=O)c1cc(C(=O)OC)cc(-c2nc3ccc(F)cc3s2)c1',
  0,
  0,
  0,
  0,
  None,
  'C17H12FNO4S',
  345.047107084,
  None,
  None],
 ['Ph043',
  'Mon100',
  'TerTH005',
  'E',
  'COc1ccc(-c2nnc(-c3ccc(OC)nc3)s2)cn1',
  0,
  0,
  0,
  0,
  None,
  'C14H12N4O2S',
  300.068096624,
  None,
  None],
 ['Ph043',
  'Mon002',
  'TerTH001',
  'G',
  'COC(=

In [26]:
cur = con.con.cursor()

In [27]:
cur.executemany("""INSERT INTO virtuallibrary (
initiator_long, monomer_long, terminator_long, type, 
SMILES, boc, cbz, tbu, tms, comment, 
molecular_formula_1, lcms_mass_1, molecular_formula_alt, lcms_mass_alt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
)""", data)

<sqlite3.Cursor at 0x2a07a98f0>

In [28]:
cur.rowcount

24272

In [29]:
con.con.commit()