In [25]:
import pickle
import pickle as pkl
import json
import sqlite3 as sql
from pathlib import Path
import pandas as pd
from rdkit.Chem import Draw
from rdkit.Chem.rdMolDescriptors import CalcMolFormula, CalcExactMolWt
from rdkit.Chem.rdmolfiles import ForwardSDMolSupplier
from rdkit import Chem
from PIL import Image
import re
import copy
import gzip


In [26]:
# Filesystem layout. Everything hangs off DATA_DIR, which is resolved from '../data'
# relative to the directory the kernel was started in.
DATA_DIR = (Path('..') / 'data').resolve()

# Files found under <data>/outputs
OUTPUT_DIR = DATA_DIR.joinpath('outputs')
INFO_PATH = OUTPUT_DIR.joinpath('library_constituents_dataframe.pkl')
MAPPING_PATH = OUTPUT_DIR.joinpath('compound_mapping.txt')
EXP_PLAN_PATH = OUTPUT_DIR.joinpath('synthesis_plan.json')

# Database file and the static directory next to it
DB_PATH = DATA_DIR.joinpath('db', '50k_project.db')
STATIC_DIR = DATA_DIR.joinpath('db', 'static')

# Directory holding the gzipped product SDF files
LIB_DIR = DATA_DIR.joinpath('library_static')

In [27]:
# Open the project SQLite database and create the cursor that the cells below use for all queries.
con = sql.connect(DB_PATH)
cur = con.cursor()

In [28]:
# Collect the long compound names: the second whitespace-separated field of every mapping line.
# Whitespace-only lines are skipped so that a stray blank line does not raise IndexError.
# NOTE(review): split() breaks on any whitespace, so a long name that itself contains a space
# would be cut off after its first word — confirm against the mapping file format.
with open(MAPPING_PATH, 'r') as file:
    compound_long_names = [line.split()[1] for line in file if line.strip()]

In [29]:
# Let's grab our building blocks from the db, grouped by category code.
def fetch_building_block_ids(category):
    """Return the ids of all rows in `buildingblocks` whose category equals `category`.

    The category is bound as a one-element tuple. Passing the bare string only works while
    the code is a single character, because sqlite3 treats a str as a sequence of characters.
    """
    rows = cur.execute('SELECT id FROM buildingblocks WHERE category = ?;', (category,)).fetchall()
    return [row[0] for row in rows]


initiator_ids = fetch_building_block_ids('I')
monomer_ids = fetch_building_block_ids('M')
terminator_ids = fetch_building_block_ids('T')
print(len(initiator_ids), len(monomer_ids), len(terminator_ids))

78 74 41


In [31]:
# Fill the virtual library: add every product type for each initiator/monomer/terminator combination.
# Building block tuple layout: short, long, boc, cbz, tbu, tms
def query_building_block(id):
    """Look up a single building block by its database id.

    Returns the row as (short, long, boc, cbz, tbu, tms), in that order.
    Raises ValueError when no row with this id exists.
    """
    assert type(id) in (str, int)
    query = 'SELECT short, long, boc, cbz, tbu, tms FROM buildingblocks WHERE id = ?;'
    row = cur.execute(query, [id]).fetchone()
    if row is not None:
        return row
    raise ValueError(f'No entry in database for id {id}')

def count_protecting_groups(initiator, monomer, terminator, product_type):
    """Sum the protecting-group counts of the building blocks that end up in a product.

    Each building block is a sequence whose indices 2, 3, 4 and 5 hold its number of
    boc, cbz, tbu and tms groups respectively. Which building blocks contribute depends
    on the product type:

    - 'A', 'B', 'C': initiator + monomer + terminator
    - 'D':           initiator + terminator
    - 'E':           terminator counted twice
    - 'F', 'G':      initiator + monomer
    - 'H':           monomer + terminator

    Returns the tuple (boc, cbz, tbu, tms).
    Raises ValueError for any other product type. This includes the empty string, which
    the previous substring test (`product_type in 'E'`) silently treated as type 'E'.
    """
    if product_type in ('A', 'B', 'C'):
        contributors = (initiator, monomer, terminator)
    elif product_type == 'D':
        contributors = (initiator, terminator)
    elif product_type == 'E':
        contributors = (terminator, terminator)
    elif product_type in ('F', 'G'):
        contributors = (initiator, monomer)
    elif product_type == 'H':
        contributors = (monomer, terminator)
    else:
        raise ValueError(f'Invalid product type {product_type}')

    # we will assume the indices as 2: boc, 3: cbz, 4: tbu, 5: tms
    boc, cbz, tbu, tms = (
        sum(block[index] for block in contributors) for index in range(2, 6)
    )
    return boc, cbz, tbu, tms

# Insert one `virtuallibrary` row per (product type, initiator, monomer, terminator) combination.
# NOTE(review): this cell only INSERTs, so running it a second time adds every combination
# again unless the table enforces uniqueness — confirm before re-executing.
insert_sql = 'INSERT INTO virtuallibrary (initiator, monomer, terminator, type, boc, cbz, tbu, tms, initiator_long, monomer_long, terminator_long) VALUES (?,?,?,?,?,?,?,?,?,?,?)'

# Look every building block up once, instead of three SELECTs per combination in the innermost loop.
# A missing id therefore raises ValueError before anything is inserted.
initiators = [query_building_block(i) for i in initiator_ids]
monomers = [query_building_block(m) for m in monomer_ids]
terminators = [query_building_block(t) for t in terminator_ids]

for product_type in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']:
    for initiator in initiators:
        for monomer in monomers:
            rows = []
            for terminator in terminators:
                boc, cbz, tbu, tms = count_protecting_groups(initiator, monomer, terminator, product_type)
                # index 0 is the short name, index 1 the long name of a building block
                rows.append((initiator[0], monomer[0], terminator[0], product_type, boc, cbz, tbu, tms, initiator[1], monomer[1], terminator[1]))
            # Let's add those entries to DB in one batch
            cur.executemany(insert_sql, rows)
            # commit after going through all terminators
            con.commit()

In [None]:
"""
still TODO
- get product mol -> image and smiles and molecular formula_1
- from pg numbers and pg dict, get the molecular formulae of deprotection products
"""

In [None]:
# Read the gzipped product SDF and copy SMILES, molecular formula and exact mass of every
# molecule into the matching `virtuallibrary` row (matched on long name and product type).
product_type = 'A'
# the file name is derived from product_type so the two cannot drift apart
with gzip.open(LIB_DIR / f'product_{product_type}.sdf.gz') as file:
    for mol in ForwardSDMolSupplier(file):
        if mol is None:
            # the supplier yielded no molecule for this record; nothing to annotate
            continue
        name = mol.GetProp('_Name')
        # now we need to match the mol with a database entry
        matches = cur.execute('SELECT id FROM virtuallibrary WHERE long_name = ? AND type = ?;', [name, product_type]).fetchall()
        if len(matches) == 1:
            # since we now have the correct id, we can add the new info obtained from the sdf file
            smiles = Chem.MolToSmiles(mol)
            molecular_formula_1 = CalcMolFormula(mol)
            mass_1 = CalcExactMolWt(mol)
            cur.execute('UPDATE virtuallibrary SET SMILES = ?, molecular_formula_1 = ?, lcms_mass_1 = ?  WHERE id = ?', (smiles, molecular_formula_1, mass_1, matches[0][0]))
            continue

        # we did not get exactly one id back: work out why
        if matches:
            reason = f'Duplicate entry for long name {name} and type {product_type}. Offending entries: {matches}'
        else:
            reason = f'No entry in DB for long name {name}'
        # only raise the error if the building blocks were used in 50k project;
        # products built from other building blocks are skipped on purpose
        building_blocks = name.split(' + ')
        if all(block in compound_long_names for block in building_blocks[:3]):
            # chain the specific reason so that it shows up in the traceback
            raise ValueError(f'Failed on product {name}') from ValueError(reason)
con.commit()

In [24]:
# Roll back any changes made on this connection since the last commit.
con.rollback()

In [None]:
"""Add entries to DB"""
for short in compound_mapping.keys():
    long = compound_mapping[short]
    mol = df.loc[df['Compound Name'] == long, 'mol'].item()
    smiles = Chem.MolToSmiles(mol)  # we regenerate SMILES instead of using the value from df to have a canonical representation
    image = Draw.MolToImage(mol)  # TODO change this into the better drawing method
    category = df.loc[df['Compound Name'] == long,'Category'].item()
    # save image to static dir and keep imagepath to add to db
    image_path = STATIC_DIR / 'image' / f'{short}.png'
    image.save(image_path)
    # write this to the db
    cur.execute('INSERT INTO main.buildingblocks(long, short, SMILES, image, category, boc, cbz, tbu, tms) VALUES(?,?,?,?,?,?,?,?,?);', (long, short, smiles, str(image_path.resolve()), category, boc, cbz, tbu, tms))

In [11]:
# Commit the pending changes on the connection to the database file.
con.commit()

In [35]:
"""A dictionary to look up protecting group properties"""
pg_dict = {
    'boc': ('C5H8O2', 100.0524),
    'cbz': ('C8H6O2', 134.0368),
    'tbu': ('C4H8', 56.0626),
    'tms': ('C3H8Si', 72.0395),
}

In [33]:
def parse_formula(formula: str) -> dict:
    """Parse a molecular formula string into a dict of element symbol -> atom count.

    Supports repeated elements ('CH3CH2OH'), parenthesised groups with an optional
    multiplier ('(CH3)2'), several groups in one formula and nested groups
    ('Ca(C(CH3)3)2'). Characters that are neither an element symbol, a count nor a
    parenthesis (for example a charge sign) are ignored.

    Compared with the earlier two-pass regex version (formula parsing originally by
    Aditya Matam) this fixes three problems with parenthesised input: counts from
    several groups are now added up instead of overwriting each other, elements in the
    middle of a group are no longer counted a second time, and nesting is handled.
    Formulas without parentheses parse exactly as before.

    Args:
        formula: the formula string, e.g. 'C10H5O'.

    Returns:
        dict mapping each element symbol to its total count, in order of first appearance.
    """
    def merge(target: dict, group: dict, factor: int) -> None:
        # add `factor` copies of `group` into `target`
        for element, count in group.items():
            target[element] = target.get(element, 0) + count * factor

    # stack[0] collects the whole formula; every '(' pushes a fresh dict for its group
    stack = [{}]
    token_pattern = r"([A-Z][a-z]?)(\d*)|(\()|(\))(\d*)"
    for match in re.finditer(token_pattern, formula):
        element, element_mul, opening, closing, group_mul = match.groups()
        if element:
            current = stack[-1]
            current[element] = current.get(element, 0) + int(element_mul or 1)
        elif opening:
            stack.append({})
        elif closing and len(stack) > 1:
            # close the innermost group and fold it into its parent
            group = stack.pop()
            merge(stack[-1], group, int(group_mul or 1))
        # a ')' without a matching '(' is ignored

    # groups that were opened but never closed are folded in with multiplier 1
    while len(stack) > 1:
        group = stack.pop()
        merge(stack[-1], group, 1)

    return stack[0]

def formula_to_string(formDict):
    """Render an element -> count dict as a formula string.

    Elements appear in the dict's iteration order. A count of exactly 1 is written as
    the bare symbol, larger counts as symbol followed by the count, and elements whose
    count is below 1 (zero or negative) are left out.
    """
    return ''.join(
        element if count == 1 else f'{element}{count}'
        for element, count in formDict.items()
        if count >= 1
    )

def substract_formulae(minuend, substrahend):
    """Subtract one element-count dict from another, element by element.

    Args:
        minuend: dict of element -> count to subtract from; it is not modified.
        substrahend: dict of element -> count to take away.

    Returns:
        A new dict with the minuend's elements and the reduced counts. Counts may reach
        zero or go negative; no check is made here.

    Raises:
        KeyError: if the substrahend contains an element that the minuend lacks. The
            message names the element and both formulas.
    """
    # the counts are plain numbers, so a shallow copy is enough to leave the minuend untouched
    result = dict(minuend)
    for element, count in substrahend.items():
        if element not in result:
            raise KeyError(
                f'Element {element} of substrahend {substrahend} is not present in minuend {minuend}'
            )
        result[element] -= count
    return result

def string_formula_substraction(minuend, substrahend):
    """Subtract formula string `substrahend` from formula string `minuend`.

    Both strings are parsed with parse_formula, subtracted with substract_formulae and
    the difference is rendered back to a string with formula_to_string.
    """
    minuend_counts = parse_formula(minuend)
    substrahend_counts = parse_formula(substrahend)
    difference = substract_formulae(minuend_counts, substrahend_counts)
    return formula_to_string(difference)

In [48]:
"""For existing entries, use SMILES and number of protecting groups to calculate probable formulae in LCMS"""
for id, smiles, boc, cbz, tbu, tms in cur.execute('SELECT id, SMILES, boc, cbz, tbu, tms FROM buildingblocks;').fetchall():
    mol = Chem.MolFromSmiles(smiles)
    lcms_formula_1 = CalcMolFormula(mol)
    lcms_mass_1 = CalcExactMolWt(mol)
    cur.execute('UPDATE buildingblocks SET lcms_formula_1 = ?, lcms_mass_1 = ? WHERE id = ?', (lcms_formula_1, lcms_mass_1, id))
    con.commit()
    additional_formulae = []
    additional_masses = []
    for pg, pgname in zip([boc, cbz, tbu, tms], ['boc', 'cbz', 'tbu', 'tms']):
        for i in range(pg):
            # if boc == 0, this will not evaluate
            additional_formulae.append(string_formula_substraction(lcms_formula_1, pg_dict[pgname][0]))
            additional_masses.append(lcms_mass_1 - pg_dict[pgname][1])
    additional_masses = [f'{i:.4f}' for i in additional_masses]
    if len(additional_formulae) > 0:
        cur.execute('UPDATE buildingblocks SET lcms_formula_alt = ?, lcms_mass_alt = ? WHERE id = ?', (','.join(additional_formulae), ','.join(additional_masses), id))
        con.commit()

In [32]:
# Quick check of the formula helpers: C10H5O minus C3H leaves C7H4O
ex1 = 'C10H5O'
ex2 = 'C3H'
string_formula_substraction(ex1, ex2)

'C7H4O'

In [None]:
# Commit any changes still pending on the connection.
con.commit()

In [37]:
# Close the database connection; `con` and `cur` cannot be used after this cell.
con.close()