### Add entries to the building block database


In [None]:
import pathlib
import sqlite3 as sql
import sys

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.rdMolDescriptors import CalcMolFormula, CalcExactMolWt

sys.path.append(str(pathlib.Path().resolve().parents[1]))
from src.definitions import DATA_DIR
from src.definitions import COMPOUND_MAPPING_PATH, DB_PATH, DB_DIR
from src.util.rdkit_util import desalt_building_block
from src.util.sumformula_manipulation import string_formula_substraction
from src.util.protecting_groups import pg_dict

In [None]:
# Directory for static database assets (molecule images are stored below it)
DB_STATIC_DIR = DB_DIR.joinpath('static')
# Pickled dataframe with the library constituents
INFO_PATH = DATA_DIR.joinpath('library_info', 'library_constituents_dataframe.pkl')

In [None]:
# Import data
# NOTE(review): unpickling can execute arbitrary code -- only load INFO_PATH from a trusted source.
with open(INFO_PATH, 'rb') as file:
    df = pd.read_pickle(file)

# Build the compound mapping: the first two whitespace-separated fields of every
# line form one key -> value entry.
compound_mapping = {}
with open(COMPOUND_MAPPING_PATH, 'r') as file:
    for line in file:  # iterate lazily instead of materialising all lines first
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. trailing newlines) instead of raising IndexError
            continue
        compound_mapping[fields[0]] = fields[1]

In [None]:
# Connect to the database at DB_PATH and create the cursor used by the cells below
con = sql.connect(database=DB_PATH)
cur = con.cursor()

In [None]:
# Show the loaded dataframe as the cell output for a quick visual check
df

In [None]:
# Add entries to DB
# Save molecule images to the static directory and add the path to DB
# note: originally we would use compound_mapping to add the short name to the building blocks table
# however, we realized later that we need to change the relation in some cases, so we now use a separate table building_blocks_shorts
image_dir = DB_STATIC_DIR / 'image'
# image.save() does not create missing directories, so make sure the target exists
image_dir.mkdir(parents=True, exist_ok=True)
try:
    for long in compound_mapping.values():
        # select the matching row(s) once; .item() raises ValueError unless exactly one row matches
        row = df.loc[df['Compound Name'] == long]
        mol = row['mol'].item()
        category = row['Category'].item()
        smiles = Chem.MolToSmiles(mol)  # we regenerate SMILES instead of using the value from df to have a canonical representation
        image = Draw.MolToImage(mol)
        # save image to static dir and keep imagepath to add to db
        image_path = image_dir / f'{long}.png'
        image.save(image_path)
        # write this to the db
        cur.execute(
            'INSERT INTO building_blocks(long, SMILES, image, category) VALUES(?,?,?,?);',
            (long, smiles, str(image_path.resolve()), category),
        )
except Exception:
    # do not leave a half-finished batch pending on the connection; a later commit would persist it
    con.rollback()
    raise
else:
    con.commit()

In [None]:
# NOTE: at this point, the boc, cbz, tbu, and tms columns of building_blocks had been set after manual inspection; the next cell reads them.

In [None]:
"""For existing entries, use SMILES and number of protecting groups to calculate probable formulae in LCMS"""
# For every row of building_blocks:
#   - lcms_formula_1 / lcms_mass_1: molecular formula and exact mass of the desalted molecule
#   - lcms_formula_alt / lcms_mass_alt: comma-separated values obtained by subtracting a
#     protecting group from the primary formula / mass (only written if there is at least one)
# pg_dict[name][0] is passed to the formula subtraction and pg_dict[name][1] is subtracted
# from the mass -- presumably (formula, exact mass) of the group; verify in src.util.protecting_groups.
pg_names = ('boc', 'cbz', 'tbu', 'tms')
rows = cur.execute('SELECT id, SMILES, boc, cbz, tbu, tms FROM building_blocks;').fetchall()
try:
    # bb_id instead of id: do not shadow the builtin for the rest of the session
    for bb_id, smiles, *pg_counts in rows:
        mol = Chem.MolFromSmiles(smiles)
        mol_desalted = desalt_building_block(mol)
        lcms_formula_1 = CalcMolFormula(mol_desalted)
        lcms_mass_1 = CalcExactMolWt(mol_desalted)
        cur.execute(
            'UPDATE building_blocks SET lcms_formula_1 = ?, lcms_mass_1 = ? WHERE id = ?',
            (lcms_formula_1, lcms_mass_1, bb_id),
        )
        additional_formulae = []
        additional_masses = []
        for pgname, count in zip(pg_names, pg_counts):
            # A NULL count (column never filled in) is treated like 0 instead of crashing range().
            # NOTE(review): each of the `count` iterations appends the same single-loss value,
            # so a count of 2 yields two identical entries -- confirm whether cumulative loss was intended.
            for _ in range(count or 0):
                additional_formulae.append(string_formula_substraction(lcms_formula_1, pg_dict[pgname][0]))
                additional_masses.append(f'{lcms_mass_1 - pg_dict[pgname][1]:.4f}')
        if additional_formulae:
            cur.execute(
                'UPDATE building_blocks SET lcms_formula_alt = ?, lcms_mass_alt = ? WHERE id = ?',
                (','.join(additional_formulae), ','.join(additional_masses), bb_id),
            )
except Exception:
    # keep the table consistent: either every row is updated or none
    con.rollback()
    raise
else:
    # one commit for the whole batch instead of up to two per row
    con.commit()

In [None]:
# Release the database connection (close() itself does not commit pending changes)
con.close()