In [30]:
import pickle
import pickle as pkl
import json
import sqlite3 as sql
from pathlib import Path
import pandas as pd
from rdkit.Chem import Draw
from rdkit.Chem.rdMolDescriptors import CalcMolFormula, CalcExactMolWt
from rdkit import Chem
from PIL import Image
import re
import copy
from util_functions import *

In [6]:
# Filesystem layout used by this notebook. Everything hangs off the `data` directory that
# sits one level above the current working directory; resolve() makes the paths absolute.
DATA_DIR = (Path('..') / 'data').resolve()

# Inputs read by the loading cell.
OUTPUT_DIR = DATA_DIR.joinpath('outputs')
INFO_PATH = OUTPUT_DIR.joinpath('library_constituents_dataframe.pkl')
MAPPING_PATH = OUTPUT_DIR.joinpath('compound_mapping.txt')
EXP_PLAN_PATH = OUTPUT_DIR.joinpath('synthesis_plan.json')

# SQLite database file and the directory that receives the rendered structure images.
DB_PATH = DATA_DIR.joinpath('db', '50k_project.db')
STATIC_DIR = DATA_DIR.joinpath('db', 'static')

In [41]:
# Load the pickled DataFrame, the short->long compound-name mapping and the synthesis
# plan, then open the project database.

# NOTE(security): unpickling can execute arbitrary code stored in the file; only load
# pickles that were produced by this project.
with open(INFO_PATH, 'rb') as file:
    df = pd.read_pickle(file)

# The mapping file holds one whitespace-separated "<short> <long>" pair per line.
compound_mapping = {}
with open(MAPPING_PATH, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        fields = line.split()
        if not fields:
            # Blank lines (e.g. an empty line at the end of the file) carry no mapping.
            continue
        if len(fields) < 2:
            raise ValueError(
                f'{MAPPING_PATH}:{line_number}: expected "<short> <long>", got {line!r}'
            )
        # Only the first two fields are used; anything after them is ignored.
        compound_mapping[fields[0]] = fields[1]

with open(EXP_PLAN_PATH, 'r') as file:
    synthesis_plan = json.load(file)

# One connection and one cursor are shared by all following cells.
con = sql.connect(DB_PATH)
cur = con.cursor()

In [8]:
# Example short name for a single building block (presumably kept from trying the insert on one entry).
# NOTE(review): the insert cell below uses `short` as its loop variable, so this value is overwritten there.
short = 'I1'

In [10]:
"""Add entries to DB"""
# For every (short, long) pair: look up the molecule, render it to a PNG in the static
# image directory and insert one row into main.buildingblocks.
#
# NOTE(review): `boc`, `cbz`, `tbu` and `tms` are not assigned before this cell. The only
# assignment in the visible notebook is as loop variables of the later LCMS cell, so this
# cell either raises NameError on a fresh kernel or inserts whatever values were left in
# the kernel -- the same four values for every row. Unless `util_functions` provides them,
# confirm where the per-compound protecting-group counts should come from.
image_dir = STATIC_DIR / 'image'
image_dir.mkdir(parents=True, exist_ok=True)  # saving fails if the target directory is missing

for short, long in compound_mapping.items():
    # Filter the frame once per compound; .item() raises unless exactly one row matches.
    record = df.loc[df['Compound Name'] == long, ['mol', 'Category']]
    mol = record['mol'].item()
    category = record['Category'].item()
    smiles = Chem.MolToSmiles(mol)  # we regenerate SMILES instead of using the value from df to have a canonical representation
    image = Draw.MolToImage(mol)  # TODO change this into the better drawing method
    # save image to static dir and keep imagepath to add to db
    image_path = image_dir / f'{short}.png'
    image.save(image_path)
    # write this to the db (the absolute image path is stored, not the image itself)
    cur.execute(
        'INSERT INTO main.buildingblocks(long, short, SMILES, image, category, boc, cbz, tbu, tms) VALUES(?,?,?,?,?,?,?,?,?);',
        (long, short, smiles, str(image_path.resolve()), category, boc, cbz, tbu, tms),
    )

In [11]:
# Commit the open transaction so the rows inserted by the cell above are written to the database.
con.commit()

In [35]:
"""A dictionary to look up protecting group properties"""
# Keys are the protecting-group column names of the buildingblocks table. Each value is a
# (formula, mass) pair that the LCMS cell subtracts from a compound's formula and exact mass.
pg_dict = dict(
    boc=('C5H8O2', 100.0524),
    cbz=('C8H6O2', 134.0368),
    tbu=('C4H8', 56.0626),
    tms=('C3H8Si', 72.0395),
)

In [48]:
"""For existing entries, use SMILES and number of protecting groups to calculate probable formulae in LCMS"""
# For every building block: store the formula and exact mass of the intact molecule, then
# the formulae/masses expected after losing 1..n copies of each protecting group.
#
# The row id is bound to `bb_id`, not `id`: loop variables leak into the notebook
# namespace, and `id` would shadow the builtin id() in every later cell.
rows = cur.execute('SELECT id, SMILES, boc, cbz, tbu, tms FROM buildingblocks;').fetchall()
for bb_id, smiles, boc, cbz, tbu, tms in rows:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse; name the offending row instead of
        # failing later inside the descriptor functions.
        raise ValueError(f'Building block {bb_id}: cannot parse SMILES {smiles!r}')
    # TODO we are missing the desalting step here. We could get desalted SMILES from sdf files
    lcms_formula_1 = CalcMolFormula(mol)
    lcms_mass_1 = CalcExactMolWt(mol)
    cur.execute('UPDATE buildingblocks SET lcms_formula_1 = ?, lcms_mass_1 = ? WHERE id = ?', (lcms_formula_1, lcms_mass_1, bb_id))

    additional_formulae = []
    additional_masses = []
    for pg_count, pgname in zip([boc, cbz, tbu, tms], ['boc', 'cbz', 'tbu', 'tms']):
        pg_formula, pg_mass = pg_dict[pgname]
        formula = lcms_formula_1
        # A count of 0 (or a NULL column, treated as 0) adds nothing. For a count of n this
        # adds the products of losing 1, 2, ..., n groups; subtracting only once per pass
        # would store the same single-loss entry n times.
        # Assumes string_formula_substraction returns a formula string it can take as input
        # again -- TODO confirm against util_functions.
        for n_removed in range(1, (pg_count or 0) + 1):
            formula = string_formula_substraction(formula, pg_formula)
            additional_formulae.append(formula)
            additional_masses.append(f'{lcms_mass_1 - n_removed * pg_mass:.4f}')
    if additional_formulae:
        # Alternatives are stored as comma-separated strings, masses with four decimals.
        cur.execute('UPDATE buildingblocks SET lcms_formula_alt = ?, lcms_mass_alt = ? WHERE id = ?', (','.join(additional_formulae), ','.join(additional_masses), bb_id))
    # One commit per building block keeps both updates of a row together.
    con.commit()

In [32]:
# Worked example of the formula helpers: parse both formulae, subtract the second from the
# first, and render the result back to a string (shown as the cell output).
ex1, ex2 = 'C10H5O', 'C3H'
parsed_difference = substract_formulae(parse_formula(ex1), parse_formula(ex2))
formula_to_string(parsed_difference)

'C7H4O'

In [None]:
# Commit any transaction that is still open on the connection.
con.commit()

In [37]:
# Close the SQLite connection that was opened when the inputs were loaded.
con.close()