# Correct Ph031

Update every row of the virtuallibrary table whose initiator is Ph031 to record the oxazoline->amide side product (main product + H2O).
Only product types A-D, F, and G are affected.

- Add a comment about the side product
- Append +H2O to molecular_formula_alt
- Append +18.0106 to lcms_mass_alt

In [5]:
import sqlite3
import re
import copy

import pandas as pd

In [2]:
# Open a connection to the project's SQLite database (relative path)
db_path = '../../data/db/50k_project.db'
con = sqlite3.connect(db_path)
con

<sqlite3.Connection at 0x11e8fa990>

In [46]:
# Load every Ph031 row whose product type is one of A, B, C, D, F or G
ph031_query = (
    "SELECT * FROM virtuallibrary WHERE initiator_long = 'Ph031' AND type IN ('A', 'B', 'C', 'D', 'F', 'G')"
)
df = pd.read_sql_query(ph031_query, con)
df

Unnamed: 0,id,initiator_long,monomer_long,terminator_long,long_name,type,SMILES,boc,cbz,tbu,tms,comment,molecular_formula_1,lcms_mass_1,molecular_formula_alt,lcms_mass_alt
0,221483,Ph031,Fused002,TerABT001,Ph031 + Fused002 + TerABT001,A,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3ccc(C4=NCCO4)...,0,0,0,0,,C32H32N4O6S,600.204256,,
1,221484,Ph031,Fused002,TerABT004,Ph031 + Fused002 + TerABT004,A,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3ccc(C4=NCCO4)...,0,0,0,0,,C32H31FN4O6S,618.194834,,
2,221485,Ph031,Fused002,TerABT005,Ph031 + Fused002 + TerABT005,A,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3ccc(C4=NCCO4)...,0,0,0,0,,C33H34N4O6S,614.219906,,
3,221486,Ph031,Fused002,TerABT006,Ph031 + Fused002 + TerABT006,A,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3ccc(C4=NCCO4)...,0,0,0,0,,C34H36N4O6S,628.235556,,
4,221487,Ph031,Fused002,TerABT007,Ph031 + Fused002 + TerABT007,A,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3ccc(C4=NCCO4)...,0,0,0,0,,C32H31BrN4O6S,678.114768,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18199,1644424,Ph031,Spiro017,TerTH023,Ph031 + Spiro017 + TerTH023,G,O=C(O)C[C@@]1(NC(=O)c2ccc(C3=NCCO3)cc2)CCCOC1,0,0,0,0,,C17H20N2O5,332.137222,,
18200,1644425,Ph031,Spiro017,TerTH025,Ph031 + Spiro017 + TerTH025,G,O=C(O)C[C@@]1(NC(=O)c2ccc(C3=NCCO3)cc2)CCCOC1,0,0,0,0,,C17H20N2O5,332.137222,,
18201,1644426,Ph031,Spiro017,TerTH026,Ph031 + Spiro017 + TerTH026,G,O=C(O)C[C@@]1(NC(=O)c2ccc(C3=NCCO3)cc2)CCCOC1,0,0,0,0,,C17H20N2O5,332.137222,,
18202,1644427,Ph031,Spiro017,TerTH027,Ph031 + Spiro017 + TerTH027,G,O=C(O)C[C@@]1(NC(=O)c2ccc(C3=NCCO3)cc2)CCCOC1,0,0,0,0,,C17H20N2O5,332.137222,,


In [47]:
# Set the side-product comment on every selected row.
# NOTE: this assigns the same text to the whole column, replacing whatever
# comment a row held before.
side_product_note = 'oxazoline->amide side product for Ph031'
df['comment'] = side_product_note

In [48]:
# some helper functions
def parse_formula(formula: str) -> dict:  # adapted from formula parsing by Aditya Matam
    """Parse a molecular formula string into an element -> atom count mapping.

    The formula is scanned once, left to right. Element symbols (an uppercase
    letter optionally followed by one lowercase letter) may carry an integer
    count; parenthesised groups may carry an integer multiplier and may be
    nested, e.g. ``"Ca(OH)2"`` or ``"Mg3(Si2O5(OH)2)2"``. Counts of an element
    that occurs several times are summed. Characters that are not part of a
    symbol, count or parenthesis are ignored.

    Args:
        formula: The formula text, e.g. ``"C32H32N4O6S"``.

    Returns:
        A dict mapping each element symbol to its total count. Keys are
        inserted in order of first appearance in the formula.

    Raises:
        ValueError: If the parentheses in ``formula`` are unbalanced.
    """
    # One token per match: element+count, "(", or ")"+multiplier.
    token_pattern = r"([A-Z][a-z]?)(\d*)|(\()|(\))(\d*)"

    # groups[0] holds the top-level counts; every "(" pushes a fresh dict that
    # is scaled and merged into its parent when the matching ")" is reached.
    groups = [{}]
    for token in re.finditer(token_pattern, formula):
        elem, count, opening, closing, factor = token.groups()
        if elem:
            current = groups[-1]
            current[elem] = current.get(elem, 0) + int(count or 1)
        elif opening:
            groups.append({})
        elif closing:
            if len(groups) == 1:
                raise ValueError(f"unbalanced ')' in formula {formula!r}")
            finished = groups.pop()
            parent = groups[-1]
            scale = int(factor or 1)
            for symbol, amount in finished.items():
                parent[symbol] = parent.get(symbol, 0) + amount * scale

    if len(groups) != 1:
        raise ValueError(f"unbalanced '(' in formula {formula!r}")
    return groups[0]

def formula_to_string(formDict):
    """Render an element -> count mapping as a formula string.

    Elements are written in the mapping's iteration order. A count of 1 is
    written as the bare symbol, a count above 1 as symbol followed by the
    count, and any other count (e.g. 0 or negative) is left out.
    """
    return ''.join(
        element if count == 1 else f'{element}{count}'
        for element, count in formDict.items()
        if count >= 1
    )

def add_formulae(formula1, formula2):
    """Return the element-wise sum of two element -> count mappings.

    Neither input is modified. Elements of ``formula1`` keep their order;
    elements that occur only in ``formula2`` are appended after them.

    Args:
        formula1: Mapping of element symbol to count.
        formula2: Mapping of element symbol to count to add on top.

    Returns:
        A new mapping holding the summed counts.
    """
    # Work on a copy so the caller's first formula is left untouched
    # (counts are plain numbers, so a shallow copy is sufficient).
    result = copy.copy(formula1)
    for element, count in formula2.items():
        # .get() lets formula2 introduce elements that formula1 lacks
        result[element] = result.get(element, 0) + count
    return result

def string_formula_addition(formula1, formula2):
    """Add two formula strings and return the sum as a formula string.

    Both strings are parsed with parse_formula, the parsed formulae are
    combined with add_formulae, and the result is rendered with
    formula_to_string.
    """
    first = parse_formula(formula1)
    second = parse_formula(formula2)
    combined = add_formulae(first, second)
    return formula_to_string(combined)

In [50]:
# append +H2O to molecular_formula_alt
def _add_water(row):
    # formula of the side product: main product formula plus H2O
    return string_formula_addition(row['molecular_formula_1'], 'H2O')

df['new_formula'] = df.apply(_add_water, axis=1)
# Join the existing alternative formula (if any) and the new one with a comma;
# falsy entries such as None or '' are skipped so no stray separator appears.
df['molecular_formula_alt_new'] = [
    ','.join(part for part in (existing, hydrated) if part)
    for existing, hydrated in zip(df['molecular_formula_alt'], df['new_formula'])
]
df[['molecular_formula_alt', 'new_formula', 'molecular_formula_alt_new']]

Unnamed: 0,molecular_formula_alt,new_formula,molecular_formula_alt_new
0,,C32H34N4O7S,C32H34N4O7S
1,,C32H33FN4O7S,C32H33FN4O7S
2,,C33H36N4O7S,C33H36N4O7S
3,,C34H38N4O7S,C34H38N4O7S
4,,C32H33BrN4O7S,C32H33BrN4O7S
...,...,...,...
18199,,C17H22N2O6,C17H22N2O6
18200,,C17H22N2O6,C17H22N2O6
18201,,C17H22N2O6,C17H22N2O6
18202,,C17H22N2O6,C17H22N2O6


In [55]:
# append +18.0106 to lcms_mass_alt
water_mass_shift = 18.0106  # mass added for the +H2O side product

def _shifted_mass(row):
    # shifted mass rendered as text with four decimals
    return f"{row['lcms_mass_1'] + water_mass_shift:.4f}"

df['new_mass'] = df.apply(_shifted_mass, axis=1)
# Join the existing alternative mass (if any) and the new one with a comma;
# falsy entries such as None or '' are skipped.
df['lcms_mass_alt_new'] = [
    ','.join(part for part in (existing, shifted) if part)
    for existing, shifted in zip(df['lcms_mass_alt'], df['new_mass'].astype(str))
]
df[['lcms_mass_alt', 'new_mass', 'lcms_mass_alt_new']]

Unnamed: 0,lcms_mass_alt,new_mass,lcms_mass_alt_new
0,,618.2149,618.2149
1,,636.2054,636.2054
2,,632.2305,632.2305
3,,646.2462,646.2462
4,,696.1254,696.1254
...,...,...,...
18199,,350.1478,350.1478
18200,,350.1478,350.1478
18201,,350.1478,350.1478
18202,,350.1478,350.1478


In [56]:
# update database
# One parameter tuple per row: (comment, molecular_formula_alt, lcms_mass_alt, id)
update_sql = "UPDATE virtuallibrary SET comment = ?, molecular_formula_alt = ?, lcms_mass_alt = ? WHERE id = ?"
update_rows = zip(
    df['comment'],
    df['molecular_formula_alt_new'],
    df['lcms_mass_alt_new'],
    df['id'],
)
# Using the connection as a context manager commits the transaction on
# success and rolls it back if an exception is raised.
with con:
    con.executemany(update_sql, update_rows)


In [59]:
# NOTE(review): con.cursor() creates a brand-new cursor that has not executed
# any statement, so its rowcount is -1 and says nothing about the UPDATE above.
# To count the updated rows, keep the cursor returned by con.executemany(...)
# and read its rowcount, or inspect con.total_changes.
con.cursor().rowcount



-1