In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw, inchi, rdChemReactions
from IPython.display import display, Markdown
import json
import tqdm
import warnings
from rdkit import RDLogger

# Suppress RDKit warnings
warnings.filterwarnings("ignore", category=UserWarning)
RDLogger.DisableLog('rdApp.*')  # This will suppress all RDKit-related logs

In [2]:
with open('/Users/kate_fieseler/PycharmProjects/syndirella/syndirella/constants/RXN_SMARTS_CONSTANTS.json') as f:
    smarts = json.load(f)

In [23]:
# defs
def check_products(products,
                   product_smiles,
                   reaction_id,
                   try_num):
    #print(products)
    products = [product[0] for product in products]
    if len(products) == 0 and try_num==1:
        print(reaction_id)
        raise Exception('no products')
    any_sanitized = False
    for product in products:
        try:
            Chem.SanitizeMol(product)
            any_sanitized = True
            break  # As soon as one is sanitized successfully, we can stop checking
        except:
            continue  # If an error occurs, move to the next product
    if not any_sanitized:
        print('None of the products could be sanitised.')
        return False
    # check if one matches the product_smiles by inchi key
    if any_sanitized:
        true_inchi = inchi.MolToInchiKey(Chem.MolFromSmiles(product_smiles))
        for product in products:
            if true_inchi != inchi.MolToInchiKey(product):
                print('Incorrect product')
                print(reaction_id)
                print(
                print(true_inchi)
                print(inchi.MolToInchiKey(product))
                print(Chem.MolToSmiles(product))
                return False
        #print('All correct products')
    return True

def correct_transformation(reaction_name,
                           product_smiles,
                           reactant_smiles,
                           reaction_id):
    # print('REACTION NAME:', reaction_name)
    # print('PRODUCT:', product_smiles)
    # print('REACTANTS:', reactant_smiles)
    # print('REACTION_ID:', reaction_id)
    # define reaction
    rxn_smarts = smarts[reaction_name]
    rxn = rdChemReactions.ReactionFromSmarts(rxn_smarts)
    reactants = [Chem.MolFromSmiles(smile) for smile in reactant_smiles]
    assert len(reactants) == len(reactant_smiles)
    reactants = tuple(reactants)
    products = rxn.RunReactants(reactants)
    check = check_products(products, product_smiles, reaction_id, 1, reactant_smiles)
    # if check is False:
    #     reactants = reactants[::-1] # inverse tuple
    #     products = rxn.RunReactants(reactants)
    #     check = check_products(products, product_smiles, reaction_id, 2)
    # print()
    return check

def apply_function(row):
    return correct_transformation(row['reaction_type'], row['product_smiles'], row['reactant_smiles'], row['reaction_id'])

In [16]:
df = pd.read_pickle('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/placements_april8/may2_reactions_use.pkl.gz')
# only look at deprotections
deprotections = ["N-Boc_deprotection", "N-Bn_deprotection", "TBS_alcohol_deprotection", "Benzyl_alcohol_deprotection"]
df = df[df['reaction_type'].isin(deprotections)]
df

Unnamed: 0,reaction_id,reaction_type,product_id,reactant_ids,product_smiles,reactant_smiles,product_mol,reactant_mols
1,2,N-Boc_deprotection,88,{87},Cn1ncc(N)c1NC(=O)C1CCCO1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1CCCO1},<rdkit.Chem.rdchem.Mol object at 0x30493b970>,{<rdkit.Chem.rdchem.Mol object at 0x355618590>}
4,5,N-Boc_deprotection,93,{92},Cn1ncc(N)c1NC(=O)C1(C)CCCO1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1(C)CCCO1},<rdkit.Chem.rdchem.Mol object at 0x30493ba60>,{<rdkit.Chem.rdchem.Mol object at 0x355618720>}
7,8,N-Boc_deprotection,97,{96},Cn1ncc(N)c1NC(=O)C1CCC(=O)O1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1CCC(=O)O1},<rdkit.Chem.rdchem.Mol object at 0x30493bb50>,{<rdkit.Chem.rdchem.Mol object at 0x3556188b0>}
10,11,N-Boc_deprotection,101,{100},CC1CCC(C(=O)Nc2c(N)cnn2C)O1,{CC1CCC(C(=O)Nc2c(NC(=O)OC(C)(C)C)cnn2C)O1},<rdkit.Chem.rdchem.Mol object at 0x30493bc40>,{<rdkit.Chem.rdchem.Mol object at 0x355618a40>}
13,14,N-Boc_deprotection,105,{104},Cn1ncc(N)c1NC(=O)C1OCCC1F,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1OCCC1F},<rdkit.Chem.rdchem.Mol object at 0x30493bd30>,{<rdkit.Chem.rdchem.Mol object at 0x355618bd0>}
...,...,...,...,...,...,...,...,...
107773,107774,Benzyl_alcohol_deprotection,109407,{109409},O=C(COc1cc(O)ccn1)N1CCOC(c2ccc(F)cc2)C1,{O=C(COc1cc(OCc2ccccc2)ccn1)N1CCOC(c2ccc(F)cc2...,<rdkit.Chem.rdchem.Mol object at 0x3588b4c70>,{<rdkit.Chem.rdchem.Mol object at 0x36be88040>}
107930,107931,Benzyl_alcohol_deprotection,109566,{109565},O=C(COc1cc(O)c2ccccc2n1)N1CCOCC1C1CC1,{O=C(COc1cc(OCc2ccccc2)c2ccccc2n1)N1CCOCC1C1CC1},<rdkit.Chem.rdchem.Mol object at 0x3588b7d80>,{<rdkit.Chem.rdchem.Mol object at 0x36beae250>}
108103,108104,Benzyl_alcohol_deprotection,109739,{109649},CC1CN(C(=O)COc2cc(O)c3ccccc3n2)CC(C)(C)O1,{CC1CN(C(=O)COc2cc(OCc3ccccc3)c3ccccc3n2)CC(C)...,<rdkit.Chem.rdchem.Mol object at 0x3588e33d0>,{<rdkit.Chem.rdchem.Mol object at 0x36c200ea0>}
108109,108110,Benzyl_alcohol_deprotection,109746,{109745},O=C(COc1cc(O)c2ccccc2n1)N1CCOC2CCCCC21,{O=C(COc1cc(OCc2ccccc2)c2ccccc2n1)N1CCOC2CCCCC21},<rdkit.Chem.rdchem.Mol object at 0x3588e35b0>,{<rdkit.Chem.rdchem.Mol object at 0x36c201210>}


In [24]:
df['new_smarts_check'] = [apply_function(row) for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0])]

 64%|██████▍   | 1141/1774 [00:00<00:00, 1212.85it/s]

Incorrect product
LHGGVNQTVVGQJQ-UHFFFAOYSA-N
DPTVHVRIENHKII-UHFFFAOYSA-N
CC(=O)Nc1cc(F)ccc1N
Incorrect product
LCCYFDVSWREFOO-UHFFFAOYSA-N
KEBCEKWOAOJDJP-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CF
Incorrect product
NQEWJOKHXIEAKG-UHFFFAOYSA-N
GPGUKPRERLLVMH-UHFFFAOYSA-N
CCC(=O)Nc1cc(F)ccc1N
Incorrect product
BMJMMJXAZYFDQQ-UHFFFAOYSA-N
YXNHYCJPOISHSZ-UHFFFAOYSA-N
C=CC(=O)Nc1cc(F)ccc1N
Incorrect product
BSGIRRQRZAKCTP-UHFFFAOYSA-N
VJFGCQQXKBHDEW-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CCl
Incorrect product
ICKRSEMNXBGKSJ-UHFFFAOYSA-N
CNXKXLQJEOPONG-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CI
Incorrect product
IYTMUKDCYMGZDQ-UHFFFAOYSA-N
FOIBPMNXVLDNPX-UHFFFAOYSA-N
CC(=O)Nc1ccc(Cl)cc1N
Incorrect product
UPWILJJLIPSTTD-UHFFFAOYSA-N
ARFYTLSTQXQJOY-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CO
Incorrect product
QSJDPCLGUSZRJS-UHFFFAOYSA-N
ONCCPEDIICZPDU-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CBr
Incorrect product
UEVGFONSKPOCAR-UHFFFAOYSA-N
VJGUFTRYOCAQKF-UHFFFAOYSA-N
Nc1ccc(F)cc1NC(=O)CS
Incorrect product
XZHDLMQALCXQNH-UHFFFAOYSA-N
KG

100%|██████████| 1774/1774 [00:01<00:00, 1191.19it/s]

Incorrect product
XLTRZUXWMURUPF-UHFFFAOYSA-N
UFHIDTWATWLAKC-UHFFFAOYSA-N
Nc1cc(Cl)ccc1NC(=O)CO
Incorrect product
VROZXPDGMQCLMX-UHFFFAOYSA-N
COMGHSIWPXYKII-UHFFFAOYSA-N
Nc1cc(Cl)ccc1NC(=O)C1CO1
Incorrect product
MYDPTAQFDRHJPS-UHFFFAOYSA-N
GYBHKSPGNUVQMA-UHFFFAOYSA-N
Nc1cc(Cl)ccc1NC(=O)C(O)O
Incorrect product
HXYPUCCJPAEXII-UHFFFAOYSA-N
HGROLCBXIKCSPH-UHFFFAOYSA-N
Cc1ccc(N)c(NC(=O)C(C)O)c1
Incorrect product
QPFFXTAITBJALQ-UHFFFAOYSA-N
TVHZLIUHDUAFLV-UHFFFAOYSA-N
Nc1cc(F)ccc1NC(=O)C1CO1
Incorrect product
DHVIEWZDWPITPZ-UHFFFAOYSA-N
VPNZFUSMVJRGDK-UHFFFAOYSA-N
NOCC(=O)Nc1ccc(Cl)cc1N
Incorrect product
ZAMJZRUHWLFLCH-UHFFFAOYSA-N
YJMBLWUQBLQPCB-UHFFFAOYSA-N
Nc1cc(F)ccc1NC(=O)C(O)O
Incorrect product
CJMNCVNHJPCSTA-UHFFFAOYSA-N
SXRQPBLERISXHQ-UHFFFAOYSA-N
NOCC(=O)Nc1cc(F)ccc1N
Incorrect product
ZQTGPFVLTGUIEV-UHFFFAOYSA-N
RNGMNICPQROFRB-UHFFFAOYSA-N
Cc1ccc(NC(=O)CO)c(N)c1
Incorrect product
JEKGUCMGZHMSAI-UHFFFAOYSA-N
BHIJUWISGIHYDX-UHFFFAOYSA-N
CC(O)C(=O)Nc1ccc(Cl)cc1N
Incorrect product
AZP




In [19]:
df

Unnamed: 0,reaction_id,reaction_type,product_id,reactant_ids,product_smiles,reactant_smiles,product_mol,reactant_mols,new_smarts_check
1,2,N-Boc_deprotection,88,{87},Cn1ncc(N)c1NC(=O)C1CCCO1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1CCCO1},<rdkit.Chem.rdchem.Mol object at 0x30493b970>,{<rdkit.Chem.rdchem.Mol object at 0x355618590>},True
4,5,N-Boc_deprotection,93,{92},Cn1ncc(N)c1NC(=O)C1(C)CCCO1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1(C)CCCO1},<rdkit.Chem.rdchem.Mol object at 0x30493ba60>,{<rdkit.Chem.rdchem.Mol object at 0x355618720>},True
7,8,N-Boc_deprotection,97,{96},Cn1ncc(N)c1NC(=O)C1CCC(=O)O1,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1CCC(=O)O1},<rdkit.Chem.rdchem.Mol object at 0x30493bb50>,{<rdkit.Chem.rdchem.Mol object at 0x3556188b0>},True
10,11,N-Boc_deprotection,101,{100},CC1CCC(C(=O)Nc2c(N)cnn2C)O1,{CC1CCC(C(=O)Nc2c(NC(=O)OC(C)(C)C)cnn2C)O1},<rdkit.Chem.rdchem.Mol object at 0x30493bc40>,{<rdkit.Chem.rdchem.Mol object at 0x355618a40>},True
13,14,N-Boc_deprotection,105,{104},Cn1ncc(N)c1NC(=O)C1OCCC1F,{Cn1ncc(NC(=O)OC(C)(C)C)c1NC(=O)C1OCCC1F},<rdkit.Chem.rdchem.Mol object at 0x30493bd30>,{<rdkit.Chem.rdchem.Mol object at 0x355618bd0>},True
...,...,...,...,...,...,...,...,...,...
107773,107774,Benzyl_alcohol_deprotection,109407,{109409},O=C(COc1cc(O)ccn1)N1CCOC(c2ccc(F)cc2)C1,{O=C(COc1cc(OCc2ccccc2)ccn1)N1CCOC(c2ccc(F)cc2...,<rdkit.Chem.rdchem.Mol object at 0x3588b4c70>,{<rdkit.Chem.rdchem.Mol object at 0x36be88040>},True
107930,107931,Benzyl_alcohol_deprotection,109566,{109565},O=C(COc1cc(O)c2ccccc2n1)N1CCOCC1C1CC1,{O=C(COc1cc(OCc2ccccc2)c2ccccc2n1)N1CCOCC1C1CC1},<rdkit.Chem.rdchem.Mol object at 0x3588b7d80>,{<rdkit.Chem.rdchem.Mol object at 0x36beae250>},True
108103,108104,Benzyl_alcohol_deprotection,109739,{109649},CC1CN(C(=O)COc2cc(O)c3ccccc3n2)CC(C)(C)O1,{CC1CN(C(=O)COc2cc(OCc3ccccc3)c3ccccc3n2)CC(C)...,<rdkit.Chem.rdchem.Mol object at 0x3588e33d0>,{<rdkit.Chem.rdchem.Mol object at 0x36c200ea0>},True
108109,108110,Benzyl_alcohol_deprotection,109746,{109745},O=C(COc1cc(O)c2ccccc2n1)N1CCOC2CCCCC21,{O=C(COc1cc(OCc2ccccc2)c2ccccc2n1)N1CCOC2CCCCC21},<rdkit.Chem.rdchem.Mol object at 0x3588e35b0>,{<rdkit.Chem.rdchem.Mol object at 0x36c201210>},True


In [20]:
bad_df = df[df['new_smarts_check'] == False]
bad_df

Unnamed: 0,reaction_id,reaction_type,product_id,reactant_ids,product_smiles,reactant_smiles,product_mol,reactant_mols,new_smarts_check
65983,65984,N-Boc_deprotection,67031,{67034},CC(=O)Nc1ccc(F)cc1N,{CC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C},<rdkit.Chem.rdchem.Mol object at 0x36d7cd620>,{<rdkit.Chem.rdchem.Mol object at 0x35f089ee0>},False
66027,66028,N-Boc_deprotection,67078,{67080},Nc1cc(F)ccc1NC(=O)CF,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CF},<rdkit.Chem.rdchem.Mol object at 0x36d7ce3e0>,{<rdkit.Chem.rdchem.Mol object at 0x35f08b6f0>},False
66110,66111,N-Boc_deprotection,67158,{67160},CCC(=O)Nc1ccc(F)cc1N,{CCC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C},<rdkit.Chem.rdchem.Mol object at 0x36d7cfdd0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a25c0>},False
66136,66137,N-Boc_deprotection,67183,{67185},C=CC(=O)Nc1ccc(F)cc1N,{C=CC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C},<rdkit.Chem.rdchem.Mol object at 0x35aae0630>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a3420>},False
66151,66152,N-Boc_deprotection,67197,{67199},Nc1cc(F)ccc1NC(=O)CCl,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CCl},<rdkit.Chem.rdchem.Mol object at 0x35aae0ae0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a3c40>},False
66256,66257,N-Boc_deprotection,67301,{67303},Nc1cc(F)ccc1NC(=O)CI,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CI},<rdkit.Chem.rdchem.Mol object at 0x35aae2bb0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0c7a60>},False
66403,66404,N-Boc_deprotection,67448,{67451},CC(=O)Nc1cc(Cl)ccc1N,{CC(=O)Nc1ccc(Cl)cc1NC(=O)OC(C)(C)C},<rdkit.Chem.rdchem.Mol object at 0x35aaf99e0>,{<rdkit.Chem.rdchem.Mol object at 0x35f205300>},False
66481,66482,N-Boc_deprotection,67526,{67528},Nc1cc(F)ccc1NC(=O)CO,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CO},<rdkit.Chem.rdchem.Mol object at 0x35aafb240>,{<rdkit.Chem.rdchem.Mol object at 0x35f230220>},False
66498,66499,N-Boc_deprotection,67542,{67544},Nc1cc(F)ccc1NC(=O)CBr,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CBr},<rdkit.Chem.rdchem.Mol object at 0x35aafb790>,{<rdkit.Chem.rdchem.Mol object at 0x35f230c20>},False
66576,66577,N-Boc_deprotection,67620,{67622},Nc1cc(F)ccc1NC(=O)CS,{CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CS},<rdkit.Chem.rdchem.Mol object at 0x35aa65030>,{<rdkit.Chem.rdchem.Mol object at 0x35f233ba0>},False


In [28]:
bad_df['unpacked_reactants'] = df['reactant_smiles'].apply(lambda x: x.pop())
bad_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_df['unpacked_reactants'] = df['reactant_smiles'].apply(lambda x: x.pop())


Unnamed: 0,reaction_id,reaction_type,product_id,reactant_ids,product_smiles,reactant_smiles,product_mol,reactant_mols,new_smarts_check,unpacked_reactants
65983,65984,N-Boc_deprotection,67031,{67034},CC(=O)Nc1ccc(F)cc1N,{},<rdkit.Chem.rdchem.Mol object at 0x36d7cd620>,{<rdkit.Chem.rdchem.Mol object at 0x35f089ee0>},False,CC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C
66027,66028,N-Boc_deprotection,67078,{67080},Nc1cc(F)ccc1NC(=O)CF,{},<rdkit.Chem.rdchem.Mol object at 0x36d7ce3e0>,{<rdkit.Chem.rdchem.Mol object at 0x35f08b6f0>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CF
66110,66111,N-Boc_deprotection,67158,{67160},CCC(=O)Nc1ccc(F)cc1N,{},<rdkit.Chem.rdchem.Mol object at 0x36d7cfdd0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a25c0>},False,CCC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C
66136,66137,N-Boc_deprotection,67183,{67185},C=CC(=O)Nc1ccc(F)cc1N,{},<rdkit.Chem.rdchem.Mol object at 0x35aae0630>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a3420>},False,C=CC(=O)Nc1cc(F)ccc1NC(=O)OC(C)(C)C
66151,66152,N-Boc_deprotection,67197,{67199},Nc1cc(F)ccc1NC(=O)CCl,{},<rdkit.Chem.rdchem.Mol object at 0x35aae0ae0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0a3c40>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CCl
66256,66257,N-Boc_deprotection,67301,{67303},Nc1cc(F)ccc1NC(=O)CI,{},<rdkit.Chem.rdchem.Mol object at 0x35aae2bb0>,{<rdkit.Chem.rdchem.Mol object at 0x35f0c7a60>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CI
66403,66404,N-Boc_deprotection,67448,{67451},CC(=O)Nc1cc(Cl)ccc1N,{},<rdkit.Chem.rdchem.Mol object at 0x35aaf99e0>,{<rdkit.Chem.rdchem.Mol object at 0x35f205300>},False,CC(=O)Nc1ccc(Cl)cc1NC(=O)OC(C)(C)C
66481,66482,N-Boc_deprotection,67526,{67528},Nc1cc(F)ccc1NC(=O)CO,{},<rdkit.Chem.rdchem.Mol object at 0x35aafb240>,{<rdkit.Chem.rdchem.Mol object at 0x35f230220>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CO
66498,66499,N-Boc_deprotection,67542,{67544},Nc1cc(F)ccc1NC(=O)CBr,{},<rdkit.Chem.rdchem.Mol object at 0x35aafb790>,{<rdkit.Chem.rdchem.Mol object at 0x35f230c20>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CBr
66576,66577,N-Boc_deprotection,67620,{67622},Nc1cc(F)ccc1NC(=O)CS,{},<rdkit.Chem.rdchem.Mol object at 0x35aa65030>,{<rdkit.Chem.rdchem.Mol object at 0x35f233ba0>},False,CC(C)(C)OC(=O)Nc1ccc(F)cc1NC(=O)CS


In [29]:
bad_df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/placements_april8/failed_smarts.csv')