# Load the dataset

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import re



In [2]:
# Load data from MetaCyc dataset
dataSetB = pd.read_pickle('RHEA_atom_mapped_timepoint_3.pkl')

In [3]:
dataSetB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18064 entries, 0 to 18063
Data columns (total 3 columns):
Atom Mapped Reaction    18064 non-null object
Input SMILES            18064 non-null object
RHEA ID                 18064 non-null object
dtypes: object(3)
memory usage: 423.5+ KB


In [4]:
dataSetB.head()

Unnamed: 0,Atom Mapped Reaction,Input SMILES,RHEA ID
0,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,20342
1,[[H+].[H+].[O:1]=[C:2]([O-:3])[c:4]1[cH:5][c:6...,O=C1C=C(C(=O)[O-])NC(=O)N1.[H+].[H+]>>O=C1C=C(...,29348
2,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,24679
3,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,24762
4,[[CH3:1][c:2]1[n:3][cH:4][c:5]([CH2:6][OH:7])[...,[H]C(=O)C1=C(CO)C=NC(C)=C1O>>[H]C(=O)C1=C(CO)C...,28464


# Dataset Processing
Enumerate the products.

In [5]:
# Create new df from old (minor processing)

#Failed reaction smiles capturing
failed_ids = []
failed_rxn = []
failed_rxn_NumRxnSmiles = []
error = 0

#successful reaction smiles capturing
ids = []
rxn_smiles = []
prod_smiles = []
atom_mapped_rxn_input = []
not_atom_mapped_rxn_input = []

#an important challenge here is that the atom mapper can sometimes provide multiple solutions.
#this approach (below) is inclusive. It will consider all possible solutions!

#loop through the dataset
for row in dataSetB.itertuples():
    
    #start with an empty dictionary to store not mapped products (key) and reaction SMILES (value)
    storage_dict = {}
    
    #loop through the list of reactions-
    for rxn in row[1]:
        
        all_reactants, all_products = rxn.split('>>')

        products = [Chem.MolFromSmiles(smi) for smi in all_products.split('.')]

        # Multiple products = enumerate
        for prod in products:
            try:
                # Make sure all have atom mapping
                if not all([a.HasProp('molAtomMapNumber') for a in prod.GetAtoms()]):
                    continue
                prod_smi = Chem.MolToSmiles(prod, True)

                # Re-parse reactants for each product so we can clear maps
                reactants = [Chem.MolFromSmiles(smi) for smi in all_reactants.split('.')]

                # Get rid of reactants when they don't contribute to this prod
                prod_maps = set(re.findall('\:([[0-9]+)\]', prod_smi))
                reactants_smi_list = []
                for mol in reactants:
                    used = False
                    for a in mol.GetAtoms():
                        if a.HasProp('molAtomMapNumber'):
                            if a.GetProp('molAtomMapNumber') in prod_maps:
                                used = True 
                            else:
                                a.ClearProp('molAtomMapNumber')
                    if used:
                        reactants_smi_list.append(Chem.MolToSmiles(mol, True))

                reactants_smi = '.'.join(reactants_smi_list)

                # Was this just a spectator/ transport reaction? Examples inlcude molecule X (in)>> molecule X (out) 
                if reactants_smi == prod_smi:
                    continue
                
                # Remove reactions that do not have an reactants, weird exception
                if reactants_smi == '':
                    continue
                
                [a.ClearProp('molAtomMapNumber') for a in prod.GetAtoms()]
                prod_smiles_temp = Chem.MolToSmiles(prod, True)
                rxn_smiles_temp = '{}>>{}'.format(reactants_smi, prod_smi)
                
                #Append to on going dictionary
                #Check if the product is already available in the dictionary
                if prod_smiles_temp in storage_dict:
                    storage_dict [prod_smiles_temp]. append (rxn_smiles_temp)
                
                #If it is not already available in the dictionary, then create the new entry 
                else:
                    storage_dict [prod_smiles_temp]= [rxn_smiles_temp]
            
            #keep track of the errors    
            except:
                error = error + 1
                failed_ids. append (row[3])
                failed_rxn. append (rxn)
                failed_rxn_NumRxnSmiles (len (row[1]))
                continue
    
    
    for product in storage_dict:
        prod_smiles.append (product)
        rxn_smiles.append (list (set (storage_dict[product])))
        ids.append (row[3])
        atom_mapped_rxn_input.append (row[1])
        not_atom_mapped_rxn_input.append (row[2])  



















































































































































































































































































































































In [14]:
dataframe = pd.DataFrame ({
                               'Atom Mapped Reaction Input': atom_mapped_rxn_input,
                               'Not Atom Mapped Input': not_atom_mapped_rxn_input,
                          'Product SMILES': prod_smiles,'RHEA ID': ids,'Reaction SMILES enumerated': rxn_smiles})

Get information about the dataframe.

In [15]:
dataframe.head()

Unnamed: 0,Atom Mapped Reaction Input,Not Atom Mapped Input,Product SMILES,RHEA ID,Reaction SMILES enumerated
0,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,20342,[OC[C@H]1O[C@H]([O:28][P:25]([O:24][P:21]([O:2...
1,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,20342,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
2,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,C[C@@H]1Nc2nc(N)[nH]c(=O)c2N2CN(c3ccc(C[C@H](O...,24679,[O=[CH2:1].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:7][c...
3,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,O,24679,[C=[O:2]>>[OH2:2]]
4,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,24762,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...


In [16]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35504 entries, 0 to 35503
Data columns (total 5 columns):
Atom Mapped Reaction Input    35504 non-null object
Not Atom Mapped Input         35504 non-null object
Product SMILES                35504 non-null object
RHEA ID                       35504 non-null object
Reaction SMILES enumerated    35504 non-null object
dtypes: object(5)
memory usage: 1.4+ MB


In [17]:
error

0

In [18]:
from collections import Counter
prod_smi_counter = Counter (dataframe['Product SMILES'])
print (prod_smi_counter.most_common (25))

[('O', 3613), ('O=O', 1684), ('NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1', 909), ('NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)c1', 908), ('NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1', 847), ('NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1', 845), ('CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-])[C@@H](O)C(=O)NCCC(=O)NCCS', 727), ('Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])[C@@H](O)[C@H]1O', 637), ('O=P([O-])([O-])OP(=O)([O-])O', 569), ('O=P([O-])([O-])O', 552), ('O=C=O', 547), ('Cc1cc2nc3c(=O)[n-]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)COP(=O)([O-])[O-])c2cc1C', 534), ('

In [19]:
dataframe ['prod_smiles_pop'] = [prod_smi_counter[smi] for smi in dataframe['Product SMILES']]

In [20]:
dataframe.head()

Unnamed: 0,Atom Mapped Reaction Input,Not Atom Mapped Input,Product SMILES,RHEA ID,Reaction SMILES enumerated,prod_smiles_pop
0,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,20342,[OC[C@H]1O[C@H]([O:28][P:25]([O:24][P:21]([O:2...,266
1,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,20342,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,1
2,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,C[C@@H]1Nc2nc(N)[nH]c(=O)c2N2CN(c3ccc(C[C@H](O...,24679,[O=[CH2:1].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:7][c...,7
3,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,O,24679,[C=[O:2]>>[OH2:2]],3613
4,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,24762,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,845


Keep product SMILES that are longer than three characters in length and that occur less than fifty times.

In [21]:
dataframe ['keep'] = [x[6]<50 and len(x[3]) >= 3 for x in dataframe.itertuples()]

In [22]:
dataframe.loc[dataframe ['keep']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17309 entries, 1 to 35499
Data columns (total 7 columns):
Atom Mapped Reaction Input    17309 non-null object
Not Atom Mapped Input         17309 non-null object
Product SMILES                17309 non-null object
RHEA ID                       17309 non-null object
Reaction SMILES enumerated    17309 non-null object
prod_smiles_pop               17309 non-null int64
keep                          17309 non-null bool
dtypes: bool(1), int64(1), object(5)
memory usage: 963.5+ KB


In [23]:
dataframe_2 = dataframe.rename (columns = {'RHEA ID': 'id', 'Product SMILES': 'prod_smiles', 
                             'Reaction SMILES enumerated': 'rxn_smiles'})

In [24]:
dataframe_2.head()

Unnamed: 0,Atom Mapped Reaction Input,Not Atom Mapped Input,prod_smiles,id,rxn_smiles,prod_smiles_pop,keep
0,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,20342,[OC[C@H]1O[C@H]([O:28][P:25]([O:24][P:21]([O:2...,266,False
1,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,20342,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,1,True
2,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,C[C@@H]1Nc2nc(N)[nH]c(=O)c2N2CN(c3ccc(C[C@H](O...,24679,[O=[CH2:1].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:7][c...,7,True
3,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,O,24679,[C=[O:2]>>[OH2:2]],3613,False
4,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,24762,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,845,False


# Clean up the dataframe.
Create a new dataframe with only keep=True. Drop unnecessary columns like 'prod_smiles_pop' and 'keep'. Save a copy of the dataframe!

In [25]:
# process the dataframe and store as dataframe3
dataframe_3 = dataframe_2.loc[dataframe_2['keep']]

In [26]:
dataframe_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17309 entries, 1 to 35499
Data columns (total 7 columns):
Atom Mapped Reaction Input    17309 non-null object
Not Atom Mapped Input         17309 non-null object
prod_smiles                   17309 non-null object
id                            17309 non-null object
rxn_smiles                    17309 non-null object
prod_smiles_pop               17309 non-null int64
keep                          17309 non-null bool
dtypes: bool(1), int64(1), object(5)
memory usage: 963.5+ KB


In [27]:
dataframe_3 = dataframe_3.drop (columns = ['prod_smiles_pop', 'keep'])

In [28]:
dataframe_3

Unnamed: 0,Atom Mapped Reaction Input,Not Atom Mapped Input,prod_smiles,id,rxn_smiles
1,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,20342,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
2,[[CH2:1]=[O:2].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:...,[H]C([H])=O.[H][C@@]1([C@@H](C)NC2=CC=C(C[C@H]...,C[C@@H]1Nc2nc(N)[nH]c(=O)c2N2CN(c3ccc(C[C@H](O...,24679,[O=[CH2:1].[CH3:3][C@@H:4]1[NH:5][c:6]2[n:7][c...
5,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],24762,[[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:...
6,[[O:11]=[O:12].[O:1]=[C:2]1[O:3][C@H:4]([CH2:5...,O=C1O[C@H](CO)[C@@H](O)[C@@H]1O.O=O>>O=C1O[C@H...,O=C1O[C@H](CO)C([O-])=C1O,23757,[[O:1]=[C:2]1[O:3][C@H:4]([CH2:5][OH:6])[C@@H:...
9,[[H+].[H+].[O:1]=[C:2]([O-:3])/[CH:4]=[CH:5]\[...,O=C([O-])/C=C\C=C/C(=O)[O-].[H+].[H+]>>O=O.OC1...,Oc1ccccc1O,23854,[O=[C:2]([O-:3])/[CH:4]=[CH:5]\[CH:6]=[CH:7]/[...
...,...,...,...,...,...
35484,[[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,O=C([O-])CCCCCCC(=O)[O-],49602,[CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H...
35489,[[O:35]=[C:36]([O-:37])[CH2:38][CH2:39][C:40](...,O=C([O-])CCC(=O)C(=O)[O-].O=O.OCC1=C[C@H]([NH2...,OCC1=C[C@H](N[C@@H]2[C@H](O)[C@@H](O)[C@H](O[C...,48753,[O=C([O-])CCC(C(=O)[O-])=[O:41].[OH:1][CH2:2][...
35492,[[NH2:1][c:2]1[n:3][cH:4][n:5][c:6]2[c:7]1[n:8...,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(=O)([O-])O...,O=C([O-])c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])...,36164,[[O:33]=[C:34]([O-:35])[c:36]1[cH:37][cH:38][c...
35496,[[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,40292,[[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7...


In [29]:
#dataframe_3.to_pickle ('RHEA_atom_mapped_timepoint_5.pkl')

# Remove reactions for which you can extract and apply templates

In [1]:
#import all relevant modules!
import sys 
import os
import re
import copy
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit.Chem.rdchem import ChiralType, BondType, BondDir
from rdchiral.utils import vprint, PLEVEL, atoms_are_different
from rdchiral.initialization import rdchiralReaction, rdchiralReactants
from rdchiral.chiral import template_atom_could_have_been_tetra, copy_chirality,\
    atom_chirality_matches
from rdchiral.clean import canonicalize_outcome_smiles, combine_enantiomers_into_racemic
from rdchiral.bonds import BondDirOpposite, restore_bond_stereo_to_sp2_atom
from template_extractor_enz_v4 import extract_from_reaction
from main_v3 import rdchiralRun



In [2]:
#this is not necessary, but the analysis below was done on a different day!
import pandas as pd
dataset = pd.read_pickle('RHEA_atom_mapped_timepoint_5.pkl')

dataset = dataset. reset_index (drop = True)

In [3]:
# counts to break the for loop
count = 0

# capture the input from the dataframe
_id = []
prod_smiles = []
rxn_smiles = []
in_atom_map = []
in_not_atom_map = []

# capture the failed data from the dataframe
fail_id = []
fail_prod_smiles = []
fail_rxn_smiles = []
fail_in_atom_map = []
fail_in_not_atom_map = []

# capture the output of the analysis
template = []
proposed_reactants = []
match = []

#debug
debug = False

for row in dataset.itertuples():
    
    #0- Setup the debug
    count += 1
    
    if debug == True and count == 10:
        break
    
    for single_reaction in row[5]:
        
        check = False
    
        try:
            #1-EXTRACT THE TEMPLATE

            #1A-Convert the reaction into a dictionary
            reaction = {}
            rct, rea, prd = row[5][0].split(' ')[0].split('>')
            reaction['reactants'] = rct
            reaction['products'] = prd
            reaction['_id'] = row[4]

            #1B- Extract the template using rdchiral
            template_extract = extract_from_reaction (reaction)
            rxn_smart = template_extract ['reaction_smarts']

            #2-APPLY THE TEMPLATE

            #2A- Pre-initialize
            rxn = rdchiralReaction(rxn_smart)
            reactants = rdchiralReactants(row[3])

            #2B - Run the reaction
            outcomes = rdchiralRun(rxn, reactants)

            #3- CHECK FOR A MATCH

            #3A- Remove atom mapping from reactants and store the set as reactants
            reactants_mol = Chem.MolFromSmiles(rct)
            [a.ClearProp('molAtomMapNumber') for a in reactants_mol.GetAtoms()]
            react_smiles = Chem.MolToSmiles(reactants_mol, True)
            react_smiles_list = [react_smiles]

            #3B- Check if the element in dataset (true) is present as one of the solutions of the predicted outcomes
            check = all (item in outcomes for item in react_smiles_list)

            if check == True:
                break
                
        except:
            continue

    if check == True:
        _id.append(row[4])
        prod_smiles.append(row[3])
        rxn_smiles.append(row[5])
        in_atom_map.append(row[1])
        in_not_atom_map.append(row[2])

    if check == False:
        fail_id.append (row[4])
        fail_prod_smiles.append (row[3])
        fail_rxn_smiles.append (row[5])
        fail_in_atom_map.append (row[1])
        fail_in_not_atom_map.append (row[2])

In [4]:
result_df = pd.DataFrame({'id': _id,
                         'prod_smiles': prod_smiles,
                         'rxn_smiles': rxn_smiles,
                          'atom mapped smiles-input': in_atom_map,
                          'not atom mapped smiles-input': in_not_atom_map})

In [5]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15691 entries, 0 to 15690
Data columns (total 5 columns):
id                              15691 non-null object
prod_smiles                     15691 non-null object
rxn_smiles                      15691 non-null object
atom mapped smiles-input        15691 non-null object
not atom mapped smiles-input    15691 non-null object
dtypes: object(5)
memory usage: 613.1+ KB


In [None]:
#result_df.to_pickle('RHEA_atom_mapped_timepoint_6_success.pkl')

# Remove products corresponding to single atoms

In [8]:
#this is not necessary, it was just that this last part of the processing was done on a new day!!
data = pd.read_pickle('RHEA_atom_mapped_timepoint_6_success.pkl')
data.head(2)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
1,24762,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],[[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:...,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...


In [9]:
_id = []
prod_smiles = []
rxn_smiles = []
atom_map_smiles = []
not_atom_map_smiles = []

total_rxn = 0
single_atom = 0
dataset_length = 0

for row in data.itertuples():
    total_rxn += 1
    
    if Chem.MolFromSmiles(row[2]).GetNumAtoms()==1:
        #print("Not using product",row[2],"from reaction",row[3])
        single_atom += 1
        continue
    
    dataset_length += 1
    
    _id.append (row[1])
    prod_smiles.append (row[2])
    rxn_smiles.append (row[3])
    atom_map_smiles.append (row[4])
    not_atom_map_smiles.append (row[5])

In [10]:
total_rxn

15690

In [11]:
single_atom

142

In [12]:
dataset_length

15548

In [13]:
data.head(2)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
1,24762,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],[[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:...,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...


In [14]:
data_v2 = pd.DataFrame({'id': _id, 'prod_smiles': prod_smiles, 'rxn_smiles': rxn_smiles,
                       'atom mapped smiles-input': atom_map_smiles, 'not atom mapped smiles-input': not_atom_map_smiles})

In [15]:
data_v2.head(2)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,[O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
1,24762,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],[[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:...,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...


In [16]:
data_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15548 entries, 0 to 15547
Data columns (total 5 columns):
id                              15548 non-null object
prod_smiles                     15548 non-null object
rxn_smiles                      15548 non-null object
atom mapped smiles-input        15548 non-null object
not atom mapped smiles-input    15548 non-null object
dtypes: object(5)
memory usage: 607.5+ KB


In [None]:
#data_v2.to_pickle('RHEA_atom_mapped_timepoint_7_success.pkl')