# Load the dataset

In [1]:
import pandas as pd
#load the atom mapped dataset
data_orig = pd.read_pickle('RHEA_atom_mapped_timepoint_7_success.pkl')

In [2]:
import rdkit.Chem as Chem



# Ennumerate the dataset to incoporate all atom mapped solutions

Sometimes multiple atom mapping solutions result from the atom mapping algorithm by Jaworksi et al. The goal is to ennumerate the different solutions, so that all solutions are considered.

In [5]:
_id = []
prod_smiles = []
rxn_smiles = []
atom_map_smiles = []
not_atom_map_smiles = []

#loop through the dataset
for row in data_orig.itertuples():
    #loop through the different atom mapping solutions
    for list_item in row[3]:
        _id.append (row [1])
        prod_smiles.append (row[2])
        rxn_smiles.append (list_item)
        atom_map_smiles.append (row[4])
        not_atom_map_smiles.append (row[5]) 

In [6]:
#rebuild the dataframe
data = pd.DataFrame({'id': _id,
                        'prod_smiles': prod_smiles,
                        'rxn_smiles': rxn_smiles,
                        'atom mapped smiles-input': atom_map_smiles,
                        'not atom mapped smiles-input':not_atom_map_smiles})

In [7]:
data.head()

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...
1,24762,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:5...,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...
2,23757,O=C1O[C@H](CO)C([O-])=C1O,[O:1]=[C:2]1[O:3][C@H:4]([CH2:5][OH:6])[C@@H:7...,[[O:11]=[O:12].[O:1]=[C:2]1[O:3][C@H:4]([CH2:5...,O=C1O[C@H](CO)[C@@H](O)[C@@H]1O.O=O>>O=C1O[C@H...
3,23854,Oc1ccccc1O,O=[C:2]([O-:3])/[CH:4]=[CH:5]\[CH:6]=[CH:7]/[C...,[[H+].[H+].[O:1]=[C:2]([O-:3])/[CH:4]=[CH:5]\[...,O=C([O-])/C=C\C=C/C(=O)[O-].[H+].[H+]>>O=O.OC1...
4,15154,O=C([O-])c1ccc(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C([O-])C1=CC=C(O)C=C1.O=C1C=CN([C@@H]2O[C@H]...


# Canonicalize the reaction and add to the pandas dataframe

Canonicalize the reaction SMILES to help remove duplicates.

In [8]:
#load the necessary modules
from rdkit.Chem import rdChemReactions

In [9]:
#initial definitions
debug = False
count = 0
canonical_rxn_smiles_list = []

for row in data.itertuples():
    
    count += 1
    
    if debug and count == 5:
        break
    

    if debug:
        print ('The original reaction is: {}'.format (row[3]))
    
    #split reactants and product
    all_reactants, product = row[3].split('>>')
    
    #remove atom mapping information from the product 
    product_mol = Chem.MolFromSmiles(product)
    [a.ClearProp('molAtomMapNumber') for a in product_mol.GetAtoms()]
    prod_smi = Chem.MolToSmiles(product_mol, True)
    
    #remove atom mapping information from the reactant
    reactants = [Chem.MolFromSmiles(smi) for smi in all_reactants.split('.')]
    reactants_smi_list = []
    for react in reactants:
        [a.ClearProp('molAtomMapNumber') for a in react.GetAtoms()]
        reactants_smi_list.append(Chem.MolToSmiles(react, True))
    reactants_smi = '.'.join(reactants_smi_list)

    if debug:
        print ('Ammended reaction is {}>>{}'.format (reactants_smi,prod_smi))
    
    #load the reaction into RDKit and have it translate into SMILES
    rxn = rdChemReactions.ReactionFromSmarts('{}>>{}'.format(reactants_smi,prod_smi),useSmiles = True)
    rxn_smiles_canonical = rdChemReactions.ReactionToSmiles(rxn)

    if debug:
        print ('Canonical reaction smiles is: {}'.format (rxn_smiles_canonical))
        print ('-------------------------------')

    canonical_rxn_smiles_list.append (rxn_smiles_canonical)

In [10]:
data ['canonical_rxn_smiles'] = canonical_rxn_smiles_list

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16648 entries, 0 to 16647
Data columns (total 6 columns):
id                              16648 non-null object
prod_smiles                     16648 non-null object
rxn_smiles                      16648 non-null object
atom mapped smiles-input        16648 non-null object
not atom mapped smiles-input    16648 non-null object
canonical_rxn_smiles            16648 non-null object
dtypes: object(6)
memory usage: 780.5+ KB


# Remove duplicate transformations

In [12]:
data.head(2)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,canonical_rxn_smiles
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,O=P([O-])([O-])OC[C@H](O)CO.O=c1ccn([C@@H]2O[C...
1,24762,[NH3+][C@@H](Cc1ccc(O)c(O)c1)C(=O)[O-],[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:5...,[[NH2:1][C:2](=[O:3])[c:4]1[cH:5][cH:6][cH:7][...,NC(=O)C1=CC=C[N+]([C@@H]2O[C@H](COP(=O)([O-])O...,O=C([O-])CCc1ccc(O)c(O)c1.[NH4+]>>[NH3+][C@@H]...


In [13]:
#growing list of reactions to include
rxns_included = []

#keep track of lists
_id = []
prod_smiles = []
rxn_smiles = []
atom_map_smiles = []
not_atom_map_smiles = []
canonical_rxn_smiles = []

for row in data.itertuples():
    
    # if the reaction is already there, do not include it
    if row[6] in rxns_included:
        continue
    
    #if it is not there, include it in the database
    _id.append (row [1])
    prod_smiles.append (row[2])
    rxn_smiles.append (row[3])
    atom_map_smiles.append (row[4])
    not_atom_map_smiles.append (row[5])
    canonical_rxn_smiles.append (row[6])
    #add to the reaction included list
    rxns_included.append (row[6])

In [14]:
#rebuild the dataframe
data_v2 = pd.DataFrame({'id': _id,
                        'prod_smiles': prod_smiles,
                        'rxn_smiles': rxn_smiles,
                        'atom mapped smiles-input': atom_map_smiles,
                        'not atom mapped smiles-input':not_atom_map_smiles,
                    'canonical_rxn_smiles': canonical_rxn_smiles})

In [15]:
data_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14013 entries, 0 to 14012
Data columns (total 6 columns):
id                              14013 non-null object
prod_smiles                     14013 non-null object
rxn_smiles                      14013 non-null object
atom mapped smiles-input        14013 non-null object
not atom mapped smiles-input    14013 non-null object
canonical_rxn_smiles            14013 non-null object
dtypes: object(6)
memory usage: 657.0+ KB


In [16]:
#double check that you removed duplicates
from collections import Counter
post_removal = Counter (data_v2['canonical_rxn_smiles'])
pre_removal = Counter (data['canonical_rxn_smiles'])

In [17]:
Counter (pre_removal.values())

Counter({1: 12393,
         2: 1246,
         17: 1,
         3: 210,
         11: 4,
         4: 79,
         5: 20,
         25: 2,
         16: 2,
         6: 17,
         18: 3,
         13: 5,
         29: 1,
         8: 7,
         30: 1,
         12: 4,
         7: 11,
         26: 2,
         15: 1,
         14: 2,
         9: 2})

In [18]:
Counter (post_removal.values())

Counter({1: 14013})

In data_v2, duplicates were removed!

# Get reaction templates using RDEnzyme
Approach: Get reaction templates and keep reactions that have popular reaction templates only! This can help test for generalizability of reactions in topK accuracy analysis.

## Get reaction SMILES

In [20]:
rxn_smiles_list = []
for row in data_v2.itertuples():
    rxn_smiles_list.append (row[3])

In [21]:
rxn_smiles_list

['O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])O[C@H:29]3[O:30][C@H:31]([CH2:32][OH:33])[C@H:34]([OH:35])[C@H:36]([OH:37])[C@H:38]3[OH:39])[C@@H](O)[C@H]2O)c(=O)[nH]1.[O:1]=[P:2]([O-:3])([O-:4])[O:5][CH2:6][C@H:7]([OH:8])[CH2:9][OH:10]>>[O:1]=[P:2]([O-:3])([O-:4])[O:5][CH2:6][C@H:7]([OH:8])[CH2:9][O:10][C@H:29]1[O:30][C@H:31]([CH2:32][OH:33])[C@H:34]([OH:35])[C@H:36]([OH:37])[C@H:38]1[OH:39]',
 '[NH4+:45].[O:46]=[C:47]([O-:48])[CH2:49][CH2:50][c:51]1[cH:52][cH:53][c:54]([OH:55])[c:56]([OH:57])[cH:58]1>>[NH3+:45][C@H:49]([C:47](=[O:46])[O-:48])[CH2:50][c:51]1[cH:52][cH:53][c:54]([OH:55])[c:56]([OH:57])[cH:58]1',
 '[O:1]=[C:2]1[O:3][C@H:4]([CH2:5][OH:6])[C@@H:7]([OH:8])[C@@H:9]1[OH:10]>>[O:1]=[C:2]1[O:3][C@H:4]([CH2:5][OH:6])[C:7]([O-:8])=[C:9]1[OH:10]',
 'O=[C:2]([O-:3])/[CH:4]=[CH:5]\\[CH:6]=[CH:7]/[C:8](=O)[O-:10]>>[c:2]1([OH:3])[cH:4][cH:5][cH:6][cH:7][c:8]1[OH:10]',
 'O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])O[C@H:29]3[O:30][C@H:31]([CH2:32][OH:33])[C@@H:34]([OH:35])[C@H:36]

## Get templates from rxn_smiles_list

In [24]:
from template_extractor_enz_v4 import extract_from_reaction

In [25]:
def get_templates(rxn_smi):
    
    # extracts the template    
    try:
        #convert reaction into a dictionary
        reaction = {}
        rct_0, rea_0, prd_0 = rxn_smi.split(' ')[0].split('>')
        reaction['reactants'] = rct_0
        reaction['products'] = prd_0
        reaction['_id'] = 0
                
        #extract the template
        template = extract_from_reaction(reaction)['reaction_smarts']
    
    # fails to extract template
    except:
        template=None
    return template

In [26]:
from joblib import Parallel, delayed

In [27]:
templates=Parallel(n_jobs=20, verbose=20)(delayed(get_templates)(rxn_smi) for rxn_smi in rxn_smiles_list)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Batch computation too fast (0.1538s.) Setting batch_size=2.
[Parallel(n_jobs=20)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   3 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done  13 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done  14 ta

[Parallel(n_jobs=20)]: Done 372 tasks      | elapsed:    1.4s
[Parallel(n_jobs=20)]: Done 380 tasks      | elapsed:    1.4s
[Parallel(n_jobs=20)]: Done 388 tasks      | elapsed:    1.4s
[Parallel(n_jobs=20)]: Done 396 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 404 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 412 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 416 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 424 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 432 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 440 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 448 tasks      | elapsed:    1.5s
[Parallel(n_jobs=20)]: Done 456 tasks      | elapsed:    1.6s
[Parallel(n_jobs=20)]: Done 464 tasks      | elapsed:    1.6s
[Parallel(n_jobs=20)]: Done 472 tasks      | elapsed:    1.6s
[Parallel(n_jobs=20)]: Done 480 tasks      | elapsed:    1.6s
[Parallel(n_jobs=20)]: Done 488 tasks      | elapsed:    1.6s
[Paralle

[Parallel(n_jobs=20)]: Done 1464 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1472 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1480 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1488 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1496 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1504 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1512 tasks      | elapsed:    3.6s
[Parallel(n_jobs=20)]: Done 1520 tasks      | elapsed:    3.7s
[Parallel(n_jobs=20)]: Done 1528 tasks      | elapsed:    3.7s
[Parallel(n_jobs=20)]: Done 1536 tasks      | elapsed:    3.7s
[Parallel(n_jobs=20)]: Done 1544 tasks      | elapsed:    3.7s
[Parallel(n_jobs=20)]: Done 1552 tasks      | elapsed:    3.8s
[Parallel(n_jobs=20)]: Done 1560 tasks      | elapsed:    3.8s
[Parallel(n_jobs=20)]: Done 1568 tasks      | elapsed:    3.8s
[Parallel(n_jobs=20)]: Done 1576 tasks      | elapsed:    3.8s
[Parallel(n_jobs=20)]: Done 1584 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 2592 tasks      | elapsed:    5.9s
[Parallel(n_jobs=20)]: Done 2600 tasks      | elapsed:    5.9s
[Parallel(n_jobs=20)]: Done 2608 tasks      | elapsed:    6.0s
[Parallel(n_jobs=20)]: Done 2616 tasks      | elapsed:    6.0s
[Parallel(n_jobs=20)]: Done 2624 tasks      | elapsed:    6.0s
[Parallel(n_jobs=20)]: Done 2632 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2640 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2648 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2656 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2664 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2672 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2680 tasks      | elapsed:    6.1s
[Parallel(n_jobs=20)]: Done 2688 tasks      | elapsed:    6.2s
[Parallel(n_jobs=20)]: Done 2696 tasks      | elapsed:    6.2s
[Parallel(n_jobs=20)]: Done 2704 tasks      | elapsed:    6.2s
[Parallel(n_jobs=20)]: Done 2712 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 3712 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3720 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3728 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3736 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3744 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3752 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 3760 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3768 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3776 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3784 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3792 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3800 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3808 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3816 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3824 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done 3832 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 4816 tasks      | elapsed:   10.6s
[Parallel(n_jobs=20)]: Done 4824 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4832 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4840 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4848 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4856 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4864 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4872 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4880 tasks      | elapsed:   10.7s
[Parallel(n_jobs=20)]: Done 4888 tasks      | elapsed:   10.8s
[Parallel(n_jobs=20)]: Done 4896 tasks      | elapsed:   10.8s
[Parallel(n_jobs=20)]: Done 4904 tasks      | elapsed:   10.8s
[Parallel(n_jobs=20)]: Done 4912 tasks      | elapsed:   10.8s
[Parallel(n_jobs=20)]: Done 4920 tasks      | elapsed:   10.9s
[Parallel(n_jobs=20)]: Done 4928 tasks      | elapsed:   10.9s
[Parallel(n_jobs=20)]: Done 4936 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 5928 tasks      | elapsed:   12.8s
[Parallel(n_jobs=20)]: Done 5936 tasks      | elapsed:   12.8s
[Parallel(n_jobs=20)]: Done 5944 tasks      | elapsed:   12.8s
[Parallel(n_jobs=20)]: Done 5952 tasks      | elapsed:   12.8s
[Parallel(n_jobs=20)]: Done 5960 tasks      | elapsed:   12.8s
[Parallel(n_jobs=20)]: Done 5968 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 5976 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 5984 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 5992 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 6000 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 6008 tasks      | elapsed:   12.9s
[Parallel(n_jobs=20)]: Done 6016 tasks      | elapsed:   13.0s
[Parallel(n_jobs=20)]: Done 6024 tasks      | elapsed:   13.0s
[Parallel(n_jobs=20)]: Done 6032 tasks      | elapsed:   13.0s
[Parallel(n_jobs=20)]: Done 6040 tasks      | elapsed:   13.0s
[Parallel(n_jobs=20)]: Done 6048 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 7016 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7024 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7032 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7040 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7048 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7056 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7064 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7072 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 7080 tasks      | elapsed:   15.2s
[Parallel(n_jobs=20)]: Done 7088 tasks      | elapsed:   15.2s
[Parallel(n_jobs=20)]: Done 7096 tasks      | elapsed:   15.2s
[Parallel(n_jobs=20)]: Done 7104 tasks      | elapsed:   15.2s
[Parallel(n_jobs=20)]: Done 7112 tasks      | elapsed:   15.3s
[Parallel(n_jobs=20)]: Done 7120 tasks      | elapsed:   15.3s
[Parallel(n_jobs=20)]: Done 7128 tasks      | elapsed:   15.3s
[Parallel(n_jobs=20)]: Done 7136 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 8152 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8160 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8168 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8176 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8184 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8192 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8200 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8208 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8216 tasks      | elapsed:   17.7s
[Parallel(n_jobs=20)]: Done 8224 tasks      | elapsed:   17.8s
[Parallel(n_jobs=20)]: Done 8232 tasks      | elapsed:   17.8s
[Parallel(n_jobs=20)]: Done 8240 tasks      | elapsed:   17.9s
[Parallel(n_jobs=20)]: Done 8248 tasks      | elapsed:   17.9s
[Parallel(n_jobs=20)]: Done 8256 tasks      | elapsed:   17.9s
[Parallel(n_jobs=20)]: Done 8264 tasks      | elapsed:   17.9s
[Parallel(n_jobs=20)]: Done 8272 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 9224 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9232 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9240 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9248 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9256 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9264 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 9272 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9280 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9288 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9296 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9304 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9312 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9320 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9328 tasks      | elapsed:   20.0s
[Parallel(n_jobs=20)]: Done 9336 tasks      | elapsed:   20.1s
[Parallel(n_jobs=20)]: Done 9344 tasks      | elapsed: 

[Parallel(n_jobs=20)]: Done 10336 tasks      | elapsed:   22.0s
[Parallel(n_jobs=20)]: Done 10344 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10352 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10360 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10368 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10376 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10384 tasks      | elapsed:   22.1s
[Parallel(n_jobs=20)]: Done 10392 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10400 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10408 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10416 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10424 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10432 tasks      | elapsed:   22.2s
[Parallel(n_jobs=20)]: Done 10440 tasks      | elapsed:   22.3s
[Parallel(n_jobs=20)]: Done 10448 tasks      | elapsed:   22.3s
[Parallel(n_jobs=20)]: Done 10456 tasks 

[Parallel(n_jobs=20)]: Done 11424 tasks      | elapsed:   24.2s
[Parallel(n_jobs=20)]: Done 11432 tasks      | elapsed:   24.2s
[Parallel(n_jobs=20)]: Done 11440 tasks      | elapsed:   24.2s
[Parallel(n_jobs=20)]: Done 11448 tasks      | elapsed:   24.2s
[Parallel(n_jobs=20)]: Done 11456 tasks      | elapsed:   24.2s
[Parallel(n_jobs=20)]: Done 11464 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11472 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11480 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11488 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11496 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11504 tasks      | elapsed:   24.3s
[Parallel(n_jobs=20)]: Done 11512 tasks      | elapsed:   24.4s
[Parallel(n_jobs=20)]: Done 11520 tasks      | elapsed:   24.4s
[Parallel(n_jobs=20)]: Done 11528 tasks      | elapsed:   24.4s
[Parallel(n_jobs=20)]: Done 11536 tasks      | elapsed:   24.4s
[Parallel(n_jobs=20)]: Done 11544 tasks 

[Parallel(n_jobs=20)]: Done 12464 tasks      | elapsed:   26.6s
[Parallel(n_jobs=20)]: Done 12472 tasks      | elapsed:   26.6s
[Parallel(n_jobs=20)]: Done 12480 tasks      | elapsed:   26.6s
[Parallel(n_jobs=20)]: Done 12488 tasks      | elapsed:   26.6s
[Parallel(n_jobs=20)]: Done 12496 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12504 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12512 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12520 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12528 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12536 tasks      | elapsed:   26.7s
[Parallel(n_jobs=20)]: Done 12544 tasks      | elapsed:   26.8s
[Parallel(n_jobs=20)]: Done 12552 tasks      | elapsed:   26.8s
[Parallel(n_jobs=20)]: Done 12560 tasks      | elapsed:   26.8s
[Parallel(n_jobs=20)]: Done 12568 tasks      | elapsed:   26.8s
[Parallel(n_jobs=20)]: Done 12576 tasks      | elapsed:   26.8s
[Parallel(n_jobs=20)]: Done 12584 tasks 

[Parallel(n_jobs=20)]: Done 13536 tasks      | elapsed:   30.4s
[Parallel(n_jobs=20)]: Done 13544 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13552 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13560 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13568 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13576 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13584 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done 13592 tasks      | elapsed:   30.6s
[Parallel(n_jobs=20)]: Done 13600 tasks      | elapsed:   30.6s
[Parallel(n_jobs=20)]: Done 13608 tasks      | elapsed:   30.7s
[Parallel(n_jobs=20)]: Done 13616 tasks      | elapsed:   30.7s
[Parallel(n_jobs=20)]: Done 13624 tasks      | elapsed:   30.8s
[Parallel(n_jobs=20)]: Done 13632 tasks      | elapsed:   30.8s
[Parallel(n_jobs=20)]: Done 13640 tasks      | elapsed:   30.8s
[Parallel(n_jobs=20)]: Done 13648 tasks      | elapsed:   30.8s
[Parallel(n_jobs=20)]: Done 13656 tasks 

[Parallel(n_jobs=20)]: Done 14013 out of 14013 | elapsed:   32.0s finished


## Keep reactions that have a popular template
In this analysis, a popular template is defined as one that occurs three times or more.

In [35]:
#dictionary that links reaction SMILES to template
rxn_template_dict = {}
for (rxn_smi, template) in zip (rxn_smiles_list,templates):
    rxn_template_dict [rxn_smi] = template 

In [36]:
#dictionary that links template to the number of times it occurs in the dataset
templates2 = {}
for retro_canonical_uni in templates:
    if retro_canonical_uni in templates2:
        templates2[retro_canonical_uni] += 1
    else:
        templates2[retro_canonical_uni] = 1

In [37]:
#understand what the most popular templates look like
for template, count in sorted(templates2.items(), 
                              key=lambda x: x[1], reverse=True)[:50]:
    print('{}     {}'.format(count, template))

177     ([C:2]-[OH;D1;+0:1])>>(O=P(-[O-])(-[O-])-[O;H0;D2;+0:1]-[C:2])
124     ([C:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[O;H0;D2;+0:5]-[C:4])>>C-C(-C)(-C-O-P(=O)(-[O-])-O-P(=O)(-[O-])-O-C-[C@H]1-O-[C@@H](-n2:c:n:c3:c(-N):n:c:n:c:3:2)-[C@H](-O)-[C@@H]-1-O-P(=O)(-[O-])-[O-])-[C@@H](-O)-C(=O)-N-C-C-C(=O)-N-C-C-S-[C;H0;D3;+0:1](-[C:2])=[O;D1;H0:3].[C:4]-[OH;D1;+0:5]
118     ([C:5]-[O;H0;D2;+0:6]-[P;H0;D4;+0:1](-[O-;H0;D1:4])(-[O;-;D1;H0:2])=[O;D1;H0:3])>>N-c1:n:c:n:c2:c:1:n:c:n:2-[C@@H]1-O-[C@H](-C-O-P(=O)(-[O-])-O-[P;H0;D4;+0:1](-[O;-;D1;H0:2])(=[O;D1;H0:3])-[O;H0;D2;+0:4]-P(=O)(-[O-])-[O-])-[C@@H](-O)-[C@H]-1-O.[C:5]-[OH;D1;+0:6]
110     ([C:5]-[O;H0;D2;+0:6]-[P;H0;D4;+0:1](-[O;-;D1;H0:2])(-[O;-;D1;H0:3])=[O;D1;H0:4])>>O-[P;H0;D4;+0:1](-[O;-;D1;H0:2])(-[O;-;D1;H0:3])=[O;D1;H0:4].[C:5]-[OH;D1;+0:6]
90     ([OH;D1;+0:1]-[c:2])>>(C-[O;H0;D2;+0:1]-[c:2])
88     ([C:2]-[OH;D1;+0:1])>>(C-C-C-C-C-C-C-C/C=C\C-C-C-C-C-C-C-C(=O)-[O;H0;D2;+0:1]-[C:2])
85     ([CH3;D1;+0:1]-[O;H0;D2;+0:2]-[c:3])>>N-c1:n:

In [39]:
#understand how many reactions are lost by restricting the template count values
total_rxn = 0
for count in templates2.values():
    if count >= 3:
        total_rxn += count

In [40]:
print (total_rxn)

6973


## Keep reactions that have a template >3

In [43]:
_id = []
prod_smiles = []
rxn_smiles = []
atom_map_smiles = []
not_atom_map_smiles = []

for row in data_v2.itertuples():
    
    rxn_smiles_list = []
    #get reaction template
    rxn_template = rxn_template_dict[row[3]]
    #get template count
    count = templates2[rxn_template]
    #if count >=3, then keep the reaction
    if count >= 3:
        _id.append (row[1])
        prod_smiles.append (row[2])
        rxn_smiles.append (row[3])
        atom_map_smiles.append (row[4])
        not_atom_map_smiles.append (row[5])

In [45]:
data_v3 = pd.DataFrame({'id': _id,
                        'prod_smiles': prod_smiles,
                        'rxn_smiles': rxn_smiles,
                        'atom mapped smiles-input': atom_map_smiles,
                        'not atom mapped smiles-input': not_atom_map_smiles})

In [46]:
data_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6973 entries, 0 to 6972
Data columns (total 5 columns):
id                              6973 non-null object
prod_smiles                     6973 non-null object
rxn_smiles                      6973 non-null object
atom mapped smiles-input        6973 non-null object
not atom mapped smiles-input    6973 non-null object
dtypes: object(5)
memory usage: 272.5+ KB


# Randomly split the data into training, validation, and test sets

Randomly split the dataset into training, validation, and test splits.

In [47]:
import numpy as np
import time

In [48]:
def split_data_df(data, test_frac = 0.1, val_frac = 0.1, shuffle=True, seed = 0):
   
    # Define shuffling
    if shuffle:
        if seed is None:
            np.random.seed(int(time.time()))
        else:
            np.random.seed(seed)
            
        def shuffle_func(x):
            np.random.shuffle(x)
    else:
        def shuffle_func(x):
            pass
    
    #get all indeces
    indeces = data.index.tolist()
    N = len(indeces)
    print ('{} reactions available in the dataset'.format(N))
    
    shuffle_func(indeces)
    
    train_end = int((1.0-val_frac-test_frac)*N)
    val_end = int((1.0-test_frac)*N)
    
    for i in indeces [:train_end]:
        data.set_value(i, 'dataset', 'train')
    for i in indeces [train_end:val_end]:
        data.set_value(i,'dataset', 'val')
    for i in indeces [val_end:]:
        data.set_value(i,'dataset', 'test')
        
    print(data['dataset'].value_counts())
    

In [49]:
split_data_df(data_v3)

6973 reactions available in the dataset
train    5578
test      698
val       697
Name: dataset, dtype: int64




In [50]:
data_v3

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,dataset
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,test
1,15154,O=C([O-])c1ccc(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C([O-])C1=CC=C(O)C=C1.O=C1C=CN([C@@H]2O[C@H]...,train
2,50174,CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)N[C@@H](C)...,OO[C@@H:9]([CH2:8]/[CH:7]=[CH:6]\[CH2:5][CH2:4...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[C...,CCCCC/C=C\C[C@@H](/C=C/C=C\C/C=C\CCCC(=O)N[C@@...,train
3,37517,CCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(...,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,train
4,19366,O=C(COP(=O)([O-])[O-])[C@H](O)[C@H](O)COP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])O[P:2...,[[NH2:1][c:2]1[n:3][cH:4][n:5][c:6]2[c:7]1[n:8...,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(=O)([O-])O...,train
...,...,...,...,...,...,...
6968,54898,CC/C=C\C[C@@H]1C(=O)C=C[C@@H]1CCCCCCCC(=O)[O-],CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,[[CH3:1][CH2:2]/[CH:3]=[CH:4]\[CH2:5][C@@H:6]1...,CC/C=C\C[C@@H]1C(=O)C=C[C@@H]1CCCCCCCC(=O)SCCN...,train
6969,44353,CC(C)C[C@H](NC(=O)[C@@H]([NH3+])CNC(=O)/C=C/C(...,[CH3:1][CH:2]([CH3:3])[CH2:4][C@H:5]([NH3+:6])...,[[CH3:1][CH:2]([CH3:3])[CH2:4][C@H:5]([NH3+:6]...,CC(C)C[C@H]([NH3+])C(=O)[O-].NC(=O)/C=C/C(=O)N...,train
6970,49602,O=C([O-])CCCCCCC(=O)[O-],CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,[[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,val
6971,40292,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7]...,[[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,train


In [51]:
datasub = data_v3.loc[data_v3['dataset'] == 'train']
datasub_test = data_v3.loc [data_v3['dataset'] == 'test']
datasub_val = data_v3.loc [data_v3['dataset'] == 'val']

In [52]:
datasub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5578 entries, 1 to 6972
Data columns (total 6 columns):
id                              5578 non-null object
prod_smiles                     5578 non-null object
rxn_smiles                      5578 non-null object
atom mapped smiles-input        5578 non-null object
not atom mapped smiles-input    5578 non-null object
dataset                         5578 non-null object
dtypes: object(6)
memory usage: 305.0+ KB


In [53]:
datasub_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 0 to 6952
Data columns (total 6 columns):
id                              698 non-null object
prod_smiles                     698 non-null object
rxn_smiles                      698 non-null object
atom mapped smiles-input        698 non-null object
not atom mapped smiles-input    698 non-null object
dataset                         698 non-null object
dtypes: object(6)
memory usage: 38.2+ KB


In [54]:
datasub_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 7 to 6970
Data columns (total 6 columns):
id                              697 non-null object
prod_smiles                     697 non-null object
rxn_smiles                      697 non-null object
atom mapped smiles-input        697 non-null object
not atom mapped smiles-input    697 non-null object
dataset                         697 non-null object
dtypes: object(6)
memory usage: 38.1+ KB


In [55]:
datasub_test.head(5)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,dataset
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,test
21,50829,CCCCC[C@@H](/C=C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[CH...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[C...,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)[O-]....,test
24,52285,CCCCC/C=C\C(O)/C=C\CCCCCCCC(=O)[O-],[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[CH...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[C...,CC1=C(C)C=C2C(=C1)NC1=C(NC(=O)NC1=O)N2C[C@H](O...,test
25,37172,CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH2...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@@H](O)COP(=O)([O...,test
63,44874,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,[O-][P:23]([O:22][CH2:21][C@@H:20]([CH2:19][O:...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])[O-])...,test


In [56]:
datasub_val.head(5)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,dataset
7,41156,CCCCCCCCC(/C=C/CCCCCCC(=O)[O-])OO,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,[H]O[H].[H][C@@](COC(=O)CCCCCCCCCCCCCCC)(COP(=...,val
28,45778,N#CCc1c[nH]c2ccccc12,[NH4+:1].O=[C:3]([O-])[CH2:5][c:6]1[cH:7][nH:8...,[[NH4+:1].[O:2]=[C:3]([O-:4])[CH2:5][c:6]1[cH:...,O=C([O-])CC1=CN/C2=C\C=C/C=C\12.[H][N+]([H])([...,val
35,24732,Nc1ccccc1C(=O)[O-],[NH2:1][c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1.[...,[[NH2:1][c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1....,NC1=CC=CC=C1.O=C=O>>NC1=C(C(=O)[O-])C=CC=C1.[H+],val
56,23149,CCC(=O)OP(=O)([O-])[O-],[CH3:1][CH2:2][C:3](=[O:4])[O-:5].Nc1ncnc2c1nc...,[[CH3:1][CH2:2][C:3](=[O:4])[O-:5].[NH2:6][c:7...,CCC(=O)[O-].NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](CO...,val
60,36688,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@H](CO)NC(=O)CCCC...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH2...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H]([NH3+])CO.CC...,val


# Rebuild the test set

The goal is to rebuild the test set with precursor goal as a list.

In [57]:
import rdkit.Chem as Chem

In [58]:
datasub_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 0 to 6952
Data columns (total 6 columns):
id                              698 non-null object
prod_smiles                     698 non-null object
rxn_smiles                      698 non-null object
atom mapped smiles-input        698 non-null object
not atom mapped smiles-input    698 non-null object
dataset                         698 non-null object
dtypes: object(6)
memory usage: 38.2+ KB


In [59]:
datasub_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 0 to 6952
Data columns (total 6 columns):
id                              698 non-null object
prod_smiles                     698 non-null object
rxn_smiles                      698 non-null object
atom mapped smiles-input        698 non-null object
not atom mapped smiles-input    698 non-null object
dataset                         698 non-null object
dtypes: object(6)
memory usage: 38.2+ KB


In [60]:
_id = []
prod_smiles = []
rxn_smiles = []
ams_input = []
not_ams_input = []
canonical_rxn_smiles = []
dataset = []
prec_goal_collector = []

debug = False
count = 0

#Loop through the test set
for row in datasub_test.itertuples():
    
    # add to the count
    count += 1
    
    if debug and count == 10:
        break
    
    #at every iteration, start with an empty prec_goal_list!
    prec_goal_list = []

    #get the precursors
    prec_goal = Chem.MolFromSmiles(row[3].split('>')[0])
    [a.ClearProp('molAtomMapNumber') for a in prec_goal.GetAtoms()]
    prec_goal = Chem.MolToSmiles(prec_goal, True)

    #canonicalize
    prec_goal = Chem.MolToSmiles(Chem.MolFromSmiles(prec_goal), True)

    #add to list
    prec_goal_list.append (prec_goal)
    
    #store stuff so that you are able to rebuild the pandas dataframe!
    _id.append (row[1])
    prod_smiles.append (row[2])
    rxn_smiles.append (row[3])
    ams_input.append (row[4])
    not_ams_input.append (row[5])
    dataset.append (row[6])
    
    #probably unnecessary-there is always just one item in the prec_goal_list, so the set -> list is not
    #strictly needed. It doesn't hurt!
    prec_goal_list = list (set(prec_goal_list))
    #the final list that goes into the Pandas dataframe!
    prec_goal_collector.append (prec_goal_list)
    

In [61]:
datasub_test_2 = pd.DataFrame({'id': _id,
                                'prod_smiles': prod_smiles,
                                'rxn_smiles': rxn_smiles,
                                'atom mapped smiles-input': ams_input,
                                'not atom mapped smiles-input': not_ams_input,
                                'dataset': dataset,
                                  'prec_goal': prec_goal_collector})

In [62]:
datasub_test_2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 7 columns):
id                              698 non-null object
prod_smiles                     698 non-null object
rxn_smiles                      698 non-null object
atom mapped smiles-input        698 non-null object
not atom mapped smiles-input    698 non-null object
dataset                         698 non-null object
prec_goal                       698 non-null object
dtypes: object(7)
memory usage: 38.3+ KB


In [63]:
datasub_test_2.head()


Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,dataset,prec_goal
0,20342,O=P([O-])([O-])OC[C@H](O)CO[C@H]1O[C@H](CO)[C@...,O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-]...,[[O:11]=[c:12]1[cH:13][cH:14][n:15]([C@@H:16]2...,O=C1C=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-...,test,[O=P([O-])([O-])OC[C@H](O)CO.O=c1ccn([C@@H]2O[...
1,50829,CCCCC[C@@H](/C=C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[CH...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[C...,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)[O-]....,test,[CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)[O-]...
2,52285,CCCCC/C=C\C(O)/C=C\CCCCCCCC(=O)[O-],[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[CH...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5]/[CH:6]=[C...,CC1=C(C)C=C2C(=C1)NC1=C(NC(=O)NC1=O)N2C[C@H](O...,test,[CCCCC/C=C\C/C=C\CCCCCCCC(=O)[O-].O=O]
3,37172,CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])...,[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH2...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@@H](O)COP(=O)([O...,test,[CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@@H](O)COP(=O)([...
4,44874,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,[O-][P:23]([O:22][CH2:21][C@@H:20]([CH2:19][O:...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])[O-])...,test,[CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])[O-]...


# Rebuild the validation set

The goal is to rebuild the validation set with precursor goal as a list.

In [64]:
datasub_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 7 to 6970
Data columns (total 6 columns):
id                              697 non-null object
prod_smiles                     697 non-null object
rxn_smiles                      697 non-null object
atom mapped smiles-input        697 non-null object
not atom mapped smiles-input    697 non-null object
dataset                         697 non-null object
dtypes: object(6)
memory usage: 38.1+ KB


In [65]:
datasub_val.head(1)

Unnamed: 0,id,prod_smiles,rxn_smiles,atom mapped smiles-input,not atom mapped smiles-input,dataset
7,41156,CCCCCCCCC(/C=C/CCCCCCC(=O)[O-])OO,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,[[CH3:1][CH2:2][CH2:3][CH2:4][CH2:5][CH2:6][CH...,[H]O[H].[H][C@@](COC(=O)CCCCCCCCCCCCCCC)(COP(=...,val


In [66]:
_id = []
prod_smiles = []
rxn_smiles = []
ams_input = []
not_ams_input = []
canonical_rxn_smiles = []
dataset = []
prec_goal_collector = []

debug = False
count = 0

for row in datasub_val.itertuples():
    
    count += 1
    if debug and count == 10:
        break
    prec_goal_list = []

    #get the precursors
    prec_goal = Chem.MolFromSmiles(row[3].split('>')[0])
    [a.ClearProp('molAtomMapNumber') for a in prec_goal.GetAtoms()]
    prec_goal = Chem.MolToSmiles(prec_goal, True)

    #canonicalize
    prec_goal = Chem.MolToSmiles(Chem.MolFromSmiles(prec_goal), True)

    #add to growing list
    prec_goal_list.append (prec_goal)

    _id.append (row[1])
    prod_smiles.append (row[2])
    rxn_smiles.append (row[3])
    ams_input.append (row[4])
    not_ams_input.append (row[5])
    dataset.append (row[6])
    ##probably unnecessary-there is always just one item in the prec_goal_list, so the set -> list is not
    #strictly needed. It doesn't hurt!
    prec_goal_list = list (set(prec_goal_list))
    prec_goal_collector.append (prec_goal_list)

In [67]:
datasub_val_2 = pd.DataFrame({'id': _id,
                                'prod_smiles': prod_smiles,
                                'rxn_smiles': rxn_smiles,
                                'atom mapped smiles-input': ams_input,
                                'not atom mapped smiles-input': not_ams_input,
                                'dataset': dataset,
                                  'prec_goal': prec_goal_collector})

# Save the training/ validation/ test dataset

In [72]:
datasub.to_pickle ('Training_Set_Processed_new_topK_v1.pkl')
datasub_test_2.to_pickle ('Test_Set_Processed_new_topK_v1.pkl')
datasub_val_2.to_pickle ('Validation_Set_Processed_new_topK_v1.pkl')