In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolfiles
from joblib import Parallel, delayed

In [18]:
def generatereactsmarts(reactsmiles):
    """Build a reduced reaction template from an atom-mapped reaction string.

    The reaction is parsed with ``AllChem.ReactionFromSmarts``. For every
    reactant template, the reacting atoms reported by
    ``GetReactingAtoms(mappedAtomsOnly=True)`` plus their directly bonded,
    atom-mapped, non-reacting neighbours are kept and renumbered with
    consecutive map numbers starting at 1. The same renumbering is then applied
    to the matching atoms of every product template. Each kept fragment is
    written with ``MolFragmentToSmarts`` and wrapped in parentheses; fragments
    on one side are joined with ``.`` and the two sides with ``>>``.

    Parameters
    ----------
    reactsmiles : str
        Atom-mapped reaction SMILES/SMARTS.

    Returns
    -------
    str or bool
        The template string, or ``False`` when the reaction cannot be parsed or
        initialised, the reacting atoms cannot be determined, a fragment cannot
        be written, or no reactant fragment was selected. When no product atom
        carries one of the selected map numbers, the product side of the
        returned string is empty.
    """
    # `except Exception` (not a bare `except`) keeps the "return False on any
    # RDKit failure" contract while letting KeyboardInterrupt/SystemExit through.
    try:
        rxn = AllChem.ReactionFromSmarts(reactsmiles)
        rxn.Initialize()
    except Exception:
        return False

    reactants = [rxn.GetReactantTemplate(m) for m in range(rxn.GetNumReactantTemplates())]
    products = [rxn.GetProductTemplate(m) for m in range(rxn.GetNumProductTemplates())]

    try:
        reactatoms = rxn.GetReactingAtoms(mappedAtomsOnly=True)
    except Exception:
        return False

    old_to_new = {}   # original atom-map number -> new consecutive map number
    next_map = 1
    reactant_parts = []

    for template, reacting in zip(reactants, reactatoms):
        reacting_idx = set(reacting)
        seen_neighbors = set()
        selected = []

        for j in reacting:
            centre = template.GetAtomWithIdx(j)
            keep = [centre]
            # A neighbour shared by two reacting atoms must be handled once
            # only: on a second visit its map number has already been
            # rewritten, so recording it again would register a *new* number as
            # if it were an original one and relabel the atom a second time,
            # leaving reactant and product labels out of step.
            keep.extend(
                nbr for nbr in centre.GetNeighbors()
                if nbr.GetIdx() not in reacting_idx
                and nbr.GetIdx() not in seen_neighbors
                and nbr.GetAtomMapNum() != 0
            )
            for atom in keep:
                if atom is not centre:
                    seen_neighbors.add(atom.GetIdx())
                selected.append(atom.GetIdx())
                # First occurrence of an original map number wins.
                old_to_new.setdefault(atom.GetAtomMapNum(), next_map)
                atom.SetAtomMapNum(next_map)
                next_map += 1

        if not selected:
            continue
        try:
            fragment = rdmolfiles.MolFragmentToSmarts(
                template, atomsToUse=selected, isomericSmarts=False)
        except Exception:
            return False
        reactant_parts.append('(' + fragment + ')')

    if not reactant_parts:
        return False

    product_parts = []
    for template in products:
        selected = []
        for atom in template.GetAtoms():
            new_num = old_to_new.get(atom.GetAtomMapNum())
            if new_num is not None:
                atom.SetAtomMapNum(new_num)
                selected.append(atom.GetIdx())

        if not selected:
            continue
        try:
            fragment = rdmolfiles.MolFragmentToSmarts(
                template, atomsToUse=selected, isomericSmarts=False)
        except Exception:
            return False
        # Every product fragment is collected; previously an unconditional
        # `continue` meant only product template 0 could ever be emitted.
        product_parts.append('(' + fragment + ')')

    return '.'.join(reactant_parts) + '>>' + '.'.join(product_parts)

In [7]:
# Read the reaction table; low_memory=False makes pandas parse the file in a
# single pass instead of in chunks.
uspto = pd.read_csv(
    'FinalSmiles.csv',
    low_memory=False,
)
# Plain Python list of the 'ReactionSmiles' column, in file order.
reaction_column = uspto['ReactionSmiles']
reactsmiles = reaction_column.tolist()


In [19]:
# Generate one template per reaction across all available cores (n_jobs=-1).
# Each result is whatever generatereactsmarts returned for that reaction.
reactsmarts = Parallel(n_jobs=-1, verbose=1)(
    delayed(generatereactsmarts)(r) for r in reactsmiles
)

# Named `template_columns` rather than `dict` so the built-in `dict` type is
# not shadowed for the rest of the kernel session.
template_columns = {'Reaction Smarts': reactsmarts, 'ReactionSmiles': reactsmiles}
df = pd.DataFrame(template_columns)

# index=False is the documented way to omit the row index from the CSV.
df.to_csv('Templates_MolToSmarts_MapRNeighbors.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 920 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 6168 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 17368 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 31768 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 49368 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done 70168 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 94168 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 121368 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 151768 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 185368 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 222168 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 262168 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 305368 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Don