In [1]:
import os
import sys
import timeit
import time
import random
import argparse
import itertools
import pandas as pd
import numpy as np
from functools import partial

import multiprocessing 
from multiprocessing import Pool, Value, Lock, Manager

from rdkit import Chem
from rdkit.Chem import AllChem, rdChemReactions, rdmolfiles, rdmolops

from template_utils.amol_utils_rdchiral import Reaction, Binarizer, Parsers

In [2]:
datafile = '/home/cadd/Data/retro_synthesis/test_100/top100rxn.csv'# Path to .csv file containing data

In [3]:
p = Parsers()
print('Parsing: {}'.format(datafile))
reaction_data = p.import_USPTO(datafile)
dataset = pd.DataFrame(columns=["ID", "reaction_hash", "reactants", "products", "classification", "retro_template", "template_hash", "selectivity", "outcomes"])
failed = []

Parsing: uspto_test_data/raw_data/xzaaa


In [None]:
rxn_number=0
extracted=0
for index, row in reaction_data.iterrows():
    rxn_number+=1
    print('Reaction: {}'.format(rxn_number))
    reaction = Reaction(row["rsmi"], rid=row["ID"])
    try:
        if len(reaction.rsmi.split('>')) > 3:
            failed.append([row["ID"], row["rsmi"], "components > 3"])
            continue
        elif len(reaction.product_list) > 1:
            failed.append([row["ID"], row["rsmi"], "products > 1"])
            continue
        elif reaction.incomplete_reaction():
            failed.append([row["ID"], row["rsmi"], "incomplete"])
            continue
        elif reaction.equivalent_reactant_product_set():
            failed.append([row["ID"], row["rsmi"], "reactants = products"])
            continue
        elif reaction.generate_reaction_template(radius=1) is None:
            failed.append([row["ID"], row["rsmi"], "template generation failure"])
            continue
        elif reaction.validate_retro_template(reaction.retro_template) is None:
            failed.append([row["ID"], row["rsmi"], "template rdkit validation failed"])
            continue
        elif reaction.check_retro_template_outcome(reaction.retro_template, reaction.products, save_outcome=True) != 0:
            outcomes = len(reaction.retro_outcomes)
            assessment = reaction.assess_retro_template(reaction.retro_template, reaction.reactant_mol_list, reaction.retro_outcomes)
        else: 
            pass
    except:
        continue
  

    rinchi_hash = reaction.generate_concatenatedRInChI()
    row_list = [row["ID"],
                rinchi_hash,
                reaction.reactants,
                reaction.products,
                row["classification"],
                reaction.retro_template,
                reaction.hash_template(reaction.retro_template),
                assessment,
                outcomes]

    row = pd.DataFrame([row_list], columns=["ID", "reaction_hash", "reactants", "products", "classification", "retro_template", "template_hash", "selectivity", "outcomes"])
    dataset = dataset.append(row, sort = False)
    extracted += 1
    sys.stdout.flush()

print('Extracted: {}'.format(extracted))
failed_df = pd.DataFrame(failed, columns=["ID", "rsmi", "reason"])
output = dataset.drop_duplicates(subset="reaction_hash")

Reaction: 1
Reaction: 2
Reaction: 3
Reaction: 4
Reaction: 5
Reaction: 6
Reaction: 7
Reaction: 8
Reaction: 9
Reaction: 10
Reaction: 11
Reaction: 12
Reaction: 13
Reaction: 14
Reaction: 15
Could not find consistent tetrahedral mapping, 1 centers
Template could not be extracted
Reaction: 16
Reaction: 17
Reaction: 18
Reaction: 19
Reaction: 20
Reaction: 21
Reaction: 22
Reaction: 23
Reaction: 24
Reaction: 25
Reaction: 26
Reaction: 27
Reaction: 28
Reaction: 29
Reaction: 30
Reaction: 31
Reaction: 32
Reaction: 33
Reaction: 34
Reaction: 35
Reaction: 36
Reaction: 37
Reaction: 38
Reaction: 39
Reaction: 40
Reaction: 41
Reaction: 42
Reaction: 43
Reaction: 44
Reaction: 45
Reaction: 46
Reaction: 47
Reaction: 48
Reaction: 49
Reaction: 50
Reaction: 51
Reaction: 52
Reaction: 53
Reaction: 54
Reaction: 55
Reaction: 56
Reaction: 57
Reaction: 58
Could not find consistent tetrahedral mapping, 1 centers
Template could not be extracted
Reaction: 59
Reaction: 60
Could not find consistent tetrahedral mapping, 1 ce

In [5]:
output

Unnamed: 0,ID,reaction_hash,reactants,products,classification,retro_template,template_hash,selectivity,outcomes
0,US20150238500A1;829;2015,649b99e9aac638302e0afa69433859da21a2bf5933d8da...,O=S(Cl)Cl.O[C:1]([c:2]1[cH:3][c:4]([F:5])[c:6]...,Cl[C:1]([c:2]1[cH:3][c:4]([F:5])[c:6]([N+:7](=...,Acid to acid chloride,([Cl;H0;D1;+0]-[C;H0;D3;+0:1](=[O;D1;H0:2])-[c...,22c40e8fc0c9edff204ad748d2987ef676d928b154c233...,1.0,1
0,US20150238500A1;831;2015,ecd4815c281cff98d7fcbd30b4b06cc967ff0e3a96c8b2...,Cl[C:1]([c:2]1[cH:3][c:4]([F:5])[c:6]([N+:7](=...,[C:1]([c:2]1[cH:3][c:4]([F:5])[c:6]([N+:7](=[O...,N-acylation to amide,([C;D1;H3:4]-[#8:5]-[C:6](-[#8:7]-[C;D1;H3:8])...,091e0ff4dbdb346171d19426d33e9185e95698f1249559...,1.0,1
0,US20150238500A1;833;2015,058a6c4b6bef247254653c9804fc2a584bf231b36d94d0...,CO[CH:1]([CH2:2][NH:3][C:4](=O)[c:5]1[cH:6][c:...,[cH:1]1[cH:2][n:3][c:4](-[c:5]2[cH:6][c:7]([F:...,Unassigned,([c:6]1:[c:7]:[c:8]:[c:9]:[c:10]:[c;H0;D3;+0:5...,f9611c91f97f391f71729975d1f1017b98fdc3ac1782f2...,1.0,1
0,US20150238500A1;835;2015,361faf3c224348e9a8caff2e934252cf9fe768e34a6b14...,N#Cc1c(O)c(Cl)[c:1](Cl)[c:2](O)c1C#N.[F:3][c:4...,[CH2:1]([CH3:2])[n:8]1[cH:7][cH:6][c:5]2[c:4](...,Unassigned,([CH3;D1;+0:2]-[CH2;D2;+0:1]-[n;H0;D3;+0:5]1:[...,6b53617d323840146e5b447956131d0075a67ca92cfe8d...,1.0,1
0,US20150238500A1;836;2015,5f041f7b92f1a9f32cdf7ebb644e58547259535e925c6b...,Br[CH2:1][CH3:2].[F:3][c:4]1[c:5]2[c:6]([cH:7]...,[CH2:1]([CH3:2])[N:13]1[c:6]2[c:5]([c:4]([F:3]...,Heteroaryl N-alkylation,([C:3]-[N;H0;D3;+0:4](-[c:5])-[CH2;D2;+0:1]-[C...,3c1d2d8c50c92c9cd505838df72650e0103cd6120c735c...,1.0,1
0,US20150238500A1;837;2015,8604d1f13508e0fec1f7410cd5f197f82b975898c49d51...,CC(=O)[N:1]1[CH2:2][CH2:3][c:4]2[c:5]([F:6])[c...,[NH:1]1[CH2:2][CH2:3][c:4]2[c:5]([F:6])[c:7]([...,NH deprotections,([C:2]-[NH;D2;+0:1]-[c:3])>>(C-C(=O)-[N;H0;D3;...,166a34e5312e3cfae8e9633cfad030bf83e0b45474c0ef...,1.0,1
0,US20150238500A1;839;2015,7b85fd3fe2641a9fd66da5cb03ac3c7d35f19f38ba1321...,CC(=O)O[C:2]([CH3:1])=[O:3].[F:4][c:5]1[c:6]2[...,[CH3:1][C:2](=[O:3])[N:11]1[c:7]2[c:6]([c:5]([...,N-acylation to amide,([C:4]-[N;H0;D3;+0:5](-[c:6])-[C;H0;D3;+0:1](-...,b8b6aa1fda3031481e1d6a38877b5b04b73f932df85b35...,1.0,1
0,US20150238500A1;840;2015,1ef0ebf2bb480630a5ad96d958031fadbe037cfdc016f4...,[F:1][c:2]1[c:3]2[cH:4][cH:5][nH:6][c:7]2[cH:8...,[F:1][c:2]1[c:3]2[c:7]([cH:8][cH:9][cH:10]1)[N...,Alkene to alkane,([CH2;D2;+0:5]1-[CH2;D2;+0:4]-[NH;D2;+0:3]-[c:...,9fa7a35bfb124feec66d4d65ade96655e0707185b08f97...,1.0,1
0,US20150238500A1;841;2015,cf3674c5a5160216f3b26543ab43cca390cbd38e7a5fc4...,O=[N+]([O-])[c:4]1[c:3]([CH:8]=[CH:9][N:10]2CC...,[F:1][c:2]1[c:3]2[c:4]([cH:5][cH:6][cH:7]1)[nH...,Unassigned,([c:2]:[c;H0;D3;+0:1]1:[c:3]:[cH;D2;+0:4]:[cH;...,19c442590dcdb50efef51a9efd39f2a4caaf3a196760ca...,1.0,1
0,US20150238500A1;842;2015,01c6913ed533e0e21986e874b5251c5c1d54fce506e67c...,COC(O[CH3:17])N(C)C.[F:6][c:7]1[c:8]([CH3:16])...,[N:1]1([CH:17]=[CH:16][c:8]2[c:7]([F:6])[cH:12...,Unassigned,([C:4]-[N;H0;D3;+0:5](-[C:6])-[CH;D2;+0:1]=[CH...,5dbf78fb8b0473060ca6fb150be0c7585f67bf694a5d38...,1.0,1


In [6]:
failed_df

Unnamed: 0,ID,rsmi,reason
0,US20150238500A1;849;2015,CC(=N)O[CH2:10]C.CCN(CC)[CH2:1][CH3:2].[NH2:3]...,template generation failure
1,US20150238506A1;372;2015,[OH:1][n:2]1[n:3][n:4][c:5]2[c:6]1[cH:7][cH:8]...,reactants = products
2,US20150238506A1;491;2015,CC(C)(C)OC(=O)[N:1]1[CH2:2][CH2:3][N:4]([c:5]2...,products > 1
3,US20150238506A1;769;2015,CC(C)(C)OC(=O)[N:1]1[CH2:2][c:3]2[cH:4][c:5]([...,products > 1
4,US20080318981A1;209;2008,Cl[C@H:1]([CH3:2])[c:3]1[n:4][cH:5][cH:6][cH:7...,template generation failure
5,US20080318981A1;222;2008,[C:1]([CH3:2])(=[O:3])[c:4]1[cH:5][c:6]([C:7]#...,template generation failure
6,US20080318981A1;228;2008,N[c:1]1[s:2][c:3]2[c:4]([n:5][c:6]([SH:17])[n:...,template generation failure
7,US20080318981A1;237;2008,Cl[C@H:20]([CH3:21])[c:22]1[cH:23][c:24]([C:25...,template generation failure
8,US20080318981A1;247;2008,N[c:1]1[s:2][c:3]2[c:4]([n:5][c:6]([SH:16])[n:...,template generation failure
9,US20080318981A1;279;2008,N[c:1]1[s:2][c:3]2[c:4]([n:5][c:6]([SH:17])[n:...,template generation failure
