In [1]:
import sys
from rdkit import Chem
from typing import List
import torch
import rdkit
import networkx as nx

In [2]:
def add(x, y):
    """Offset ``x`` by ``y``.

    A plain ``int`` is shifted directly; any other value is treated as an
    indexable pair and a 2-tuple with both members shifted is returned.
    """
    if type(x) is int:
        return x + y
    first, second = x[0], x[1]
    return (first + y, second + y)

def idxfunc(a):
    """Return the zero-based index stored in an atom's map number (map number minus one)."""
    map_num = a.GetAtomMapNum()
    return map_num - 1

def get_mol(smiles):
    """Parse ``smiles`` with RDKit and kekulize the result in place.

    Returns the parsed molecule, or ``None`` when ``Chem.MolFromSmiles``
    yields ``None``; kekulization is skipped in that case.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    Chem.Kekulize(parsed)
    return parsed

def get_smiles(mol):
    """Serialize ``mol`` to a SMILES string, requesting the Kekule form from RDKit."""
    kekule_smiles = Chem.MolToSmiles(mol, kekuleSmiles=True)
    return kekule_smiles

def set_atommap(mol, num=0):
    """Assign the same atom-map number to every atom of ``mol``.

    The molecule is modified in place and also returned so the call can be
    chained. ``num`` defaults to 0.
    """
    for each_atom in mol.GetAtoms():
        each_atom.SetAtomMapNum(num)
    return mol

def get_inter_label(mol, atoms, inter_atoms):
    """Extract the fragment for ``atoms`` and label its attachment atoms.

    Args:
        mol: molecule whose atom-map numbers encode ``original index + 1``
            (this is what ``idxfunc`` decodes).
        atoms: atom indices forming the fragment.
        inter_atoms: collection of atom indices shared with the neighbouring
            fragment.

    Returns:
        ``(fragment, labels)`` where ``labels`` is a list of
        ``(atom_index, smiles)`` pairs. On return the fragment's map numbers
        are 1 for atoms in ``inter_atoms`` and 0 otherwise (all 0 for a
        bond-free fragment).
    """
    fragment = get_clique_mol(mol, atoms)

    # A fragment without bonds is labelled by its plain SMILES, attached to
    # the first element of inter_atoms.
    if fragment.GetNumBonds() == 0:
        sole_atom = list(inter_atoms)[0]
        for frag_atom in fragment.GetAtoms():
            frag_atom.SetAtomMapNum(0)
        return fragment, [(sole_atom, Chem.MolToSmiles(fragment))]

    labels = []
    for frag_atom in fragment.GetAtoms():
        orig_idx = idxfunc(frag_atom)
        if orig_idx not in inter_atoms:
            continue
        # Only shared atoms with a neighbour outside the shared set are anchors.
        if is_anchor(frag_atom, inter_atoms):
            labels.append((orig_idx, get_anchor_smiles(fragment, orig_idx)))

    # Replace the index-carrying map numbers with a 0/1 "is shared" flag.
    for frag_atom in fragment.GetAtoms():
        shared = idxfunc(frag_atom) in inter_atoms
        frag_atom.SetAtomMapNum(1 if shared else 0)
    return fragment, labels

def get_clique_mol(mol, atoms):
    """Build a standalone molecule from the atoms ``atoms`` of ``mol``.

    The fragment is written as Kekule SMILES, re-parsed without
    sanitization, copied atom by atom via ``copy_edit_mol`` and finally
    passed through ``sanitize``. The result is whatever ``sanitize``
    returns, which is ``None`` when that step fails.
    """
    fragment_smiles = Chem.MolFragmentToSmiles(mol, atoms, kekuleSmiles=True)
    raw_fragment = Chem.MolFromSmiles(fragment_smiles, sanitize=False)
    rebuilt = copy_edit_mol(raw_fragment).GetMol()
    return sanitize(rebuilt)

def copy_edit_mol(mol):
    """Return an editable ``Chem.RWMol`` rebuilt from the atoms and bonds of ``mol``.

    Atoms are re-created with ``copy_atom`` in iteration order, so atom
    indices in the copy match those of ``mol``; each bond is re-added with
    its original bond type.
    """
    editable = Chem.RWMol(Chem.MolFromSmiles(''))
    for src_atom in mol.GetAtoms():
        editable.AddAtom(copy_atom(src_atom))

    for src_bond in mol.GetBonds():
        begin_idx = src_bond.GetBeginAtom().GetIdx()
        end_idx = src_bond.GetEndAtom().GetIdx()
        editable.AddBond(begin_idx, end_idx, src_bond.GetBondType())
    return editable

def copy_atom(atom, atommap=True):
    """Create a new ``Chem.Atom`` with the symbol and formal charge of ``atom``.

    The atom-map number is carried over as well unless ``atommap`` is false.
    """
    clone = Chem.Atom(atom.GetSymbol())
    clone.SetFormalCharge(atom.GetFormalCharge())
    if not atommap:
        return clone
    clone.SetAtomMapNum(atom.GetAtomMapNum())
    return clone


def sanitize(mol, kekulize=True):
    """Round-trip ``mol`` through SMILES to obtain a freshly parsed molecule.

    Args:
        mol: molecule to re-parse.
        kekulize: when true, use the Kekule helpers ``get_smiles``/``get_mol``;
            otherwise use plain ``Chem.MolToSmiles``/``Chem.MolFromSmiles``.

    Returns:
        The re-parsed molecule, or ``None`` if serialization or parsing
        raised (or if the parser itself returned ``None``).
    """
    try:
        if kekulize:
            return get_mol(get_smiles(mol))
        return Chem.MolFromSmiles(Chem.MolToSmiles(mol))
    except Exception:
        # Best effort: a molecule that cannot be round-tripped is reported as
        # None. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate instead of being silently swallowed.
        return None

def is_anchor(atom, inter_atoms):
    """Tell whether ``atom`` has at least one neighbour outside ``inter_atoms``.

    Neighbours are identified by the index ``idxfunc`` decodes from their
    atom-map number.
    """
    return any(idxfunc(nbr) not in inter_atoms for nbr in atom.GetNeighbors())

def get_anchor_smiles(mol, anchor, idxfunc=idxfunc):
    """Return the Kekule SMILES of ``mol`` with only the anchor atom map-numbered.

    Works on a copy of ``mol``: the atom whose ``idxfunc`` value equals
    ``anchor`` gets map number 1, every other atom gets 0. ``mol`` itself is
    left untouched.
    """
    marked = Chem.Mol(mol)
    for a in marked.GetAtoms():
        a.SetAtomMapNum(1 if idxfunc(a) == anchor else 0)
    return get_smiles(marked)

def get_assm_cands(mol, atoms, inter_label, cluster, inter_size):
    """Enumerate candidate attachment positions on ``cluster``.

    Args:
        mol: molecule whose atom-map numbers encode ``original index + 1``.
        atoms: atom indices of the already-built context; duplicates are
            dropped before the fragment is extracted.
        inter_label: sequence of ``(atom_index, label)`` pairs describing the
            true attachment atoms.
        cluster: atom indices of the cluster being attached to; it is sliced
            and concatenated, so it must be a list or tuple.
        inter_size: number of shared atoms.

    Returns:
        A list whose first element is the true attachment (``pos[0]`` when
        ``inter_size == 1``, otherwise the tuple ``pos``), followed by the
        alternative positions whose canonical ranks differ from the true one.

    NOTE(review): an empty ``inter_label`` makes the ``zip(*...)`` unpacking
    raise ValueError, and ``inter_size != 1`` with fewer than two entries in
    ``inter_label`` makes ``icls[1]`` raise IndexError. Confirm that callers
    never hit these cases.
    """
    atoms = list(set(atoms))
    mol = get_clique_mol(mol, atoms)
    # Remember each fragment atom's original index before the map numbers are cleared.
    atom_map = [idxfunc(atom) for atom in mol.GetAtoms()]
    mol = set_atommap(mol)
    # breakTies=False: presumably leaves symmetry-equivalent atoms with equal
    # ranks, which the filters below rely on. TODO confirm against RDKit docs.
    rank = Chem.CanonicalRankAtoms(mol, breakTies=False)
    # Re-key the ranks by original atom index.
    rank = { x:y for x,y in zip(atom_map, rank) }

    # pos: attachment atom indices; icls: their labels.
    pos, icls = zip(*inter_label)
    if inter_size == 1:
        # Single shared atom: every cluster atom whose rank differs from the true one is an alternative.
        # assumes every atom of `cluster` is present in `atoms`, otherwise rank[x] raises KeyError. TODO confirm
        cands = [pos[0]] + [ x for x in cluster if rank[x] != rank[pos[0]] ] 
    
    elif icls[0] == icls[1]: #symmetric case
        # Pair each cluster atom with the atom (inter_size - 1) positions further along (wrapping around).
        shift = cluster[inter_size - 1:] + cluster[:inter_size - 1]
        cands = zip(cluster, shift)
        # Both labels are equal, so orientation is ignored: rank pairs are compared after ordering each pair by atom index.
        cands = [pos] + [ (x,y) for x,y in cands if (rank[min(x,y)],rank[max(x,y)]) != (rank[min(pos)], rank[max(pos)]) ]
    else: 
        shift = cluster[inter_size - 1:] + cluster[:inter_size - 1]
        # Labels differ, so each pair is considered in both orientations.
        cands = zip(cluster + shift, shift + cluster)
        cands = [pos] + [ (x,y) for x,y in cands if (rank[x],rank[y]) != (rank[pos[0]], rank[pos[1]]) ]

    return cands

class MolGraph(object):
    """Atom-level graph plus cluster-tree decomposition of one molecule.

    Built from a SMILES string. After construction the instance exposes:
      * ``mol``       - the kekulized RDKit molecule returned by ``get_mol``
      * ``mol_graph`` - directed networkx graph over atoms with labelled nodes/edges
      * ``clusters``  - list of atom-index collections (bonds, rings, single-atom hubs)
      * ``atom_cls``  - for every atom, the indices of the clusters containing it
      * ``mol_tree``  - directed networkx tree over clusters, annotated by ``label_tree``
      * ``order``     - depth-first traversal of ``mol_tree`` as ``(x, y, flag)`` triples
    """

    # Bond types in label order: an edge label in mol_graph is the position of the bond's type in this list.
    BOND_LIST = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC] 
    # Not referenced anywhere in the visible code.
    MAX_POS = 20

    def __init__(self, smiles):
        """Parse ``smiles`` and run the full decomposition pipeline.

        The call order matters: ``tree_decomp`` appends to ``self.clusters``
        and ``label_tree`` rewrites ``self.mol_tree`` and some edge labels of
        ``self.mol_graph``.

        NOTE(review): ``get_mol`` returns None for an unparsable SMILES, which
        is not checked here.
        """
        self.smiles = smiles
        self.mol = get_mol(smiles)

        self.mol_graph = self.build_mol_graph()
        self.clusters, self.atom_cls = self.find_clusters()
        self.mol_tree = self.tree_decomp()
        self.order = self.label_tree()

    def find_clusters(self):
        """Split the molecule into bond clusters and ring clusters.

        Every non-ring bond becomes a two-atom cluster and every ring reported
        by ``Chem.GetSymmSSSR`` becomes one cluster. The first cluster that
        contains atom 0 is moved to the front of the list.

        Returns:
            ``(clusters, atom_cls)``: ``clusters`` is a list of atom-index
            tuples and ``atom_cls[a]`` lists the indices of the clusters that
            contain atom ``a``.

        NOTE(review): ``clusters[0]`` raises IndexError when no bond and no
        ring was collected (e.g. zero atoms, or several atoms without bonds).
        """
        mol = self.mol
        n_atoms = mol.GetNumAtoms()
        if n_atoms == 1: #special case
            return [(0,)], [[0]]

        clusters = []
        for bond in mol.GetBonds():
            a1 = bond.GetBeginAtom().GetIdx()
            a2 = bond.GetEndAtom().GetIdx()
            if not bond.IsInRing():
                clusters.append( (a1,a2) )

        ssr = [tuple(x) for x in Chem.GetSymmSSSR(mol)]
        clusters.extend(ssr)

        if 0 not in clusters[0]: #root is not node[0]
            for i,cls in enumerate(clusters):
                if 0 in cls:
                    # Move cluster i to the front while keeping the relative order of the others.
                    clusters = [clusters[i]] + clusters[:i] + clusters[i+1:]
                    #clusters[i], clusters[0] = clusters[0], clusters[i]
                    break

        atom_cls = [[] for i in range(n_atoms)]
        for i in range(len(clusters)):
            for atom in clusters[i]:
                atom_cls[atom].append(i)

        return clusters, atom_cls

    def tree_decomp(self):
        """Connect the clusters into a tree.

        For every atom shared by several clusters, either a new single-atom
        hub cluster is appended and linked to all of them (weight 100), or the
        clusters are linked pairwise with the size of their intersection as
        the weight. If the resulting graph has cycles, a maximum spanning tree
        is returned instead.

        Side effect: ``clusters`` below is the same list object as
        ``self.clusters``, so hub clusters are appended to it.
        ``self.atom_cls`` is not updated for those hubs.
        """
        clusters = self.clusters
        graph = nx.empty_graph( len(clusters) )
        for atom, nei_cls in enumerate(self.atom_cls):
            # Atoms belonging to a single cluster create no connection.
            if len(nei_cls) <= 1: continue
            bonds = [c for c in nei_cls if len(clusters[c]) == 2]
            rings = [c for c in nei_cls if len(clusters[c]) > 4] #need to change to 2

            # Atom in more than two clusters, at least two of them bonds: add a hub cluster for it.
            if len(nei_cls) > 2 and len(bonds) >= 2:
                clusters.append([atom])
                c2 = len(clusters) - 1
                graph.add_node(c2)
                for c1 in nei_cls:
                    graph.add_edge(c1, c2, weight = 100)

            elif len(rings) > 2: #Bee Hives, len(nei_cls) > 2 
                clusters.append([atom]) #temporary value, need to change
                c2 = len(clusters) - 1
                graph.add_node(c2)
                for c1 in nei_cls:
                    graph.add_edge(c1, c2, weight = 100)
            else:
                # Link every pair of clusters sharing this atom, weighted by the number of atoms they share.
                for i,c1 in enumerate(nei_cls):
                    for c2 in nei_cls[i + 1:]:
                        inter = set(clusters[c1]) & set(clusters[c2])
                        graph.add_edge(c1, c2, weight = len(inter))

        n, m = len(graph.nodes), len(graph.edges)
        assert n - m <= 1 #must be connected
        # n - m == 1 means the graph is returned unchanged; otherwise extra edges are pruned.
        return graph if n - m == 1 else nx.maximum_spanning_tree(graph)

    def label_tree(self):
        """Orient the cluster tree from node 0 and annotate nodes and edges.

        Steps:
          1. Depth-first traversal from cluster 0 recording parents, the
             previously visited siblings/ancestors of each node, the traversal
             order and per-edge position labels.
          2. For every cluster, store its SMILES, attachment labels and
             assembly candidates as node attributes of ``self.mol_tree``.
          3. For clusters whose parent has more than two atoms, tag the
             atom-graph edges leading from the child's own atoms to the shared
             atoms with the child's position label.

        Returns:
            The traversal order: ``(x, y, 1)`` when descending from x to y,
            ``(y, x, 0)`` when backtracking, ending with ``(0, None, 0)``.
        """
        def dfs(order, pa, prev_sib, x, fa):
            # Recursive traversal; x is the current node, fa its parent (-1 for the root).
            pa[x] = fa 
            sorted_child = sorted([ y for y in self.mol_tree[x] if y != fa ]) #better performance with fixed order
            for idx,y in enumerate(sorted_child):
                self.mol_tree[x][y]['label'] = 0 
                self.mol_tree[y][x]['label'] = idx + 1 #position encoding
                # Context of y: earlier siblings, then the parent x, then the grandparent if there is one.
                prev_sib[y] = sorted_child[:idx] 
                prev_sib[y] += [x, fa] if fa >= 0 else [x]
                order.append( (x,y,1) )
                dfs(order, pa, prev_sib, y, x)
                order.append( (y,x,0) )

        order, pa = [], {}
        # Convert to a directed graph so each direction of an edge can carry its own label.
        self.mol_tree = nx.DiGraph(self.mol_tree)
        prev_sib = [[] for i in range(len(self.clusters))]
        dfs(order, pa, prev_sib, 0, -1)

        order.append( (0, None, 0) ) #last backtrack at root
        
        # Fresh copy of the molecule whose map numbers encode atom index + 1 (decoded by idxfunc).
        mol = get_mol(self.smiles)
        for a in mol.GetAtoms():
            a.SetAtomMapNum( a.GetIdx() + 1 )

        tree = self.mol_tree
        for i,cls in enumerate(self.clusters):
            # Atoms shared with the parent cluster; the root uses atom 0.
            inter_atoms = set(cls) & set(self.clusters[pa[i]]) if pa[i] >= 0 else set([0])
            cmol, inter_label = get_inter_label(mol, cls, inter_atoms)
            # 'ismiles' is taken while cmol still carries the map numbers set by get_inter_label.
            tree.nodes[i]['ismiles'] = ismiles = get_smiles(cmol)
            tree.nodes[i]['inter_label'] = inter_label
            # set_atommap resets every map number of cmol to 0 before 'smiles' is taken.
            tree.nodes[i]['smiles'] = smiles = get_smiles(set_atommap(cmol))
            tree.nodes[i]['label'] = (smiles, ismiles) if len(cls) > 1 else (smiles, smiles)
            tree.nodes[i]['cluster'] = cls 
            tree.nodes[i]['assm_cands'] = []

            if pa[i] >= 0 and len(self.clusters[ pa[i] ]) > 2: #uncertainty occurs in assembly
                # All atoms of the clusters recorded as context for node i.
                hist = [a for c in prev_sib[i] for a in self.clusters[c]] 
                pa_cls = self.clusters[ pa[i] ]
                tree.nodes[i]['assm_cands'] = get_assm_cands(mol, hist, inter_label, pa_cls, len(inter_atoms)) 

                # Position label stored on the child -> parent tree edge by dfs.
                child_order = tree[i][pa[i]]['label']
                # Atoms of this cluster that are not part of the parent cluster.
                diff = set(cls) - set(pa_cls)
                for fa_atom in inter_atoms:
                    for ch_atom in self.mol_graph[fa_atom]:
                        if ch_atom in diff:
                            label = self.mol_graph[ch_atom][fa_atom]['label']
                            if type(label) is int: #in case one bond is assigned multiple times
                                self.mol_graph[ch_atom][fa_atom]['label'] = (label, child_order)
        return order
       
    def build_mol_graph(self):
        """Build the directed atom graph of ``self.mol``.

        Nodes are atom indices labelled ``(symbol, formal_charge)``; both
        directions of every bond are labelled with the index of the bond type
        in ``BOND_LIST``. ``list.index`` raises ValueError for a bond type
        that is not in ``BOND_LIST``.
        """
        mol = self.mol
        graph = nx.DiGraph(Chem.rdmolops.GetAdjacencyMatrix(mol))
        for atom in mol.GetAtoms():
            graph.nodes[atom.GetIdx()]['label'] = (atom.GetSymbol(), atom.GetFormalCharge())

        for bond in mol.GetBonds():
            a1 = bond.GetBeginAtom().GetIdx()
            a2 = bond.GetEndAtom().GetIdx()
            btype = MolGraph.BOND_LIST.index( bond.GetBondType() )
            graph[a1][a2]['label'] = btype
            graph[a2][a1]['label'] = btype

        return graph
    
    @staticmethod
    def tensorize(mol_batch, vocab, avocab):
        """Convert a batch of SMILES strings into batched graphs and tensors.

        Args:
            mol_batch: iterable of SMILES strings.
            vocab: lookup indexed with tree-node labels (``vocab[label]``).
            avocab: lookup indexed with atom-graph node labels.

        Returns:
            ``((tree_batchG, graph_batchG), (tree_tensors, graph_tensors), all_orders)``
            where ``tree_tensors`` is the first four tree tensors followed by
            the cluster table ``cgraph`` and the tree scope, and
            ``all_orders`` holds each molecule's traversal order shifted into
            batch-wide tree node ids.
        """
        mol_batch = [MolGraph(x) for x in mol_batch]
        tree_tensors, tree_batchG = MolGraph.tensorize_graph([x.mol_tree for x in mol_batch], vocab)
        graph_tensors, graph_batchG = MolGraph.tensorize_graph([x.mol_graph for x in mol_batch], avocab)
        # The last element of each tensor tuple is the list of (offset, size) scopes.
        tree_scope = tree_tensors[-1]
        graph_scope = graph_tensors[-1]

        max_cls_size = max( [len(c) for x in mol_batch for c in x.clusters] )
        # One row per tree node, zero-padded to the largest cluster; node ids start at 1, so row 0 stays all zeros.
        cgraph = torch.zeros(len(tree_batchG) + 1, max_cls_size).int()
        for v,attr in tree_batchG.nodes(data=True):
            bid = attr['batch_id']
            # Shift per-molecule atom indices into batch-wide atom ids.
            offset = graph_scope[bid][0]
            tree_batchG.nodes[v]['inter_label'] = inter_label = [(x + offset, y) for x,y in attr['inter_label']]
            tree_batchG.nodes[v]['cluster'] = cls = [x + offset for x in attr['cluster']]
            tree_batchG.nodes[v]['assm_cands'] = [add(x, offset) for x in attr['assm_cands']]
            cgraph[v, :len(cls)] = torch.IntTensor(cls)

        all_orders = []
        for i,hmol in enumerate(mol_batch):
            offset = tree_scope[i][0]
            # The final entry has None as its second element, so it is shifted separately.
            order = [(x + offset, y + offset, z) for x,y,z in hmol.order[:-1]] + [(hmol.order[-1][0] + offset, None, 0)]
            all_orders.append(order)

        tree_tensors = tree_tensors[:4] + (cgraph, tree_scope)
        return (tree_batchG, graph_batchG), (tree_tensors, graph_tensors), all_orders

    @staticmethod
    def tensorize_graph(graph_batch, vocab):
        """Merge a list of labelled directed graphs into index tensors.

        Nodes are renumbered consecutively across the batch starting at 1;
        index 0 of every table is a padding slot.

        Returns:
            ``((fnode, fmess, agraph, bgraph, scope), batched_graph)`` where
              * ``fnode[v]`` is ``vocab[label of v]``
              * ``fmess[e]`` is ``(u, v, label, second_label_or_0)`` for edge e
              * ``agraph[v]`` lists the ids of the edges entering v
              * ``bgraph[e]`` for edge (u, v) lists the ids of edges (w, u) with w != v
              * ``scope[i]`` is ``(first_node_id, node_count)`` of graph i

        NOTE(review): ``create_pad_tensor`` is not defined or imported in the
        visible source; it must be provided elsewhere for this method to run.
        """
        fnode,fmess = [None],[(0,0,0,0)] 
        agraph,bgraph = [[]], [[]] 
        scope = []
        edge_dict = {}
        all_G = []

        for bid,G in enumerate(graph_batch):
            offset = len(fnode)
            scope.append( (offset, len(G)) )
            G = nx.convert_node_labels_to_integers(G, first_label=offset)
            all_G.append(G)
            fnode.extend( [None for v in G.nodes] )

            for v, attr in G.nodes(data='label'):
                G.nodes[v]['batch_id'] = bid
                fnode[v] = vocab[attr]
                agraph.append([])

            for u, v, attr in G.edges(data='label'):
                # Tuple labels are the (bond, position) pairs written by label_tree; plain labels get 0 as the second field.
                if type(attr) is tuple:
                    fmess.append( (u, v, attr[0], attr[1]) )
                else:
                    fmess.append( (u, v, attr, 0) )
                # Edge ids are 1-based and shared across the whole batch.
                edge_dict[(u, v)] = eid = len(edge_dict) + 1
                G[u][v]['mess_idx'] = eid
                agraph[v].append(eid)
                bgraph.append([])

            for u, v in G.edges:
                eid = edge_dict[(u, v)]
                for w in G.predecessors(u):
                    # Skip the reverse edge (v, u).
                    if w == v: continue
                    bgraph[eid].append( edge_dict[(w, u)] )

        # Give the padding slot a valid value by copying the first real node's entry.
        fnode[0] = fnode[1]
        fnode = torch.IntTensor(fnode)
        fmess = torch.IntTensor(fmess)
        agraph = create_pad_tensor(agraph)
        bgraph = create_pad_tensor(bgraph)
        return (fnode, fmess, agraph, bgraph, scope), nx.union_all(all_G)

In [3]:
# def process(smiles_filepath: str) -> List[str]:
#     vocab = set()
#     with open(smiles_filepath, "r") as f:
#         for line in f.readlines():
#             s = line.strip("\n")
#             try:
#                 hmol = MolGraph(s)
#             except Chem.KekulizeException as e:
#                 continue
#             for node, attr in hmol.mol_tree.nodes(data=True):
#                 smiles = attr["smiles"]
#                 vocab.add(attr["label"])
#                 for i, s in attr["inter_label"]:
#                     vocab.add((smiles, s))
#     return vocab

In [4]:
# voc = process("../tests/data/test.smi")

In [5]:
# voc

In [6]:
# df

In [7]:
# lvoc = list(voc)
# fvoc = set()
# for i in range(len(voc)):
#     smi = lvoc[i][0]
#     fvoc.add(smi)

In [8]:

# pdb_blocks = pd.read_json("../data/processed/blocks_PDB_105.json")

In [9]:
# "O=c1nc(Cl)ccn1-c1[nH]ncc1C1CC=CCC1"  91, 29, 48, 95
# pdb_blocks.iloc[95]["block_smi"]

In [11]:
import pandas as pd

# Load the `smiles` column of the first 500 CSV rows and strip newline
# characters from both ends of every entry. `args` must be a tuple: the former
# `("\n")` was a plain string that only worked because it had one character.
zinc_smiles = pd.read_csv("../data/processed/250k_rndm_zinc_drugs_clean_3.csv", nrows=500).smiles.apply(str.strip, args=("\n",))
data = zinc_smiles.to_list()

In [12]:
data

['CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
 'C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1',
 'N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1',
 'CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1',
 'N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2',
 'CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br',
 'COc1ccc(C(=O)N(C)[C@@H](C)C/C(N)=N/O)cc1O',
 'O=C(Nc1nc[nH]n1)c1cccnc1Nc1cccc(F)c1',
 'Cc1c(/C=N/c2cc(Br)ccn2)c(O)n2c(nc3ccccc32)c1C#N',
 'C[C@@H]1CN(C(=O)c2cc(Br)cn2C)CC[C@H]1[NH3+]',
 'CCOc1ccc(OCC)c([C@H]2C(C#N)=C(N)N(c3ccccc3C(F)(F)F)C3=C2C(=O)CCC3)c1',
 'Cc1ccc2nc(S[C@H](C)C(=O)NC3CCC(C)CC3)n(C)c(=O)c2c1',
 'O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1',
 'Cc1ccccc1C(=O)N1CCC2(CC1)C[C@H](c1ccccc1)C(=O)N2C',
 'CCCc1cc(NC(=O)CN2C(=O)NC3(CCC(C)CC3)C2=O)n(C)n1',
 'CC(C)Cc1nc(SCC(=O)NC[C@@H]2CCCO2)c2c(=O)n(C)c(=O)n(C)c2n1',
 'Cc1ccc(CNC(=O)c2ccccc2NC(=O)[C@@H]2CC(=O)N(c3ccc(C)cc3)C2)cc1',
 'CCCCC(=O)NC(=S)Nc1ccccc1C(=O)N1CCOCC1',
 'Cc1c(NC(=O)CSc2nc3sc4c(c3c(=O)[nH]2)CCCC4)c

In [13]:
import sys
from multiprocessing import Pool


def process(data):
    """Collect the fragment vocabulary of an iterable of SMILES lines.

    Each line is stripped of surrounding carriage returns, newlines and
    spaces and decomposed with ``MolGraph``. Molecules that raise
    ``Chem.KekulizeException`` are reported on stdout and skipped.

    Returns:
        A set of 2-tuples: every tree node's ``label`` plus one
        ``(node smiles, attachment smiles)`` pair per ``inter_label`` entry.
    """
    vocab = set()
    for raw_line in data:
        s = raw_line.strip("\r\n ")
        try:
            hmol = MolGraph(s)
        except Chem.KekulizeException as e:
            print(f"Was not able to kekulize {s}, RDKit Message: {e}")
            continue
        for _, attr in hmol.mol_tree.nodes(data=True):
            smiles = attr['smiles']
            vocab.add(attr['label'])
            vocab.update((smiles, inter_smiles) for _, inter_smiles in attr['inter_label'])
    return vocab


class Argx:
    """Small options holder exposing a single ``ncpu`` attribute."""

    def __init__(self, ncpu: int = 4):
        """Store ``ncpu`` (default 4) on the instance."""
        self.ncpu = ncpu

args = Argx(4)

# Drop duplicate SMILES before splitting the work.
data = list(set(data))

# One batch per worker; the +1 keeps batch_size non-zero for short inputs.
batch_size = len(data) // args.ncpu + 1
batches = [data[i : i + batch_size] for i in range(0, len(data), batch_size)]

# The context manager terminates the worker processes once the map has
# finished, so the pool no longer leaks workers across cell re-runs.
with Pool(args.ncpu) as pool:
    vocab_list = pool.map(process, batches)

# Merge the per-batch vocabularies and de-duplicate across batches.
merged = {(x, y) for batch_vocab in vocab_list for x, y in batch_vocab}
vocab = list(merged)

for x,y in sorted(vocab):
    print(x, y)

[18:47:36] Can't kekulize mol.  Unkekulized atoms: 22


Was not able to kekulize Cc1ccccc1COc1ccc([C@@H]2C3=C(CCCC3=O)Nc3nnnn32)cc1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 22


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 16 17 19 20 21


Was not able to kekulize COc1ccccc1[C@@H](C)NC(=O)c1cnc2c(C)cccn2c1=O, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 16 17 19 20 21


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11


Was not able to kekulize O=C1O[C@H](C(=O)Nc2ccnc3ccnn23)Cc2ccccc21, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 8
[18:47:36] Can't kekulize mol.  Unkekulized atoms: 16


Was not able to kekulize CCCc1cc(=O)n2c(n1)SC[C@@H]2CC(=O)Nc1cccc(Cl)c1Cl, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 8
Was not able to kekulize Fc1ccc([C@@H]2C[C@@H](c3ccc(Br)cc3)Nc3ncnn32)cc1Br, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 16


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 26


Was not able to kekulize Cc1ccc2ncc(C(=O)Nc3ncccc3OCc3ccncc3)n2c1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 26


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 26 27 28 29 30


Was not able to kekulize CN(Cc1ccno1)Cc1c(C(=O)N2CC[NH+](C3CCCCC3)CC2)nc2ccccn12, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 26 27 28 29 30


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 3


Was not able to kekulize Cc1nc2n(n1)CCC[C@@H]2N[C@@H]1CCc2c(Cl)cc(Cl)cc21, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 3


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 3 4 5 7 27


Was not able to kekulize Cc1cc2nc(C)c(CCC(=O)NC[C@@H](c3ccccc3)N3CCOCC3)c(C)n2n1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 3 4 5 7 27


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 23


Was not able to kekulize CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 23


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 5


Was not able to kekulize CC(=O)Nc1c2n(c3ccccc13)C[C@](C)(C(=O)NC1CCCCC1)N(C1CCCCC1)C2=O, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 5


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 12


Was not able to kekulize C[C@@H]([C@@H](O)c1ccc2ncnn2c1)[N+](=O)[O-], RDKit Message: Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 12


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 10


Was not able to kekulize COc1ccc(N2CCn3c2nn(CC(N)=O)c(=O)c3=O)cc1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 10


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 11 13 14


Was not able to kekulize O=C(c1c(-c2ccccc2)nc2sc3c(n12)CCCC3)C(F)(F)F, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 11 13 14


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 21


Was not able to kekulize COc1ccc([C@H]2C(C(=O)NCc3ccccc3)=C(C)Nc3ncnn32)cc1OC, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 21


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 9


Was not able to kekulize CSc1ccc(/C=c2\sc3ncnn3c2=O)cc1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 9


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 6


Was not able to kekulize Cc1cc(=O)n2c(n1)SC[C@@H]2CC(=O)NCCC(C)C, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 6


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16


Was not able to kekulize C[C@H](OC(=O)c1nc(C2CC2)n2ccccc12)c1cnc2ccccc2c1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 20


Was not able to kekulize Cc1nc2ncnn2c(NCCOC2CCCCCC2)c1C, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 20


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 22 23 25


Was not able to kekulize Cc1nc(-n2cccc2)sc1C(=O)Nc1cccc(-c2cn3ccsc3n2)c1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 22 23 25


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 7 8 9 11 12 13 15 16 18


Was not able to kekulize CC[C@H](Sc1nnc2cc(C)c3cc(C)cc(C)c3n12)C(=O)Nc1nnc(COC)s1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 7 8 9 11 12 13 15 16 18


[18:47:36] Can't kekulize mol.  Unkekulized atoms: 5 6 10 11 12


Was not able to kekulize c1cc(-c2nc3c4cn[nH]c4ncn3n2)ccc1COc1ccc2c(c1)CCC2, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 5 6 10 11 12


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 17 18 19 26 31


Was not able to kekulize O=C(c1cccs1)N(Cc1ccc(F)cc1)Cc1cc(-c2ccccc2)cn2nnnc12, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 17 18 19 26 31


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 11


Was not able to kekulize COc1cccc(CN2CCc3nnc(CCc4ccccc4)n3CC2)c1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 11


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 20


Was not able to kekulize CNC(=O)[C@H](C)CN(C)Cc1cc(=O)n2cccc(C)c2[nH+]1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 20


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 9 10 12 13 22


Was not able to kekulize CCCCNC(=O)CCc1c(C)nc2c3ccccc3nn2c1C, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 9 10 12 13 22


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15


Was not able to kekulize COC(=O)CCN(Cc1cnc2ncccn12)C1CCOCC1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 1 2 12 15 23


Was not able to kekulize Cc1c(/C=N/c2cc(Br)ccn2)c(O)n2c(nc3ccccc32)c1C#N, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 1 2 12 15 23


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 14


Was not able to kekulize Cc1ccc(CNC(=O)NCc2nnc3n2CCC3)s1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 14


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 26


Was not able to kekulize COc1ccc([C@H]2C[C@@H](C(F)(F)F)n3nc(C(=O)NC4CCCCC4)cc3N2)cc1OC, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 26


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 3 26 31


Was not able to kekulize Cc1cc2n(C[C@H](O)CO[C@H](c3ccccc3)c3ccccc3C)c(=O)c3ccccc3n2n1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 3 26 31


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 4


Was not able to kekulize CCc1nc2n(n1)CCC[C@H]2NC(=O)c1ccc(-n2cc(C)cn2)cc1, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 4


[18:47:37] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 23


Was not able to kekulize Cc1cccn2c(=O)c(C(=O)NC[C@H]3CCO[C@@H]3C(C)C)cnc12, RDKit Message: Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 23
C C
C#C C#[CH:1]
C#N C#[N:1]
C#N N#[CH:1]
C1=CC=CC=C1 C1=CC=[CH:1]C=C1
C1=CC=CC=C1 C1=CC=[CH:1][CH:1]=C1
C1=CC=NC=C1 C1=CC=[CH:1]N=C1
C1=CC=NC=C1 C1=CN=C[CH:1]=C1
C1=CC=NC=C1 C1=CN=[CH:1]C=C1
C1=CC=NC=C1 C1=CN=[CH:1][CH:1]=C1
C1=CC=NC=C1 C1=C[CH:1]=CN=C1
C1=CC=NC=C1 C1=NC=C[CH:1]=C1
C1=CC=NN=C1 C1=C[CH:1]=NN=C1
C1=CC=[NH+]C=C1 C1=CC=[CH:1][NH+]=C1
C1=CC=[NH+]C=C1 C1=[NH+]C=C[CH:1]=C1
C1=CCC=C1 C1=CC[CH:1]=C1
C1=CCC=C1 C1=C[CH:1]=CC1
C1=CCC=C1 C1=C[CH:1]=[CH:1]C1
C1=CCCC1 C1=CC[CH2:1]C1
C1=CCCC1 C1=C[CH2:1]CC1
C1=CCCC1 C1=C[CH2:1][CH2:1][CH2:1]1
C1=CCCC1 C1=[CH:1]CCC1
C1=CCCC1 C1C[CH:1]=[CH:1]C1
C1=CCCC=C1 C1=CCC[CH:1]=C1
C1=CCCC=C1 C1=CC[CH2:1]C=C1
C1=CCCC=C1 C1=C[CH2:1][CH2:1]C=C1
C1=CCCC=C1 C1=C[CH:1]=CCC1
C1=CCCC=C1 C1=C[CH:1]=[CH:1]CC1
C1=CCCCC1 C1=CC[CH2:1]CC1
C1=CCCCC1 C1=CC[CH2:1][CH2:1]C1
C1=CCCCC1 C1=C[CH2:1]CCC1
C1=CCCCC1 C1=[CH:1]CCCC1

In [16]:
len(vocab)

284

In [22]:
# Distinct first components of the vocabulary entries.
us = list({entry[0] for entry in vocab})

In [24]:
len(us)

132