In [1]:
import gzip
import logging
from pathlib import Path
from typing import List, Optional

import pandas as pd
from rdkit import Chem
from tqdm import tqdm
from joblib import Parallel, delayed


from rxitect.chem import mol_utils
from rxitect.structs.vocabulary import SmilesVocabulary

# Logger for this module/notebook, named after the current module.
logger = logging.getLogger(__name__)

# Token-count window used when building the corpus: a tokenized SMILES is kept
# only if MIN_TOKEN_LEN < number of tokens <= MAX_TOKEN_LEN.
MIN_TOKEN_LEN, MAX_TOKEN_LEN = 10, 100

# Number of compounds per ChEMBL release; used as the progress-bar total while
# reading the SDF dump.
N_CHEMBL_COMPOUNDS = dict(v30=2_157_379)


def generate_smiles_corpus(
    raw_data_filepath: Path,
    destdir: Path,
    corpus_type: str,
    requires_clean: bool = True,
    is_isomeric: bool = False,
    create_voc_file: bool = False,
):
    r"""Build a tokenized SMILES corpus (and optionally a vocabulary file) from raw data.

    Every molecule is tokenized with ``SmilesVocabulary.tokenize``. Molecules without a
    carbon token, molecules containing a ``[Na]`` or ``[Zn]`` token, and molecules whose
    token count falls outside ``MIN_TOKEN_LEN < n <= MAX_TOKEN_LEN`` are dropped. The
    survivors are written to ``destdir / "smiles_{corpus_type}_corpus.txt"`` as a
    tab-delimited file.

    Args:
        raw_data_filepath (Path): Input file. A gzipped .sdf file for 'chembl', or a
                tab-delimited file with a ``Smiles`` column for 'ligand'.
        destdir (Path): Directory the output files are written to. It is created if it
                does not exist yet.
        corpus_type (str): Type of the file to be processed and written. Can be 'chembl' or 'ligand'.
        requires_clean (bool): If True, every molecule is passed through
                ``mol_utils.clean_mol`` (charged metals removed, only the largest fragment
                kept) and canonicalized; molecules that fail are logged and skipped, and
                duplicates are removed.
        is_isomeric (bool): If the molecules in the dataset keep conformational information. If not,
                the conformational tokens (e.g. @@, @, \, /) will be removed. False by default.
        create_voc_file (bool): If True, also write the sorted set of all tokens seen in the
                kept molecules to ``destdir / "smiles_voc.txt"``, one token per line.

    Raises:
        ValueError: If ``corpus_type`` is neither 'chembl' nor 'ligand'.
    """
    if corpus_type == "chembl":
        raw_smiles = _get_mols_from_sdf(is_isomeric, raw_data_filepath)
    elif corpus_type == "ligand":
        raw_smiles = pd.read_table(raw_data_filepath).Smiles.dropna()
    else:
        raise ValueError("Only valid corpus types are 'chembl' and 'ligand'.")

    # Accept str paths as well and make sure the output directory exists before writing.
    destdir = Path(destdir)
    destdir.mkdir(parents=True, exist_ok=True)

    voc = SmilesVocabulary(vocabulary_file_path=None)
    words = set()
    canons = []
    tokens = []
    if requires_clean:
        unique_smiles = set()
        for smile in tqdm(raw_smiles, desc="Cleaning molecules"):
            try:
                cleaned = mol_utils.clean_mol(smile, is_isomeric=is_isomeric)
                unique_smiles.add(Chem.CanonSmiles(cleaned))
            except Exception as e:
                # Best effort: one unparsable molecule must not abort the whole corpus.
                logger.warning("Parsing Error: %s", e)
        # Set iteration order varies between runs; sort so the corpus is reproducible.
        smiles = sorted(unique_smiles)
    else:
        # Works for both the list returned by the 'chembl' path and the pandas
        # Series returned by the 'ligand' path.
        smiles = list(raw_smiles)

    for smile in tqdm(smiles, desc="Tokenizing SMILES"):
        token = voc.tokenize(smile)
        # Only collect the organic molecules
        if {"C", "c"}.isdisjoint(token):
            logger.warning("Non-organic token detected: %s", smile)
            continue
        # Remove the metal tokens
        if not {"[Na]", "[Zn]"}.isdisjoint(token):
            logger.warning("Metal token detected: %s", smile)
            continue
        # control the minimum and maximum of sequence length.
        if MIN_TOKEN_LEN < len(token) <= MAX_TOKEN_LEN:
            words.update(token)
            canons.append(smile)
            tokens.append(" ".join(token))

    if create_voc_file:
        # output the vocabulary file
        with open(destdir / "smiles_voc.txt", "w") as voc_file:
            voc_file.write("\n".join(sorted(words)))

    outfile = destdir / f"smiles_{corpus_type}_corpus.txt"
    _write_corpus(canon_smiles=canons, outfile=outfile, tokens=tokens)


def _write_corpus(canon_smiles, outfile, tokens):
    """Output the dataset file as tab-delimited file"""
    corpus_df = pd.DataFrame()
    corpus_df["smiles"] = canon_smiles
    corpus_df["token"] = tokens
    corpus_df.drop_duplicates(subset="smiles")
    corpus_df.to_csv(path_or_buf=outfile, sep="\t", index=False)


def _get_mols_from_sdf(is_isomeric: bool, raw_data_filepath: Path, chembl_version: Optional[str] = "v30") -> List[str]:
    """Handle sdf file with RDkit"""
    inf = gzip.open(raw_data_filepath)
    fsuppl = Chem.ForwardSDMolSupplier(inf)
    smiles = []
    total_mols = N_CHEMBL_COMPOUNDS.get(chembl_version) if chembl_version else None
    for mol in tqdm(fsuppl, total=total_mols, desc="Processing ChEMBL molecules"):
        try:
            smiles.append(Chem.MolToSmiles(mol, is_isomeric))
        except Exception as e:
            logger.warning(f"Was not able to convert {mol} to smiles: {e}")
    return smiles


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse the gzipped ChEMBL SDF once. Despite its name, `df` is a plain list of
# SMILES strings (the return value of _get_mols_from_sdf), not a DataFrame.
df = _get_mols_from_sdf(raw_data_filepath="../data/raw/chembl_30.sdf.gz", is_isomeric=False)

Processing ChEMBL molecules:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 2136187/2157379 [09:31<00:05, 3739.12it/s]


In [3]:
# len(df) == 2_136_187: SMILES obtained from the SDF, versus the 2_157_379 total assumed for ChEMBL v30.

In [6]:
# Persist the raw SMILES, one per line. Path.write_text opens, writes and closes
# the file (the bare open(...).write(...) left the handle to the garbage
# collector) and still returns the number of characters written.
Path("../data/processed/chembl_30_smiles.txt").write_text("\n".join(df))

115060425

In [10]:
%%time
smiles = set()
for smile in tqdm(df[:1000], desc="Cleaning molecules"):
    try:
        smile = mol_utils.clean_mol(smile)
        smiles.add(Chem.CanonSmiles(smile))
    except Exception as e:
        print("Parsing Error: ", e)

Cleaning molecules: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 320.27it/s]

CPU times: user 3.12 s, sys: 12 ms, total: 3.13 s
Wall time: 3.12 s





In [12]:
# Same cleaning loop, but the input is deduplicated first so every distinct
# raw SMILES is processed only once. Results are added to the existing `smiles` set.
df_set = set(df[:1000])
for raw_smi in tqdm(df_set, desc="Cleaning molecules"):
    try:
        cleaned_smi = mol_utils.clean_mol(raw_smi)
        canonical_smi = Chem.CanonSmiles(cleaned_smi)
        smiles.add(canonical_smi)
    except Exception as err:
        print("Parsing Error: ", err)

Cleaning molecules: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 992/992 [00:03<00:00, 324.89it/s]


In [30]:
%%time
smiles = set()
for smile in df[:1000]:
    try:
        smile = mol_utils.clean_mol(smile)
        smiles.add(Chem.CanonSmiles(smile))
    except Exception as e:
        print("Parsing Error: ", e)

CPU times: user 3.02 s, sys: 0 ns, total: 3.02 s
Wall time: 3.02 s


In [24]:
def clean_and_canonalize(smiles: str) -> Optional[str]:
    """Clean a SMILES string and return its canonical form.

    The input is passed through ``mol_utils.clean_mol`` and the result through
    ``Chem.CanonSmiles``.

    Args:
        smiles (str): SMILES string to process.

    Returns:
        Optional[str]: The canonical SMILES, or None when cleaning or
        canonicalisation raised. The error is printed and not re-raised, so
        callers must be prepared to filter out None.
    """
    try:
        return Chem.CanonSmiles(mol_utils.clean_mol(smiles))
    except Exception as e:
        # Best effort: report the failure and signal it with None so that a
        # single bad molecule does not abort a batch (e.g. a joblib run).
        print("Parsing Error: ", e)
        return None

In [33]:
%%time
clean_smiles = set(Parallel(n_jobs=-1)(delayed(clean_and_canonalize)(smi) for smi in df[:1000]))

CPU times: user 426 ms, sys: 208 ms, total: 634 ms
Wall time: 908 ms


In [28]:
# Number of distinct entries left after deduplication (this count would
# include None if any molecule had failed to clean).
len(clean_smiles)

992

In [29]:
%%time
clean_smiles = set((clean_and_canonalize(smi) for smi in df[:1000]))

CPU times: user 3.02 s, sys: 0 ns, total: 3.02 s
Wall time: 3.02 s


In [34]:
# Vocabulary object; the cells below use its tokenize() method.
sv = SmilesVocabulary()

In [37]:
# Tokenize every cleaned SMILES; `tks` holds one token sequence per molecule.
tks = list(map(sv.tokenize, clean_smiles))

In [43]:
%%time

words = set()
canon_smiles = []
tokenized_smiles = []
for smi in tqdm(df, desc="Tokenizing SMILES"):
    tokens = sv.tokenize(smi)
    # Only collect the organic molecules
    if {"C", "c"}.isdisjoint(tokens):
        print("Non-organic token detected: ", smi)
        continue
    # Remove the metal tokens
    if not {"[Na]", "[Zn]"}.isdisjoint(tokens):
        print("Metal token detected: ", smi)
        continue
    # control the minimum and maximum of sequence length.
    if 10 < len(tokens) <= 100:
        words.update(tokens)
        canon_smiles.append(smi)
        tokenized_smiles.append(" ".join(tokens))


Tokenizing SMILES:   2%|██▎                                                                                                                     | 41903/2136187 [00:00<00:09, 212092.24it/s]

Non-organic token detected:  O=S(=O)(O)NS(=O)(=O)O
Non-organic token detected:  OB(O)O
Non-organic token detected:  F[Ag]


Tokenizing SMILES:   4%|████▉                                                                                                                   | 86810/2136187 [00:00<00:09, 220585.43it/s]

Non-organic token detected:  O=O.[O-][Cl+]O.[O-][Cl+]O.[O-][Cl+]O.[O-][Cl+]O
Non-organic token detected:  NS(=O)(=O)O
Non-organic token detected:  [Cl-].[Li+]
Non-organic token detected:  OO
Non-organic token detected:  [N-]=[N+]=[N-]
Non-organic token detected:  [N-]=[N+]=[N-].[Na+]
Non-organic token detected:  O=N[O-].[Na+]
Non-organic token detected:  NN


Tokenizing SMILES:   6%|███████▎                                                                                                               | 130811/2136187 [00:00<00:09, 218615.17it/s]

Non-organic token detected:  [I-].[K+]
Non-organic token detected:  O=[Se]([O-])[O-].[Na+].[Na+]
Non-organic token detected:  O=S(=O)([O-])OS(=O)(=O)[O-].[Na+].[Na+]
Non-organic token detected:  O=[Se](=O)([O-])[O-].[Na+].[Na+]
Non-organic token detected:  [Ra]
Non-organic token detected:  NP(=O)(O)O
Metal token detected:  Nc1c(N=Nc2ccc(-c3ccccc3OCCC[Na])nc2)cc(S(=O)(=O)O)c2ccccc12
Non-organic token detected:  [Cs]
Metal token detected:  C=CCCCCCCCCC(=O)O[Zn]OC(=O)CCCCCCCCC=C
Metal token detected:  Nc1ccc(S(=O)(=O)O[Zn]OS(=O)(=O)c2ccc(N)cc2)cc1
Non-organic token detected:  N#N
Metal token detected:  CC(=O)OCC1=C2C(=O)O[Zn]OC(=O)C(N)CCCC(=O)NC3C(=O)N2C3SC1


Tokenizing SMILES:   9%|██████████▉                                                                                                            | 196790/2136187 [00:00<00:08, 217958.62it/s]

Non-organic token detected:  F[P-](F)(F)(F)(F)F.[Na+]
Non-organic token detected:  [Ag+].[Ag+].[S-2]
Metal token detected:  OC1COc2ccccc2CNCCNCCNCc2ccccc2OC1.[O-][Cl+3]([O-])([O-])O[Zn]O[Cl+3]([O-])([O-])[O-]
Non-organic token detected:  O=[N+]([O-])[O-].[Ag+]
Non-organic token detected:  F[Al-3](F)(F)(F)(F)F
Non-organic token detected:  [Cl-].[Rb+]
Non-organic token detected:  O=[N+]([O-])[O-]
Non-organic token detected:  [N-]=[N+]=N
Non-organic token detected:  [Ag+].[Cl-]
Non-organic token detected:  NS(=O)(=O)NO


Tokenizing SMILES:  11%|█████████████▎                                                                                                         | 240074/2136187 [00:01<00:08, 213481.80it/s]

Non-organic token detected:  O.O=[Mg].O=[Mg].O=[Mg].O=[Si]=O.O=[Si]=O.O=[Si]=O.O=[Si]=O
Metal token detected:  O=C(CCCCC1CCS[Zn]S1)NC(Cc1c[nH]cn1)C(=O)[O-].[Na+]
Non-organic token detected:  [Ag+].[OH-]
Non-organic token detected:  [Ag+].[Ag+].[Se-2]
Non-organic token detected:  NS(=O)(=O)[O-].[Na+]
Non-organic token detected:  O=S(=O)([O-])[O-].[Na+].[Na+]
Non-organic token detected:  [F-].[Na+]


Tokenizing SMILES:  14%|████████████████▉                                                                                                      | 303975/2136187 [00:01<00:08, 210885.62it/s]

Non-organic token detected:  [Cl-].[Cs+]
Non-organic token detected:  O=P(O)(O)NP(=O)(O)O
Non-organic token detected:  O=BOB([O-])OB([O-])OB=O.[Na+].[Na+]
Non-organic token detected:  [N-]=[N+]=N.[Na+]


Tokenizing SMILES:  16%|███████████████████▎                                                                                                   | 346259/2136187 [00:01<00:08, 210154.11it/s]

Non-organic token detected:  ClP1(Cl)=NP(Cl)(Cl)=NP(Cl)(Cl)=NP(Cl)(Cl)=N1
Non-organic token detected:  O=P([O-])([O-])O.[Na+].[Na+]
Non-organic token detected:  O=P(O)(O)O
Non-organic token detected:  NP(N)(=O)O
Non-organic token detected:  O=P([O-])([O-])[O-].[Ag+].[Ag+].[Ag+]
Non-organic token detected:  NS(N)(=O)=O


Tokenizing SMILES:  18%|█████████████████████▋                                                                                                 | 388784/2136187 [00:01<00:08, 210570.42it/s]

Non-organic token detected:  O=P([O-])(O)O.[Na+]
Non-organic token detected:  O=P([O-])([O-])[O-].[Na+].[Na+].[Na+]
Metal token detected:  [O-][Cl+3]([O-])([O-])O[Zn]O[Cl+3]([O-])([O-])[O-].c1ccc2c(c1)CNCCNCCNCc1ccccc1OCC(Oc1ccc3ccccc3c1)CO2
Non-organic token detected:  O=[N+]([O-])O
Non-organic token detected:  [O-][N+](O)=NO
Metal token detected:  COC1COc2ccccc2CNCCNCCNCc2ccccc2OC1.[O-][Cl+3]([O-])([O-])O[Zn]O[Cl+3]([O-])([O-])[O-]


Tokenizing SMILES:  20%|███████████████████████▉                                                                                               | 430632/2136187 [00:02<00:08, 206992.03it/s]

Non-organic token detected:  O=[PH]([O-])O.[K+]
Non-organic token detected:  O=S(=O)([O-])F.[Na+]
Non-organic token detected:  O=P([O-])([O-])OP(=O)([O-])[O-].[Na+].[Na+].[Na+].[Na+]
Non-organic token detected:  O=S(=O)(O)OS(=O)(=O)O


Tokenizing SMILES:  23%|███████████████████████████▌                                                                                           | 493794/2136187 [00:02<00:07, 209440.75it/s]

Non-organic token detected:  O=[SH](=O)O
Non-organic token detected:  O=S(O)S(=O)O
Non-organic token detected:  O=NOO
Metal token detected:  NS(=O)(=O)c1ccc(CCNC(=O)CN(CCN(CCN(CC(=O)O)CC(=O)NCCc2ccc(S(N)(=O)=O)cc2)CC(=O)O)CC(=O)O)cc1.[Zn]
Non-organic token detected:  [Al]#P
Metal token detected:  NS(=O)(=O)c1ccc(CNC(=O)CN(CCOCCOCCN(CC(=O)O)CC(=O)NCc2ccc(S(N)(=O)=O)cc2)CC(=O)O)cc1.[Zn]
Non-organic token detected:  [He]
Non-organic token detected:  O=S(=O)([O-])NS(=O)(=O)[O-].[Na+].[Na+]
Non-organic token detected:  FS(F)(F)(F)(F)F
Non-organic token detected:  O=[Te](=O)([O-])[O-].[Na+].[Na+]
Metal token detected:  Cn1nc(S(N)(=O)=O)sc1=NC(=O)CN(CCN(CC(=O)O)CC(=O)N=c1sc(S(N)(=O)=O)nn1C)CC(=O)O.[Zn]
Metal token detected:  NS(=O)(=O)c1nnc(NC(=O)CN(CCN(CC(=O)O)CC(=O)Nc2nnc(S(N)(=O)=O)s2)CC(=O)O)s1.[Zn]
Metal token detected:  NS(=O)(=O)c1ccc(CNC(=O)CN(CCN(CC(=O)O)CC(=O)NCc2ccc(S(N)(=O)=O)cc2)CC(=O)O)cc1.[Zn]
Metal token detected:  NS(=O)(=O)c1ccc(NC(=O)CN(CCN(CC(=O)O)CC(=O)Nc2ccc(S(N)(=O)=O)c

Tokenizing SMILES:  25%|█████████████████████████████▊                                                                                         | 535518/2136187 [00:02<00:07, 207201.55it/s]

Non-organic token detected:  OP1(O)=NP(O)(O)=NP(O)(O)=NP(O)(O)=N1
Metal token detected:  [O-][Cl+3]([O-])([O-])O[Zn]O[Cl+3]([O-])([O-])[O-].c1ccc(OC2COc3ccccc3CNCCNCCNCc3ccccc3OC2)cc1
Non-organic token detected:  [O-][N+]([O-])=NO
Non-organic token detected:  O=S(=O)(O)F
Non-organic token detected:  O=S(=O)(O)OOS(=O)(=O)O
Metal token detected:  [O-][Cl+3]([O-])([O-])O.[O-][Cl+3]([O-])([O-])O.[Zn].c1ccc2c(-n3cc(CN4CCCNCCN(Cc5cn(-c6cccc7ccccc67)nn5)CCCNCC4)nn3)cccc2c1


Tokenizing SMILES:  28%|█████████████████████████████████▎                                                                                     | 598688/2136187 [00:02<00:07, 209189.53it/s]

Non-organic token detected:  Cl.NO
Non-organic token detected:  Cl.Cl.NN
Non-organic token detected:  [Xe]
Non-organic token detected:  O=[Al]O[Al]=O
Non-organic token detected:  O=S(=O)(O)S


Tokenizing SMILES:  32%|██████████████████████████████████████                                                                                 | 683629/2136187 [00:03<00:06, 209134.23it/s]

Non-organic token detected:  O=S(=O)(O)O
Non-organic token detected:  [Se-2]


Tokenizing SMILES:  38%|████████████████████████████████████████████▉                                                                          | 807551/2136187 [00:03<00:06, 205082.63it/s]

Non-organic token detected:  O
Non-organic token detected:  O=NO
Non-organic token detected:  O=P(O)(O)OP(=O)(O)O
Non-organic token detected:  O=S(O)O
Non-organic token detected:  N
Non-organic token detected:  [O-][Cl+3]([O-])([O-])O
Non-organic token detected:  [O-][Br+2]([O-])O
Non-organic token detected:  [O-][I+2]([O-])O
Non-organic token detected:  [O-][I+3]([O-])([O-])O
Non-organic token detected:  O=P(O)(O)OO
Non-organic token detected:  NO
Non-organic token detected:  O=P([O-])([O-])O.[K+].[K+]
Non-organic token detected:  II
Non-organic token detected:  [Al+3].[OH-].[OH-].[OH-]
Non-organic token detected:  [Cl-].[Na+]
Non-organic token detected:  [Zn]
Non-organic token detected:  [Mg+2].[OH-].[OH-]
Non-organic token detected:  [K+].[O-][Cl+3]([O-])([O-])[O-]
Non-organic token detected:  [I-].[Na+]
Non-organic token detected:  [I-].[Na+]
Non-organic token detected:  [Ca+2].[Cl-].[Cl-]
Non-organic token detected:  [Cl-].[Cl-].[Zn+2]
Non-organic token detected:  [Xe]
Non-organic

Tokenizing SMILES:  40%|███████████████████████████████████████████████▎                                                                       | 850101/2136187 [00:04<00:06, 195618.44it/s]

Non-organic token detected:  [K]
Non-organic token detected:  O=S(=O)([O-])OOS(=O)(=O)[O-].[Na+].[Na+]
Non-organic token detected:  [O-][Cl+2]([O-])O
Metal token detected:  CN(C)c1ccc2nc3ccc(=[N+](C)C)cc-3oc2c1.Cl[Zn]Cl.[Cl-]


Tokenizing SMILES:  43%|██████████████████████████████████████████████████▋                                                                    | 910466/2136187 [00:04<00:06, 197259.42it/s]

Non-organic token detected:  Br
Non-organic token detected:  I
Non-organic token detected:  [Ag+]
Non-organic token detected:  O=[PH](O)O
Non-organic token detected:  Cl
Non-organic token detected:  O=P(O)(O)OP(=O)(O)OP(=O)(O)O
Non-organic token detected:  [H]O[H]
Non-organic token detected:  F
Non-organic token detected:  O[As](O)O
Non-organic token detected:  [C-]#[O+]
Non-organic token detected:  O=S=O
Non-organic token detected:  [SeH2]
Non-organic token detected:  [AsH3]
Non-organic token detected:  [Kr]
Non-organic token detected:  O=O
Non-organic token detected:  N#[N+][O-]
Non-organic token detected:  [Zn+2]
Non-organic token detected:  [Li+]
Non-organic token detected:  [Xe]
Non-organic token detected:  N=O
Non-organic token detected:  SSS
Non-organic token detected:  [Na+].[O-]Cl
Non-organic token detected:  N.N.O=NN(O)S(=O)(=O)O


Tokenizing SMILES:  45%|█████████████████████████████████████████████████████▎                                                                 | 956044/2136187 [00:04<00:05, 212769.85it/s]

Non-organic token detected:  [Na+].[O-][Br+2]([O-])[O-]


Tokenizing SMILES:  49%|█████████████████████████████████████████████████████████▉                                                            | 1048934/2136187 [00:04<00:04, 228094.41it/s]

Non-organic token detected:  ClP(Cl)(Cl)(Cl)Cl


Tokenizing SMILES:  52%|█████████████████████████████████████████████████████████████▊                                                        | 1119245/2136187 [00:05<00:04, 232175.81it/s]

Non-organic token detected:  O=[N+]([O-])[O-].[NH4+]


Tokenizing SMILES:  56%|█████████████████████████████████████████████████████████████████▋                                                    | 1189043/2136187 [00:05<00:04, 231985.64it/s]

Non-organic token detected:  O.O.O.O.O.O.O.O.O.O.O=S(=O)([O-])[O-].[Na+].[Na+]
Non-organic token detected:  [Na+].[O-][Cl+2]([O-])[O-]
Non-organic token detected:  [F-].[K+]
Non-organic token detected:  [Na+].[Na+].[O-]N=[N+]([O-])[O-]
Non-organic token detected:  [I-].[Na+]
Non-organic token detected:  O=S(=O)([O-])ONOS(=O)(=O)[O-].[Na+].[Na+]


Tokenizing SMILES:  58%|████████████████████████████████████████████████████████████████████▏                                                 | 1234977/2136187 [00:05<00:03, 226142.21it/s]

Non-organic token detected:  [Br-].[K+]
Non-organic token detected:  [Br-].[Na+]
Metal token detected:  CN(C)c1ccc2nc3ccc(=[N+](C)C)cc-3oc2c1.Cl[Zn]Cl
Non-organic token detected:  [Na+].[O-][Cl+3]([O-])([O-])[O-]
Non-organic token detected:  O=[N+]([O-])[O-].[Na+]
Non-organic token detected:  [C-]#N.[Na+]
Non-organic token detected:  [Na+].[SH-]
Non-organic token detected:  OCl
Non-organic token detected:  O=P1(O)OP(=O)(O)OP(=O)(O)O1
Non-organic token detected:  O=S([O-])O.[Na+]
Non-organic token detected:  O=[N+]([O-])[O-].[K+]
Non-organic token detected:  [OH]


Tokenizing SMILES:  61%|███████████████████████████████████████████████████████████████████████▉                                              | 1301792/2136187 [00:06<00:03, 219335.72it/s]

Non-organic token detected:  O=S(=O)(O)ONOS(=O)(=O)O
Non-organic token detected:  F[B-](F)(F)F
Non-organic token detected:  NO.NO.O=S(=O)(O)O
Non-organic token detected:  Cl[Al](Cl)Cl.O.O.O.O.O.O
Non-organic token detected:  O=P([O-])([O-])[O-].[Al+3]
Non-organic token detected:  N.[O-][Cl+3]([O-])([O-])O
Non-organic token detected:  F[B-](F)(F)F.[Na+]
Non-organic token detected:  N=S(=O)([O-])[O-].[Na+].[Na+]
Non-organic token detected:  F
Non-organic token detected:  I


Tokenizing SMILES:  63%|██████████████████████████████████████████████████████████████████████████▎                                           | 1345338/2136187 [00:06<00:03, 212639.14it/s]

Non-organic token detected:  O=[As]O
Non-organic token detected:  [Na+].[O-][Cl+][O-]
Non-organic token detected:  NN.O=S(=O)(O)O
Non-organic token detected:  O.O.O.O.O.O.[Cl-].[Cl-].[Sr+2]
Non-organic token detected:  O=[As][O-].[Na+]
Non-organic token detected:  [O-][Cl+]O
Non-organic token detected:  O=P([O-])([O-])F.[Na+].[Na+]
Non-organic token detected:  O=S([O-])[O-].[Na+].[Na+]


Tokenizing SMILES:  66%|█████████████████████████████████████████████████████████████████████████████▉                                        | 1410340/2136187 [00:06<00:03, 209903.18it/s]

Non-organic token detected:  O=S(O)S(=O)(=O)O
Non-organic token detected:  O=[Se](O)O
Non-organic token detected:  O=S(=O)([O-])[O-].[Mg+2]
Non-organic token detected:  O=S(=O)([O-])[O-].[K+].[K+]
Non-organic token detected:  O=[As][O-].[K+]
Non-organic token detected:  ClP1(Cl)=NP(Cl)(Cl)=NP(Cl)(Cl)=N1
Non-organic token detected:  O=P([O-])(O)OP(=O)(O)O.[Na+]
Non-organic token detected:  O=[Se]([O-])O.[Na+]
Non-organic token detected:  O=S([O-])S(=O)(=O)[O-].[Na+].[Na+]
Non-organic token detected:  O=[Si](O)O
Non-organic token detected:  O=[Te](=O)(O)O
Non-organic token detected:  O=[Si]([O-])O[Si]([O-])([O-])O[Si](=O)[O-].[Mg+2].[Mg+2]
Metal token detected:  O=C(CCCCC1CCS[Zn]S1)NC(Cc1c[nH]cn1)C(=O)O
Non-organic token detected:  O=BOB(O)OB(O)OB=O
Non-organic token detected:  O.O.O.O.O.O=S([O-])([O-])=S.[Na+].[Na+]


Tokenizing SMILES:  68%|████████████████████████████████████████████████████████████████████████████████▏                                     | 1452153/2136187 [00:06<00:03, 204364.84it/s]

Non-organic token detected:  O=[Se](=O)(O)O
Non-organic token detected:  O=[Si](O)O[Si](O)(O)O[Si](=O)O
Non-organic token detected:  O=P[O-].[Na+]
Non-organic token detected:  [CaH2]
Non-organic token detected:  [MgH2]
Non-organic token detected:  [LiH]
Non-organic token detected:  O=S(O)S(=O)(=O)[O-].[K+]
Non-organic token detected:  [C-]#[N+][O-].[Na+]
Non-organic token detected:  [NH-]S(N)(=O)=O.[Na+]
Non-organic token detected:  O=P(=O)[O-].[K+]
Non-organic token detected:  O=P([O-])([O-])[O-].O=P([O-])([O-])[O-].[Ca+2].[Ca+2].[Ca+2]
Non-organic token detected:  [C]
Non-organic token detected:  [H]O[H]
Non-organic token detected:  O=[Ca]
Non-organic token detected:  [Cl-].[K+]
Non-organic token detected:  O=S(=O)([O-])[O-].[Ba+2]
Non-organic token detected:  [K+].[OH-]
Non-organic token detected:  O.O.O.O.O.O.O.O.O.[Na+].[Na+].[S-2]
Non-organic token detected:  O=[Ca].[Na+].[OH-]
Non-organic token detected:  [Cl-].[Cl-].[Ra+2]
Non-organic token detected:  [Na+].[OH-]
Non-organic to

Tokenizing SMILES:  71%|███████████████████████████████████████████████████████████████████████████████████▌                                  | 1513327/2136187 [00:07<00:03, 194186.41it/s]

Non-organic token detected:  [K+].[O-][Br+2]([O-])[O-]
Non-organic token detected:  [C-]#[O+]
Non-organic token detected:  O[As](O)O[As](O)O[As](O)O
Non-organic token detected:  O1[As]2O[As]3O[As]1O[As](O2)O3
Non-organic token detected:  [Ba+2].[Ca+2].[OH-].[OH-].[OH-].[OH-]
Metal token detected:  O=S(=O)(O)C1CCC(O)C2NCCCC21.O=S(=O)(O)C1CCC(O)C2NCCCC21.[Zn]


Tokenizing SMILES:  73%|█████████████████████████████████████████████████████████████████████████████████████▊                                | 1554607/2136187 [00:07<00:02, 200120.95it/s]

Non-organic token detected:  O=[As](O)(O)O
Non-organic token detected:  O=NN(O)S(=O)(=O)O
Non-organic token detected:  O=P([O-])([O-])[O-].O=P([O-])([O-])[O-].[Zn+2].[Zn+2].[Zn+2]
Metal token detected:  N=C(N)c1ccc2nc(Cc3nc4ccc(C(=N)N)cc4[nH]3)[nH]c2c1.[Zn]
Metal token detected:  CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].COc1ccc2[nH]c3c(C)c4cc[n+](CCCCC(=O)Nc5ccc(-c6c7nc(c(-c8cc[n+](C)cc8)c8ccc([nH]8)c(-c8cc[n+](C)cc8)c8nc(c(-c9cc[n+](C)cc9)c9ccc6[nH]9)C=C8)C=C7)cc5)cc4c(C)c3c2c1.[Zn]
Metal token detected:  CCCCc1ccc2c(c1)-c1nc-2nc2[nH]c(nc3nc(nc4[nH]c(n1)c1ccc(CCCC)cc41)-c1cccc(C#CC4(O)CCC5C6CCc7cc(O)ccc7C6CCC54C)c1-3)c1ccc(CCCC)cc21.[Zn]
Non-organic token detected:  [C-]#N.[K+]
Non-organic token detected:  [Ca+2].[O-]Cl.[O-]Cl
Non-organic token detected:  N.N.O=S(=O)(O)OOS(=O)(=O)O
Non-organic token detected:  Cl[Ca]Cl.Cl[Mg]Cl
Metal token detected:  CC12CCC3c4ccc(O)cc4CCC3C1CCC2(O)CCCCC#CC#Cc1cccc2c1-c1nc-2nc2[nH]c(nc3nc(nc4[nH]c(n1)c1ccc(S(=O)(=O)[O-])cc41)-c1ccc(S(=O)(=O)[O-])cc

Tokenizing SMILES:  78%|███████████████████████████████████████████████████████████████████████████████████████████▊                          | 1662848/2136187 [00:07<00:02, 215141.78it/s]

Non-organic token detected:  O=P(Cl)(Cl)Cl
Non-organic token detected:  O.O.O.O.O.O.O=P([O-])([O-])OP(=O)([O-])OP(=O)([O-])[O-].[Na+].[Na+].[Na+].[Na+].[Na+]
Non-organic token detected:  O=[N+]([O-])[O-].O=[N+]([O-])[O-].[Ca+2]
Non-organic token detected:  N.NS(=O)(=O)O
Non-organic token detected:  O=[Se]=O
Non-organic token detected:  O.O.O.O.O.O.[Cl-].[Cl-].[Mg+2]
Non-organic token detected:  O=[N+]([O-])[O-].O=[N+]([O-])[O-].[Ba+2]
Non-organic token detected:  [Ba+2].[Cl-].[Cl-]
Metal token detected:  NCCC(=O)N1[Zn]OC(=O)C1Cc1cnc[nH]1
Non-organic token detected:  O.O.O.O.O=S(=O)([O-])[O-].[Be+2]
Non-organic token detected:  O=P([O-])(O)OP(=O)([O-])O.[Na+].[Na+]
Non-organic token detected:  [K+].[O-][Cl+2]([O-])[O-]
Non-organic token detected:  O=[Si]=O
Non-organic token detected:  O.O[Al](O)O
Non-organic token detected:  F[B-](F)(F)F.[K+]
Non-organic token detected:  O=S(=O)([O-])OOS(=O)(=O)[O-].[K+].[K+]
Non-organic token detected:  [Ag+].[C-]#N.[C-]#N.[K+]
Non-organic token detect

Tokenizing SMILES:  81%|███████████████████████████████████████████████████████████████████████████████████████████████▋                      | 1731687/2136187 [00:08<00:01, 226146.79it/s]

Non-organic token detected:  O.O=[Si]([O-])[O-].O=[Si]([O-])[O-].O=[Si]([O-])[O-].O=[Si]([O-])[O-].[Al+3].[Al+3].[Mg+2]
Non-organic token detected:  O=S([O-])S(=O)[O-].[Na+].[Na+]
Non-organic token detected:  O=S(=O)(O)SSS(=O)(=O)O


Tokenizing SMILES:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 1803954/2136187 [00:08<00:01, 219546.80it/s]

Non-organic token detected:  [Na+].[Na+].[S-]S[S-]
Non-organic token detected:  [Li+].[O-][Cl+3]([O-])([O-])[O-]
Non-organic token detected:  O.O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].[Al+3].[Al+3]


Tokenizing SMILES:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 1868711/2136187 [00:08<00:01, 208695.11it/s]

Non-organic token detected:  F[Ag].N.N
Non-organic token detected:  [I-].[Na+]
Non-organic token detected:  [Mg+2].[O-][O-]
Non-organic token detected:  O[B-]1(O)OO[B-](O)(O)OO1.[Na+].[Na+]
Non-organic token detected:  O[B-]1(O)OO[B-](O)(O)OO1
Non-organic token detected:  O.O=O.[O-][Cl+][O-].[O-][Cl+][O-].[O-][Cl+][O-].[O-][Cl+][O-]


Tokenizing SMILES:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 1910365/2136187 [00:09<00:01, 206480.71it/s]

Non-organic token detected:  O=S(=O)([O-])SSSSS(=O)(=O)[O-].[K+].[K+]
Non-organic token detected:  O=S(=O)([O-])SSSS(=O)(=O)[O-].[K+].[K+]
Non-organic token detected:  O=S(=O)([O-])SS(=O)(=O)[O-].[K+].[K+]
Non-organic token detected:  O=S([O-])([O-])=S.[Na+].[Na+]
Non-organic token detected:  O=S(=O)(O)SSSSS(=O)(=O)O
Non-organic token detected:  O=S(=O)(O)SSSS(=O)(=O)O
Non-organic token detected:  O=S(=O)(O)SS(=O)(=O)O
Non-organic token detected:  O=S(=O)(O)NS(=O)(=O)O.[Na+]
Non-organic token detected:  O.O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].[Al+3].[Al+3].[Al+3].[Al+3].[Al+3].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-]
Non-organic token detected:  O=[SH](=O)[O-].[Na+]
Non-organic token detected:  O.O.O=[Al]O[Si](=O)O[Si](=O)O[Al]=O
Non-organic token detected:  O.O.O.O.O.O.O.O.O.

Tokenizing SMILES:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 2029843/2136187 [00:09<00:00, 192835.98it/s]

Non-organic token detected:  [Ag+].[C-]#N.[C-]#N.[Na+]
Non-organic token detected:  F[As-](F)(F)(F)(F)F.[Na+]
Non-organic token detected:  F[Al-3](F)(F)(F)(F)F.[Na+].[Na+].[Na+]
Non-organic token detected:  [O-2].[Zn+2]
Non-organic token detected:  O.O.O.O.O.O.O.O=S(=O)([O-])[O-].[Mg+2]
Non-organic token detected:  O=S([O-])([O-])=S.[K+].[K+].[K]S[K]
Non-organic token detected:  O.O.O.O.O.O=P([O-])([O-])[O-].O=P([O-])([O-])[O-].[Mg+2].[Mg+2].[Mg+2]
Non-organic token detected:  O=[Mg].O=[Mg].O=[Mg].O=[Si]=O.O=[Si]=O.O=[Si]=O.O=[Si]=O
Non-organic token detected:  O=S(O)(O)=S.[K]S[K]


Tokenizing SMILES:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 2089526/2136187 [00:09<00:00, 196790.18it/s]

Non-organic token detected:  S.S.S
Metal token detected:  Nc1c(N=Nc2ccc(-c3ccccc3OCCC[Na])nc2)cc(S(=O)(=O)[O-])c2ccccc12.[Na+]
Non-organic token detected:  OP(O)(=S)S


Tokenizing SMILES: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2136187/2136187 [00:10<00:00, 210032.95it/s]

Non-organic token detected:  [Ba]
Non-organic token detected:  [Ra]
Non-organic token detected:  [Rb]
Non-organic token detected:  Br
Non-organic token detected:  F[AsH](F)(F)(F)(F)F
Non-organic token detected:  [Cs]
Non-organic token detected:  I
CPU times: user 10.2 s, sys: 116 ms, total: 10.3 s
Wall time: 10.3 s





In [45]:
%%time
smiles = set(Parallel(n_jobs=-1)(delayed(mol_utils.clean_and_canonalize)(smi) for smi in df))

AttributeError: module 'rxitect.chem.mol_utils' has no attribute 'clean_and_canonalize'