## Pre-Tokenizer
Tokenize SMILES (Simplified Molecular-Input Line-Entry System) into units.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
def atomwise_tokenizer(smi, exclusive_tokens = None):
    """
    Tokenize a SMILES string at atom level.

    Rules:
        (1) 'Br' and 'Cl' are two-character tokens (a lone 'B' or 'C' is
            still a token of its own).
        (2) A bracketed expression such as '[nH]' or '[C@@H]' is one token.
        (3) Characters that match none of the token patterns (whitespace,
            newlines, unsupported letters, ...) are silently dropped.

    smi: the SMILES string to tokenize.
    exclusive_tokens: optional list of bracketed symbols you want to keep,
        e.g. ['[C@@H]', '[nH]']. When given and non-empty, every other
        bracketed token is replaced by '[UNK]'. Default is `None`
        (all bracketed tokens are kept).

    Returns the list of tokens in order of appearance.
    """
    import re
    # Raw string literal: the pattern is full of regex escapes (\[ \( \. ...)
    # that are invalid *string* escapes in a normal literal and trigger a
    # SyntaxWarning on recent Python versions. In raw form a literal
    # backslash is matched by `\\`.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    # With a single capture group, findall returns the matched strings.
    tokens = re.findall(pattern, smi)

    if exclusive_tokens:
        # Only bracketed tokens are candidates for replacement.
        tokens = [
            '[UNK]' if tok.startswith('[') and tok not in exclusive_tokens else tok
            for tok in tokens
        ]
    return tokens
     

In [4]:
def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
    """
    Tokenize a SMILES string into overlapping k-mers of atom-level tokens.

    smiles: the SMILES string to tokenize.
    ngram: number of atom-level tokens per k-mer; with `ngram == 1` the
        atom-level tokens are returned unchanged.
    stride: step (in atom-level tokens) between the starts of consecutive k-mers.
    remove_last: if True, drop the last k-mer when its string length is
        smaller than `ngram`.
    exclusive_tokens: forwarded to `atomwise_tokenizer`.

    Returns a list of k-mer strings. Only windows containing exactly `ngram`
    tokens are kept, so a SMILES with fewer than `ngram` tokens yields `[]`.
    """
    units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
    if ngram == 1:
        tokens = units
    else:
        tokens = [tokens_to_mer(units[i:i+ngram]) for i in range(0, len(units), stride) if len(units[i:i+ngram]) == ngram]

    # `tokens` may be empty (short or empty SMILES); check that first so that
    # `tokens[-1]` cannot raise IndexError.
    if remove_last and tokens and len(tokens[-1]) < ngram:
        # truncate the last k-mer if it is shorter than ngram
        tokens = tokens[:-1]
    return tokens

def tokens_to_mer(toks):
    """Concatenate a sequence of token strings into one k-mer string."""
    return str.join('', toks)

Tokenize SMILES strings at the atom level.

In [5]:
# Read the corpus inside a context manager so the file handle is closed as
# soon as the lines are in memory. Each line keeps its trailing '\n'; the
# tokenizer has no pattern for it, so it does not show up in the tokens.
with open("C:/Users/Administrator/moses.smi.txt",'r') as f:
    smii = f.readlines()

print(smii[:5])
tokss=[atomwise_tokenizer(smi) for smi in smii]
print(tokss[:5])

['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1\n', 'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1\n', 'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO\n', 'Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C\n', 'CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O\n']
[['C', 'C', 'C', 'S', '(', '=', 'O', ')', 'c', '1', 'c', 'c', 'c', '2', '[nH]', 'c', '(', '=', 'N', 'C', '(', '=', 'O', ')', 'O', 'C', ')', '[nH]', 'c', '2', 'c', '1'], ['C', 'C', '(', 'C', ')', '(', 'C', ')', 'C', '(', '=', 'O', ')', 'C', '(', 'O', 'c', '1', 'c', 'c', 'c', '(', 'Cl', ')', 'c', 'c', '1', ')', 'n', '1', 'c', 'c', 'n', 'c', '1'], ['C', 'c', '1', 'c', '(', 'Cl', ')', 'c', 'c', 'c', 'c', '1', 'N', 'c', '1', 'n', 'c', 'c', 'c', 'c', '1', 'C', '(', '=', 'O', ')', 'O', 'C', 'C', '(', 'O', ')', 'C', 'O'], ['C', 'n', '1', 'c', 'n', 'c', '2', 'c', '1', 'c', '(', '=', 'O', ')', 'n', '(', 'C', 'C', '(', 'O', ')', 'C', 'O', ')', 'c', '(', '=', 'O', ')', 'n', '2', 'C'], ['C', 'C', '1', 'O', 'c', '2', 'c', 'c', 'c', '(', 'Cl', ')', 'c', 'c', '2', 'N', '(', 'C', 'C', '(', 'O', ')', 'C',

In [7]:
# Atom-level tokenization of a single molecule: the bracketed '[N+]' and the
# two-character 'Br' each come out as one token.
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = atomwise_tokenizer(smi)
print(toks)

['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']


In [8]:
#hide
# Characters that match no token pattern (here 'A', 'D', the bare 'd'/'u'/'m',
# the 'r' and 'l' after lowercase 'b'/'c', and the space) are dropped.
atomwise_tokenizer('ABrBCD>>[dum]dumcobrclCl[Br] %11')

['Br', 'B', 'C', '>', '>', '[dum]', 'c', 'o', 'b', 'c', 'Cl', '[Br]', '%11']

In [9]:
# Keep only the listed bracketed symbols; every other bracketed token
# (e.g. '[C@]', '[C@H]', '[nH]') is replaced by '[UNK]'.
sep_tokens = ['[C@@H]', '[C@@]']
smi = 'CC(C)C[C@@H]1N2C(=O)[C@](NC(=O)[C@H]3CN(C)[C@@H]4Cc5c(Br)[nH]c6cccc(C4=C3)c56)(O[C@@]2(O)[C@@H]7CCCN7C1=O)C(C)C'
toks = atomwise_tokenizer(smi, exclusive_tokens=sep_tokens)
print(toks)

['C', 'C', '(', 'C', ')', 'C', '[C@@H]', '1', 'N', '2', 'C', '(', '=', 'O', ')', '[UNK]', '(', 'N', 'C', '(', '=', 'O', ')', '[UNK]', '3', 'C', 'N', '(', 'C', ')', '[C@@H]', '4', 'C', 'c', '5', 'c', '(', 'Br', ')', '[UNK]', 'c', '6', 'c', 'c', 'c', 'c', '(', 'C', '4', '=', 'C', '3', ')', 'c', '5', '6', ')', '(', 'O', '[C@@]', '2', '(', 'O', ')', '[C@@H]', '7', 'C', 'C', 'C', 'N', '7', 'C', '1', '=', 'O', ')', 'C', '(', 'C', ')', 'C']


In [10]:
#hide
# Non-SMILES input: only the characters that happen to be valid tokens
# ('B', 'C', 'S') survive; everything else is dropped without an error.
seq = 'ABCDTTDSE'
toks = atomwise_tokenizer(seq)
print(toks)

['B', 'C', 'S']


In [11]:
# 4-mers over atom-level tokens with the default stride of 1: consecutive
# k-mers overlap by three tokens, and '[N+]' / 'Br' count as one token each.
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = kmer_tokenizer(smi, ngram=4)
print(toks)

['CC[N+](', 'C[N+](C', '[N+](C)', '(C)(', 'C)(C', ')(C)', '(C)C', 'C)Cc', ')Cc1', 'Cc1c', 'c1cc', '1ccc', 'cccc', 'cccc', 'ccc1', 'cc1Br']


In [12]:
#smi = 'CC[N+](C)(C)Cc1ccccc1Br'


# 4-mer tokenization of every corpus line in `smii`. Note that this rebinds
# `tokss`, which previously held the atom-level tokens.
tokss=[kmer_tokenizer(smi,ngram=4) for smi in smii]
print(tokss[:5])


[['CCCS', 'CCS(', 'CS(=', 'S(=O', '(=O)', '=O)c', 'O)c1', ')c1c', 'c1cc', '1ccc', 'ccc2', 'cc2[nH]', 'c2[nH]c', '2[nH]c(', '[nH]c(=', 'c(=N', '(=NC', '=NC(', 'NC(=', 'C(=O', '(=O)', '=O)O', 'O)OC', ')OC)', 'OC)[nH]', 'C)[nH]c', ')[nH]c2', '[nH]c2c', 'c2c1'], ['CC(C', 'C(C)', '(C)(', 'C)(C', ')(C)', '(C)C', 'C)C(', ')C(=', 'C(=O', '(=O)', '=O)C', 'O)C(', ')C(O', 'C(Oc', '(Oc1', 'Oc1c', 'c1cc', '1ccc', 'ccc(', 'cc(Cl', 'c(Cl)', '(Cl)c', 'Cl)cc', ')cc1', 'cc1)', 'c1)n', '1)n1', ')n1c', 'n1cc', '1ccn', 'ccnc', 'cnc1'], ['Cc1c', 'c1c(', '1c(Cl', 'c(Cl)', '(Cl)c', 'Cl)cc', ')ccc', 'cccc', 'ccc1', 'cc1N', 'c1Nc', '1Nc1', 'Nc1n', 'c1nc', '1ncc', 'nccc', 'cccc', 'ccc1', 'cc1C', 'c1C(', '1C(=', 'C(=O', '(=O)', '=O)O', 'O)OC', ')OCC', 'OCC(', 'CC(O', 'C(O)', '(O)C', 'O)CO'], ['Cn1c', 'n1cn', '1cnc', 'cnc2', 'nc2c', 'c2c1', '2c1c', 'c1c(', '1c(=', 'c(=O', '(=O)', '=O)n', 'O)n(', ')n(C', 'n(CC', '(CC(', 'CC(O', 'C(O)', '(O)C', 'O)CO', ')CO)', 'CO)c', 'O)c(', ')c(=', 'c(=O', '(=O)', '=O)n', 'O)n2', 

In [36]:
#!pip install SmilesPE

Collecting SmilesPE
  Downloading SmilesPE-0.0.3-py3-none-any.whl (15 kB)
Installing collected packages: SmilesPE
Successfully installed SmilesPE-0.0.3


## Pre-trained SmilesPE Tokenizer

In [13]:
# First ten corpus lines (trailing '\n' included), used as a small sample
# for the tokenizer comparisons below.
smiii=smii[:10]
smiii

['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1\n',
 'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1\n',
 'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO\n',
 'Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C\n',
 'CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O\n',
 'CCOC(=O)c1cncn1C1CCCc2ccccc21\n',
 'COc1ccccc1OC(=O)Oc1ccccc1OC\n',
 'O=C1Nc2ccc(Cl)cc2C(c2ccccc2Cl)=NC1O\n',
 'CN1C(=O)C(O)N=C(c2ccccc2Cl)c2cc(Cl)ccc21\n',
 'CCC(=O)c1ccc(OCC(O)CO)c(OC)c1\n']

In [14]:
import codecs
from SmilesPE.tokenizer import *

# Open the pre-trained vocabulary in a context manager so the handle is
# closed when the cell finishes. The tokenizer is built and used inside the
# block so the file stays open for as long as `spe` is in use here.
with codecs.open('C:/Users/Administrator/SPE_ChEMBL.txt') as spe_vob:
    spe = SPE_Tokenizer(spe_vob)

    for smi in smiii:
        tokens = spe.tokenize(smi)
        print(tokens.split()) # split(): separate on whitespace

['CCC', 'S(=O)', 'c1ccc2[nH]', 'c(', '=', 'NC(=O)', 'OC)', '[nH]c2c1']
['CC(C)(C)', 'C(=O)C(', 'Oc1ccc(', 'Cl)cc1)', 'n1cc', 'nc1']
['Cc1c(Cl)', 'cccc1', 'Nc1n', 'cccc1', 'C(=O)OCC', '(O)', 'CO']
['Cn1', 'cnc2', 'c1c(=O)', 'n(', 'CC(O)', 'CO)', 'c(=O)n2', 'C']
['CC1', 'O', 'c2ccc(Cl)cc2', 'N(', 'CC(O)', 'CO)', 'C1=O']
['CCOC(=O)', 'c1cn', 'cn1', 'C1', 'CCC', 'c2ccccc21']
['COc1ccccc1', 'O', 'C(=O)O', 'c1ccccc1', 'OC']
['O=C1N', 'c2ccc(Cl)cc2', 'C(', 'c2ccccc2Cl)', '=N', 'C1O']
['CN1C(=O)', 'C(O)', 'N=C(', 'c2ccccc2Cl)', 'c2cc(Cl)cc', 'c21']
['CCC(=O)', 'c1ccc(OCC', '(O)', 'CO)', 'c(OC)c1']


## SmilesPE Learner
Train a SmilesPE learner

In [15]:
#!pip install fastprogress

In [16]:
#export

import os
import sys
import inspect
import copy
import io
import warnings
import re
from collections import defaultdict, Counter
from fastprogress.fastprogress import master_bar, progress_bar

from SmilesPE.pretokenizer import *

def randomize_smiles(smiles):
    """
    Generate a new SMILES string for the same molecule.

    Requires the `RDKit` library. The molecule is parsed, its atoms are
    renumbered in a random order (shuffled with NumPy's global random state)
    and it is written back as a non-canonical, isomeric SMILES.

    smiles: a SMILES string; it must be RDKit sanitizable.

    Returns the randomized SMILES string.

    Raises ValueError if RDKit cannot parse `smiles`.
    """
    import numpy as np
    from rdkit import Chem

    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles reports a parse/sanitization failure by returning
        # None; fail with a clear message instead of an AttributeError below.
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
    ans = list(range(m.GetNumAtoms()))
    np.random.shuffle(ans)
    nm = Chem.RenumberAtoms(m,ans)
    return Chem.MolToSmiles(nm, canonical=False, isomericSmiles=True, kekuleSmiles=False)

def corpus_augment(infile, outdir, cycles): # data augmentation with randomize_smiles
    '''
    Write `cycles` randomized copies of a SMILES corpus.

    infile: line separated SMILES file
    outdir: directory to save the augmented SMILES files.
        Each round of augmentation is saved as a separate file, named
        `<name>_R<i>.<ext>`, where `<name>` and `<ext>` are the first two
        dot-separated parts of the input file name. If the input file name
        has no extension, the output is named `<name>_R<i>`.
        Files are opened in append mode, so re-running adds to existing output.
    cycles: number of rounds for SMILES augmentation

    Raises ValueError if `cycles` is not larger than 0.
    '''
    if cycles <= 0:
        raise ValueError("Invalid option,  cycle should be larger than 0")

    with open(infile, "r") as ins:
        can_smiles = [line.split('\n')[0] for line in ins]

    # Same naming as before for "name.ext" inputs; a name without any dot
    # used to raise IndexError and now simply gets no extension.
    name_parts = os.path.basename(infile).split('.')
    fname = name_parts[0]
    suffix = f'.{name_parts[1]}' if len(name_parts) > 1 else ''

    mb = master_bar(range(cycles))
    for i in mb:
        with open(f'{outdir}/{fname}_R{i}{suffix}', 'a') as outfile:
            for smi in progress_bar(can_smiles, parent=mb):
                randomized_smi = randomize_smiles(smi)
                outfile.write(randomized_smi + '\n')

def get_vocabulary(smiles, augmentation=0, exclusive_tokens = False):
    # Count the input SMILES, augment them the requested number of times,
    # tokenize each one with atomwise_tokenizer, and return the token-tuple
    # counts as a dictionary.
    """Read SMILES and return a dictionary that encodes the vocabulary

    smiles: a re-iterable collection of SMILES strings (it is iterated once
        per augmentation round in addition to the initial count).
    augmentation: number of augmentation rounds; in each round one randomized
        SMILES per input SMILES is added to the counts.
    exclusive_tokens: accepted for interface compatibility; this function
        does not use it.

    Returns a dict mapping each tuple of atom-level tokens to its frequency.
    """
    print('Counting SMILES...')
    vocab = Counter()
    vocab.update(smiles)

    print(f'{len(vocab)} unique Canonical SMILES')

    if augmentation>0:
        print(f'Augmenting SMILES...({augmentation} times)')
        mb = master_bar(range(augmentation))
        for i in mb:
            for smi in progress_bar(smiles, parent=mb):
                randomized_smi = randomize_smiles(smi)
                vocab[randomized_smi] += 1

        print(f'{len(vocab)} unique SMILES (Canonical + Augmented)')

    # Different strings can tokenize to the same tuple (the tokenizer drops
    # characters it does not recognise), so sum their counts instead of
    # letting the last one overwrite the others.
    tokenized = Counter()
    for smi, count in vocab.items():
        tokenized[tuple(atomwise_tokenizer(smi))] += count
    return dict(tokenized)

def update_pair_statistics(pair, changed, stats, indices):
    # Called after a frequent SMILES symbol pair has been merged into a single
    # symbol: updates the frequencies and indices of the pairs in the words
    # that contained it.
    """Minimally update the indices and frequency of symbol pairs
    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    pair: the (first, second) symbol pair that was merged.
    changed: iterable of (j, word, old_word, freq) tuples, where `j` is the
        word's index, `word` the symbol tuple after the merge, `old_word` the
        symbol tuple before the merge and `freq` the word's frequency.
    stats: mapping pair -> frequency; modified in place.
    indices: mapping pair -> {word index -> occurrence count}; modified in place.

    Returns None.
    """
    # The merged pair itself no longer occurs: reset its count and index.
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # find all instances of pair, and update frequency/indices around it
        i = 0
        while True:
            # find first symbol
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2])
            if i < len(old_word)-1 and old_word[i+1] == second:
                # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B"
                if i:
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B".
                    # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past both symbols of the occurrence just handled
                i += 2
            else:
                i += 1

        # second pass, over the merged word: add the pairs that now involve the new symbol
        i = 0
        while True:
            try:
                # find new pair
                i = word.index(new_pair, i)
            except ValueError:
                break
            # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC"
            if i:
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B"
            # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block
            if i < len(word)-1 and word[i+1] != new_pair:
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1
            
def get_pair_statistics(vocab): # count pair frequencies and build the index
    """Count frequency of all symbol pairs, and create index

    vocab: sequence of (word, freq) items, where `word` is a tuple of symbols.

    Returns (stats, indices): `stats` maps each adjacent symbol pair to its
    total frequency; `indices` maps each pair to {word position in `vocab` ->
    number of occurrences of the pair in that word}.
    """

    # data structure of pair frequencies
    stats = defaultdict(int)

    #index from pairs to words
    indices = defaultdict(lambda: defaultdict(int))

    for i, (word, freq) in enumerate(progress_bar(vocab)):
        # Pairing the word with itself shifted by one yields every adjacent
        # pair; an empty or single-symbol word yields none (the previous
        # `word[0]` lookup raised IndexError on an empty word). The word
        # still consumes its position `i`, so indices stay aligned with vocab.
        for prev_char, char in zip(word, word[1:]):
            stats[prev_char, char] += freq
            indices[prev_char, char][i] += 1

    return stats, indices



In [17]:
def replace_pair(pair, vocab, indices):
    """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'

    pair: the (first, second) symbols to merge.
    vocab: list of (word, freq) items, `word` being a tuple of symbols;
        affected entries are replaced in place.
    indices: mapping pair -> {word position in `vocab` -> occurrence count};
        only words whose count for `pair` is at least 1 are visited.

    Returns a list of (j, new_word, old_word, freq) tuples, one per visited word.
    """
    first, second = pair
    merged = first + second
    changes = []
    # Match "first second" only as two whole, space-delimited symbols.
    pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')
    for j, freq in indices[pair].items():
        if freq < 1:
            continue
        word, freq = vocab[j]
        new_word = ' '.join(word)
        # A function replacement inserts `merged` literally, so backslashes in
        # SMILES symbols need no manual escaping.
        new_word = pattern.sub(lambda match: merged, new_word)
        new_word = tuple(new_word.split(' '))

        vocab[j] = (new_word, freq)
        changes.append((j, new_word, word, freq))

    return changes

def prune_stats(stats, big_stats, threshold): # prune low-frequency pairs to speed things up
    """Drop every pair whose count is below `threshold` from `stats`.

    Keeping `stats` small makes the repeated max() over it cheaper.
    Each dropped pair is recorded in `big_stats`, which holds the full
    statistics for when pruned items need to be looked at again: a
    non-negative count overwrites the stored value, a negative count is
    added to it.
    """
    below_threshold = [(pair, count) for pair, count in stats.items() if count < threshold]
    for pair, count in below_threshold:
        del stats[pair]
        if count >= 0:
            big_stats[pair] = count
        else:
            big_stats[pair] += count

def learn_SPE(infile, outfile, num_symbols, min_frequency=2, augmentation=0, verbose=False, total_symbols=False):
    """
    Learn num_symbols SPE operations from infile and write to outfile.

    *infile*: a list of SMILES

    *outfile*: an open, writable text file object; each learned merge is
    written as one line "first second". The file is not closed here.

    *num_symbols*: maximum total number of SPE symbols

    *min_frequency*: the minimum frequency a symbol pair must have to be
    merged; learning stops at the first pair below it.

    *augmentation*: times of SMILES augmentation

    *verbose*: if True, print the merging process (to stderr)

    *total_symbols*: if True; the maximum total of SPE symbols = num_symbols - number of atom-level tokens
    """

    vocab = get_vocabulary(infile, augmentation=augmentation)
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    print('Gettting Pair Statistics')
    stats, indices = get_pair_statistics(sorted_vocab)
    # big_stats keeps the full statistics; stats is pruned for speed.
    big_stats = copy.deepcopy(stats)

    if total_symbols:
        uniq_char = set()
        for word in vocab:
            for char in word:
                uniq_char.add(char)
        sys.stderr.write(f'Number of unique characters & Reducing number of merge operations by: {len(uniq_char)}\n')
        sys.stderr.write(f'Unique characters: {(uniq_char)}\n')
        num_symbols -= len(uniq_char)

    # Nothing to merge (empty input, or every SMILES is a single token):
    # stop cleanly instead of failing on max() of an empty sequence below.
    if not stats:
        sys.stderr.write('no symbol pairs found in the input. Stopping\n')
        return

    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in range(num_symbols):
        if stats:
            # ties on frequency are broken by the pair itself
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        # the merged pair must not be selected again
        stats[most_frequent] = 0
        # prune periodically (every 100 merges) to keep max() cheap
        if not i % 100:
            prune_stats(stats, big_stats, threshold)

train_SPE

In [18]:
import codecs
from SmilesPE.learner import *

In [19]:
file_name = "C:/Users/Administrator/moses.smi.txt"

# One SMILES per line; keep the text before the line terminator.
with open(file_name, "r") as ins:
    SMILES = [line.split('\n')[0] for line in ins]
print('Number of SMILES:', len(SMILES))

Number of SMILES: 1584663


## Training
input: a list of SMILES

output: the file to save the learned vocabulary.

num_symbols: maximum total number of SPE symbols, set to 30,000

min_frequency: the minimum frequency a symbol pair must have to be merged, set to 2,000.

augmentation: number of SMILES augmentation rounds, set to 1. The final data set is ~2 times larger than the original one.

verbose: if True, print the merging process

total_symbols: if True, the maximum number of merge operations is num_symbols minus the number of unique atom-level tokens.

In [57]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2022.9.5-cp37-cp37m-win_amd64.whl (20.5 MB)
     --------------------------------------- 20.5/20.5 MB 13.4 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5


In [20]:
%%time
# The context manager closes (and thereby flushes) the vocabulary file when
# training ends, so every learned merge is on disk before the file is read.
with codecs.open('../SPE_output.txt', 'w') as output:
    learn_SPE(SMILES, output, 30000, min_frequency=2000, augmentation=1, verbose=True, total_symbols=True)


Counting SMILES...
1584663 unique Canonical SMILES
Augmenting SMILES...(1 times)


3159633 unique SMILES (Canonical + Augmented)
Gettting Pair Statistics


Number of unique characters & Reducing number of merge operations by: 25
Unique characters: {'n', '7', 'O', 'S', '1', 'F', '=', '(', '3', 'N', '6', 'Cl', '#', 'o', '[H]', '-', 'c', 'Br', '4', '5', '2', 's', 'C', '[nH]', ')'}
pair 0: c c -> cc (frequency 11709857)
pair 1: C C -> CC (frequency 5874847)
pair 2: c 1 -> c1 (frequency 4895619)
pair 3: ( C -> (C (frequency 4233604)
pair 4: = O -> =O (frequency 4093346)
pair 5: =O ) -> =O) (frequency 3823709)
pair 6: ( =O) -> (=O) (frequency 2978411)
pair 7: c 2 -> c2 (frequency 2970701)
pair 8: c ( -> c( (frequency 2119276)
pair 9: (C ) -> (C) (frequency 2116033)
pair 10: c1 cc -> c1cc (frequency 1830669)
pair 11: N C -> NC (frequency 1452832)
pair 12: ) cc -> )cc (frequency 1429503)
pair 13: c2 cc -> c2cc (frequency 1100286)
pair 14: 2 ) -> 2) (frequency 986195)
pair 15: c 3 -> c3 (frequency 897909)
pair 16: O C -> OC (frequency 860289)
pair 17: (=O) N -> (=O)N (frequency 858618)
pair 18: CC C -> CCC (frequency 821381)
pair 19: n 1 -> n1 (fr

pair 190: c1 (C -> c1(C (frequency 45530)
pair 191: (=O) OC -> (=O)OC (frequency 45349)
pair 192: cn c2 -> cnc2 (frequency 44323)
pair 193: c [nH] -> c[nH] (frequency 44148)
pair 194: c2n n -> c2nn (frequency 44137)
pair 195: O CC1 -> OCC1 (frequency 43925)
pair 196: c2n cc -> c2ncc (frequency 43310)
pair 197: N 2CC -> N2CC (frequency 43155)
pair 198: C (C -> C(C (frequency 43040)
pair 199: ( F)cc -> (F)cc (frequency 42753)
pair 200: CC (C)C -> CC(C)C (frequency 42664)
pair 201: c1ccc( Cl -> c1ccc(Cl (frequency 42347)
pair 202: c1n cc -> c1ncc (frequency 41896)
pair 203: c(C (N -> c(C(N (frequency 41047)
pair 204: c(- c2cc -> c(-c2cc (frequency 41038)
pair 205: C (N -> C(N (frequency 40476)
pair 206: c(C #N -> c(C#N (frequency 40409)
pair 207: cn c1 -> cnc1 (frequency 40342)
pair 208: CCC (=O)N -> CCC(=O)N (frequency 40288)
pair 209: c(C) cc1 -> c(C)cc1 (frequency 40203)
pair 210: c( OC -> c(OC (frequency 40172)
pair 211: 3 )CC -> 3)CC (frequency 39870)
pair 212: CCC O -> CCCO (frequen

pair 374: c2cc c3c( -> c2ccc3c( (frequency 19070)
pair 375: c4 cccc -> c4cccc (frequency 18981)
pair 376: c2ccccc2 )n -> c2ccccc2)n (frequency 18953)
pair 377: n (C -> n(C (frequency 18853)
pair 378: )C (=O)C -> )C(=O)C (frequency 18840)
pair 379: N CC -> NCC (frequency 18832)
pair 380: c1ccc( F) -> c1ccc(F) (frequency 18753)
pair 381: o c(C -> oc(C (frequency 18729)
pair 382: c1cc (C) -> c1cc(C) (frequency 18694)
pair 383: ) s -> )s (frequency 18584)
pair 384: OC O -> OCO (frequency 18530)
pair 385: c2 cs -> c2cs (frequency 18270)
pair 386: 2) CC -> 2)CC (frequency 18262)
pair 387: CC NC(=O) -> CCNC(=O) (frequency 18142)
pair 388: )cc cc -> )cccc (frequency 18126)
pair 389: c12 c( -> c12c( (frequency 18078)
pair 390: c(=O) n(C) -> c(=O)n(C) (frequency 18073)
pair 391: c2cc c(C -> c2ccc(C (frequency 17826)
pair 392: S(C) (=O)=O) -> S(C)(=O)=O) (frequency 17738)
pair 393: OC (C)C -> OC(C)C (frequency 17673)
pair 394: ) c1ccccc1 -> )c1ccccc1 (frequency 17670)
pair 395: OCC O -> OCCO (fre

pair 552: COC (=O)C -> COC(=O)C (frequency 11907)
pair 553: c1ccc( O -> c1ccc(O (frequency 11854)
pair 554: c1 c[nH] -> c1c[nH] (frequency 11839)
pair 555: (C)C )=O) -> (C)C)=O) (frequency 11816)
pair 556: 2CCC O -> 2CCCO (frequency 11794)
pair 557: c2) n1 -> c2)n1 (frequency 11745)
pair 558: )C1 CC1 -> )C1CC1 (frequency 11707)
pair 559: c2ccc(F)cc 2 -> c2ccc(F)cc2 (frequency 11611)
pair 560: Br )cc -> Br)cc (frequency 11599)
pair 561: 2) c1C -> 2)c1C (frequency 11506)
pair 562: Cc1n o -> Cc1no (frequency 11500)
pair 563: c2ccccc2 )n1 -> c2ccccc2)n1 (frequency 11489)
pair 564: CCCO 1 -> CCCO1 (frequency 11488)
pair 565: C NC(=O) -> CNC(=O) (frequency 11485)
pair 566: C c1ccccc1 -> Cc1ccccc1 (frequency 11464)
pair 567: (C) O -> (C)O (frequency 11441)
pair 568: c(C N -> c(CN (frequency 11434)
pair 569: (C#N ) -> (C#N) (frequency 11403)
pair 570: Br )cc1 -> Br)cc1 (frequency 11378)
pair 571: c( OCC -> c(OCC (frequency 11350)
pair 572: (N CC -> (NCC (frequency 11330)
pair 573: c4 c( -> c4c

pair 731: c(S CC(=O)N -> c(SCC(=O)N (frequency 8211)
pair 732: c1 (=O) -> c1(=O) (frequency 8208)
pair 733: c1n cn -> c1ncn (frequency 8168)
pair 734: c(C (F)(F)F) -> c(C(F)(F)F) (frequency 8109)
pair 735: c2n c3ccccc3 -> c2nc3ccccc3 (frequency 8078)
pair 736: c1ccc(C #N)cc1 -> c1ccc(C#N)cc1 (frequency 8072)
pair 737: NC (NC -> NC(NC (frequency 8070)
pair 738: 2 c1 -> 2c1 (frequency 8062)
pair 739: (Br )c1 -> (Br)c1 (frequency 8058)
pair 740: )=O) CC1 -> )=O)CC1 (frequency 8052)
pair 741: n2 c( -> n2c( (frequency 8023)
pair 742: #N ) -> #N) (frequency 8023)
pair 743: c2ccccc2 Cl -> c2ccccc2Cl (frequency 8022)
pair 744: (NC (C)=O) -> (NC(C)=O) (frequency 8022)
pair 745: CCOC (=O)C -> CCOC(=O)C (frequency 8013)
pair 746: OC (=O)C -> OC(=O)C (frequency 8008)
pair 747: CCC NC(=O) -> CCCNC(=O) (frequency 7980)
pair 748: ) (=O) -> )(=O) (frequency 7934)
pair 749: c3cccc n3 -> c3ccccn3 (frequency 7921)
pair 750: o c1 -> oc1 (frequency 7910)
pair 751: c1ccccc1 - -> c1ccccc1- (frequency 7901)
p

pair 907: 2)=O) C1 -> 2)=O)C1 (frequency 6216)
pair 908: c1c(Cl)cc cc1 -> c1c(Cl)cccc1 (frequency 6209)
pair 909: c2c(C) cccc -> c2c(C)cccc (frequency 6207)
pair 910: )=O) n -> )=O)n (frequency 6204)
pair 911: )n n1 -> )nn1 (frequency 6150)
pair 912: 2) nn -> 2)nn (frequency 6132)
pair 913: c1 (Cl)cc -> c1(Cl)cc (frequency 6125)
pair 914: NC 2 -> NC2 (frequency 6123)
pair 915: (CC (=O)NC -> (CC(=O)NC (frequency 6117)
pair 916: c1ccccc1 F) -> c1ccccc1F) (frequency 6113)
pair 917: S(N )(=O)=O) -> S(N)(=O)=O) (frequency 6112)
pair 918: c2cc(F)cc c2 -> c2cc(F)ccc2 (frequency 6110)
pair 919: c1cc (C(N -> c1cc(C(N (frequency 6106)
pair 920: C (NC -> C(NC (frequency 6104)
pair 921: ( F)cc1 -> (F)cc1 (frequency 6098)
pair 922: c1n c(-c2cc -> c1nc(-c2cc (frequency 6079)
pair 923: c2cc c(-n3 -> c2ccc(-n3 (frequency 6078)
pair 924: C c1o -> Cc1o (frequency 6074)
pair 925: c1cc2 cccc -> c1cc2cccc (frequency 6063)
pair 926: (C NC(=O) -> (CNC(=O) (frequency 6056)
pair 927: S(=O)(=O)N 1CC -> S(=O)(=O

pair 1080: CC n1n -> CCn1n (frequency 4907)
pair 1081: )c(OC )cc1 -> )c(OC)cc1 (frequency 4899)
pair 1082: 3 cc -> 3cc (frequency 4896)
pair 1083: c2 c1C -> c2c1C (frequency 4888)
pair 1084: CC(C)(C) OC -> CC(C)(C)OC (frequency 4885)
pair 1085: C1 N( -> C1N( (frequency 4885)
pair 1086: 3 CCCO -> 3CCCO (frequency 4882)
pair 1087: c3cc (F)cc -> c3cc(F)cc (frequency 4872)
pair 1088: n c2C -> nc2C (frequency 4870)
pair 1089: NC(=O) c1cn -> NC(=O)c1cn (frequency 4854)
pair 1090: c1ccc( CC -> c1ccc(CC (frequency 4846)
pair 1091: N1CC N( -> N1CCN( (frequency 4834)
pair 1092: 2 cc -> 2cc (frequency 4831)
pair 1093: c3ccc( OC)cc -> c3ccc(OC)cc (frequency 4828)
pair 1094: c1cc (NC -> c1cc(NC (frequency 4826)
pair 1095: c1cc(OC )ccc1 -> c1cc(OC)ccc1 (frequency 4825)
pair 1096: CCN1C (=O) -> CCN1C(=O) (frequency 4822)
pair 1097: )c( OCC -> )c(OCC (frequency 4813)
pair 1098: c1 (OCC -> c1(OCC (frequency 4803)
pair 1099: CC(C) (C)C -> CC(C)(C)C (frequency 4802)
pair 1100: N(C (CC -> N(C(CC (frequenc

pair 1252: #N )c1 -> #N)c1 (frequency 4054)
pair 1253: )c( F)cc -> )c(F)cc (frequency 4052)
pair 1254: NC(=O)N (C)C -> NC(=O)N(C)C (frequency 4051)
pair 1255: c(Cl )c2 -> c(Cl)c2 (frequency 4047)
pair 1256: (N (C)C -> (N(C)C (frequency 4042)
pair 1257: n1 (C) -> n1(C) (frequency 4037)
pair 1258: c2cccc c2n -> c2ccccc2n (frequency 4031)
pair 1259: c12 cccc -> c12cccc (frequency 4031)
pair 1260: = N -> =N (frequency 4030)
pair 1261: CCN(CC )C(=O) -> CCN(CC)C(=O) (frequency 4029)
pair 1262: (C#N )CC -> (C#N)CC (frequency 4027)
pair 1263: (=O)N (C -> (=O)N(C (frequency 4021)
pair 1264: )ccc1 F -> )ccc1F (frequency 4017)
pair 1265: )cc c1C -> )ccc1C (frequency 4014)
pair 1266: c2cc3c( cc2) -> c2cc3c(cc2) (frequency 4012)
pair 1267: - c1ccccc1 -> -c1ccccc1 (frequency 4001)
pair 1268: (C (=O)OCC -> (C(=O)OCC (frequency 3996)
pair 1269: c(C) o1 -> c(C)o1 (frequency 3983)
pair 1270: 2)cc1 F -> 2)cc1F (frequency 3981)
pair 1271: CC N(C(=O) -> CCN(C(=O) (frequency 3958)
pair 1272: N2C (=O)C -> N2

pair 1421: n1 )C -> n1)C (frequency 3396)
pair 1422: OC )CC -> OC)CC (frequency 3395)
pair 1423: (C) NC(=O)N -> (C)NC(=O)N (frequency 3381)
pair 1424: (OCC (=O)N -> (OCC(=O)N (frequency 3380)
pair 1425: 2)n cc1 -> 2)ncc1 (frequency 3376)
pair 1426: (C)=O) cc1 -> (C)=O)cc1 (frequency 3375)
pair 1427: c2ncccc 2 -> c2ncccc2 (frequency 3374)
pair 1428: c( NC(=O)CC -> c(NC(=O)CC (frequency 3373)
pair 1429: )ccc1 OC -> )ccc1OC (frequency 3373)
pair 1430: n1 c2c( -> n1c2c( (frequency 3367)
pair 1431: ) S(=O)(=O) -> )S(=O)(=O) (frequency 3363)
pair 1432: nc1 S -> nc1S (frequency 3361)
pair 1433: c1c(C) cccc1 -> c1c(C)cccc1 (frequency 3357)
pair 1434: S(=O)(=O)N (C)C -> S(=O)(=O)N(C)C (frequency 3355)
pair 1435: c1ccc(C N -> c1ccc(CN (frequency 3352)
pair 1436: C1 N(C -> C1N(C (frequency 3347)
pair 1437: c(F) c(F) -> c(F)c(F) (frequency 3344)
pair 1438: c1c(C) [nH] -> c1c(C)[nH] (frequency 3343)
pair 1439: CC (O)C -> CC(O)C (frequency 3343)
pair 1440: c2 C1 -> c2C1 (frequency 3342)
pair 1441: c

pair 1588: )C O -> )CO (frequency 2939)
pair 1589: c1 (NC(C -> c1(NC(C (frequency 2935)
pair 1590: (C)(C)C )cc1 -> (C)(C)C)cc1 (frequency 2933)
pair 1591: c2=O) c1 -> c2=O)c1 (frequency 2930)
pair 1592: N# C -> N#C (frequency 2927)
pair 1593: )c(Cl )c2 -> )c(Cl)c2 (frequency 2924)
pair 1594: c1ccc(F)cc1 ) -> c1ccc(F)cc1) (frequency 2923)
pair 1595: C1CC N( -> C1CCN( (frequency 2922)
pair 1596: ) c(C(=O)N -> )c(C(=O)N (frequency 2920)
pair 1597: [nH] c3 -> [nH]c3 (frequency 2915)
pair 1598: cc2 1 -> cc21 (frequency 2913)
pair 1599: c3cn n(C) -> c3cnn(C) (frequency 2910)
pair 1600: Cc1cccc (C)c1 -> Cc1cccc(C)c1 (frequency 2907)
pair 1601: cn c1C -> cnc1C (frequency 2906)
pair 1602: 2)n cn1 -> 2)ncn1 (frequency 2902)
pair 1603: c(C (=O)OCC -> c(C(=O)OCC (frequency 2900)
pair 1604: C1C (C -> C1C(C (frequency 2897)
pair 1605: N(C(=O) OC(C)(C)C -> N(C(=O)OC(C)(C)C (frequency 2893)
pair 1606: COC(=O) C1 -> COC(=O)C1 (frequency 2892)
pair 1607: c1s cc -> c1scc (frequency 2891)
pair 1608: c2ccc

pair 1756: c1cccc(C (F)(F)F)c1 -> c1cccc(C(F)(F)F)c1 (frequency 2575)
pair 1757: c1cc(F)cc (F)c1 -> c1cc(F)cc(F)c1 (frequency 2573)
pair 1758: (N ) -> (N) (frequency 2572)
pair 1759: C N(C)C(=O)C -> CN(C)C(=O)C (frequency 2567)
pair 1760: (C)C )=O)cc -> (C)C)=O)cc (frequency 2566)
pair 1761: C(C O -> C(CO (frequency 2561)
pair 1762: c2ccc(F)cc 2)c1 -> c2ccc(F)cc2)c1 (frequency 2560)
pair 1763: c(-c3cc co -> c(-c3ccco (frequency 2555)
pair 1764: c( NC(=O)NC -> c(NC(=O)NC (frequency 2553)
pair 1765: C (NC(=O) -> C(NC(=O) (frequency 2551)
pair 1766: nn c1 -> nnc1 (frequency 2550)
pair 1767: c1cc (O -> c1cc(O (frequency 2548)
pair 1768: nn 3C -> nn3C (frequency 2547)
pair 1769: 4CC 4)n -> 4CC4)n (frequency 2547)
pair 1770: )c(N )c1 -> )c(N)c1 (frequency 2547)
pair 1771: c2) cn1 -> c2)cn1 (frequency 2546)
pair 1772: c1ccc( Br -> c1ccc(Br (frequency 2543)
pair 1773: n2cccn 2)cc1 -> n2cccn2)cc1 (frequency 2537)
pair 1774: )c1 s -> )c1s (frequency 2537)
pair 1775: 2 N -> 2N (frequency 2534)
pa

pair 1922: COc1ccc( NC(=O)C -> COc1ccc(NC(=O)C (frequency 2308)
pair 1923: 2)n1 C -> 2)n1C (frequency 2306)
pair 1924: c1n 2c( -> c1n2c( (frequency 2304)
pair 1925: C (NC( -> C(NC( (frequency 2301)
pair 1926: n2 c(C) -> n2c(C) (frequency 2295)
pair 1927: C1C OCC -> C1COCC (frequency 2295)
pair 1928: (C)C )CC1 -> (C)C)CC1 (frequency 2292)
pair 1929: n3 cc -> n3cc (frequency 2291)
pair 1930: 2CCCC 2=O)cc1 -> 2CCCC2=O)cc1 (frequency 2291)
pair 1931: 2) CCO1 -> 2)CCO1 (frequency 2290)
pair 1932: c3cc(F)cc c3 -> c3cc(F)ccc3 (frequency 2286)
pair 1933: 2)cc cc -> 2)cccc (frequency 2281)
pair 1934: O=C(C NC(=O) -> O=C(CNC(=O) (frequency 2280)
pair 1935: c2ncn c(N -> c2ncnc(N (frequency 2278)
pair 1936: c2c( OCC -> c2c(OCC (frequency 2278)
pair 1937: (C OC -> (COC (frequency 2278)
pair 1938: c1c( Cl -> c1c(Cl (frequency 2277)
pair 1939: c(C NC(=O)N -> c(CNC(=O)N (frequency 2277)
pair 1940: c1(C (=O)NC -> c1(C(=O)NC (frequency 2276)
pair 1941: c(=O)[nH] c(=O) -> c(=O)[nH]c(=O) (frequency 2276)


pair 2086: o 2)c1 -> o2)c1 (frequency 2066)
pair 2087: Cc1ccc( N -> Cc1ccc(N (frequency 2066)
pair 2088: c2ncccc 2C -> c2ncccc2C (frequency 2065)
pair 2089: c1ccc(F)cc1 F) -> c1ccc(F)cc1F) (frequency 2063)
pair 2090: (=O)N ( -> (=O)N( (frequency 2063)
pair 2091: c2c(C) o -> c2c(C)o (frequency 2062)
pair 2092: ) c1ccncc1 -> )c1ccncc1 (frequency 2062)
pair 2093: c3c( OC)cc -> c3c(OC)cc (frequency 2059)
pair 2094: c(F)cc cc1 -> c(F)cccc1 (frequency 2059)
pair 2095: cccc c12 -> ccccc12 (frequency 2056)
pair 2096: (F)F) c1 -> (F)F)c1 (frequency 2054)
pair 2097: )n cn -> )ncn (frequency 2053)
pair 2098: c2cc c[nH] -> c2ccc[nH] (frequency 2052)
pair 2099: c(O )c1 -> c(O)c1 (frequency 2052)
pair 2100: ) c1n -> )c1n (frequency 2052)
pair 2101: )=O) CCC1 -> )=O)CCC1 (frequency 2051)
pair 2102: c1 c(=O)[nH] -> c1c(=O)[nH] (frequency 2050)
pair 2103: c2cc(C)cc c2 -> c2cc(C)ccc2 (frequency 2047)
pair 2104: c1 (Cl -> c1(Cl (frequency 2046)
pair 2105: (CC (C)C) -> (CC(C)C) (frequency 2046)
pair 2106:

Wall time: 3h 33min 52s


## SPE 결과 비교

In [21]:
import codecs
from SmilesPE.tokenizer import *

# Tokenize the sample with the vocabulary learned above. The file is opened
# in a context manager so the handle is closed when the cell finishes; the
# tokenizer is built and used while the file is still open.
with codecs.open('C:/Users/Administrator/SPE_output.txt') as spe_vob:
    spe = SPE_Tokenizer(spe_vob)

    for smi in smiii:
        tokens = spe.tokenize(smi)
        print(tokens.split()) # split(): separate on whitespace

['CCC', 'S(=O)', 'c1ccc2[nH]', 'c(', '=', 'NC(=O)', 'OC)', '[nH]', 'c2c1']
['CC(C)(C)C', '(=O)C', '(O', 'c1ccc(Cl)cc1', ')n1', 'cc', 'nc1']
['C', 'c1c(Cl)cccc1', 'Nc1n', 'cccc1C', '(=O)OCC', '(O)C', 'O']
['Cn1', 'cnc2c1', 'c(=O)n(CC', '(O)C', 'O)', 'c(=O)n2C']
['CC1', 'O', 'c2ccc(Cl)cc2', 'N(CC', '(O)C', 'O', ')C1=O']
['CCOC(=O)', 'c1cn', 'cn1', 'C1CCC', 'c2ccccc21']
['COc1ccccc1', 'OC(=O)', 'O', 'c1ccccc1OC']
['O=C1', 'N', 'c2ccc(Cl)cc', '2C', '(c2ccccc2', 'Cl)', '=', 'NC1', 'O']
['C', 'N1C(=O)C', '(O', ')N', '=C(', 'c2ccccc2', 'Cl)', 'c2cc(Cl)cc', 'c21']
['CCC(=O)', 'c1ccc(OCC', '(O)C', 'O', ')c(OC)c1']


In [22]:
import codecs
from SmilesPE.tokenizer import *

# Tokenize the same sample with the pre-trained ChEMBL vocabulary for
# comparison. The file is opened in a context manager so the handle is
# closed when the cell finishes; the tokenizer is used while it is open.
with codecs.open('C:/Users/Administrator/SPE_ChEMBL.txt') as spe_vob:
    spe = SPE_Tokenizer(spe_vob)

    for smi in smiii:
        tokens = spe.tokenize(smi)
        print(tokens.split()) # split(): separate on whitespace

['CCC', 'S(=O)', 'c1ccc2[nH]', 'c(', '=', 'NC(=O)', 'OC)', '[nH]c2c1']
['CC(C)(C)', 'C(=O)C(', 'Oc1ccc(', 'Cl)cc1)', 'n1cc', 'nc1']
['Cc1c(Cl)', 'cccc1', 'Nc1n', 'cccc1', 'C(=O)OCC', '(O)', 'CO']
['Cn1', 'cnc2', 'c1c(=O)', 'n(', 'CC(O)', 'CO)', 'c(=O)n2', 'C']
['CC1', 'O', 'c2ccc(Cl)cc2', 'N(', 'CC(O)', 'CO)', 'C1=O']
['CCOC(=O)', 'c1cn', 'cn1', 'C1', 'CCC', 'c2ccccc21']
['COc1ccccc1', 'O', 'C(=O)O', 'c1ccccc1', 'OC']
['O=C1N', 'c2ccc(Cl)cc2', 'C(', 'c2ccccc2Cl)', '=N', 'C1O']
['CN1C(=O)', 'C(O)', 'N=C(', 'c2ccccc2Cl)', 'c2cc(Cl)cc', 'c21']
['CCC(=O)', 'c1ccc(OCC', '(O)', 'CO)', 'c(OC)c1']
