In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import selfies as sf
from selfies import encoder, DecoderError
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import logging
import pickle
from data_preprocess import DataPreprocessor
from utils import read_segmented_mols, smiles_to_text, selfies_to_text , smiles_fragments_to_selfies_fragments , build_vocab
from tokenization_algorithms.MacFragger import MacFragGenerator
from tokenization_algorithms.spe import SMILESLearner
from utils import *
import codecs
from SmilesPE.tokenizer import *

In [18]:
# create a data preprocessor object to preprocess clintox dataset (create SELFIES and SMILES with corresponding target)
clintox_preprocessor = DataPreprocessor(input='datasets/clintox.csv', output='datasets/pre_processed', target='CT_TOX', name='clintox')
tox21_preprocessor = DataPreprocessor(input='datasets/tox21.csv', output='datasets/pre_processed', target='NR-AR', name='tox21')
clintox_data = clintox_preprocessor.pre_process()
tox21_data = tox21_preprocessor.pre_process()
# Transfer SMILES and SELFIES to text files 
smiles_to_text(dataframe = clintox_data,output_file= 'datasets/pre_processed/clintox.smi')
smiles_to_text(dataframe = tox21_data,output_file= 'datasets/pre_processed/tox21.smi')
selfies_to_text(dataframe = clintox_data, output_file='datasets/pre_processed/clintox.selfi')
selfies_to_text(dataframe = tox21_data,output_file= 'datasets/pre_processed/tox21.selfi')

Pre-processing clintox dataset...
Number Of processed SMILES: 1484
Number of removed SMILES: 8
Number of valid SMILES: 1476
Number of positive samples: 112
Number of negative samples: 1364
Percentage of positive samples: 7.588075880758808 %
Percentage of negative samples: 92.41192411924119 %
saving  clintox dataset to CSV file...
saving  clintox dataset SMILES to TXT file...
Pre-processing tox21 dataset...
Number Of processed SMILES: 3079
Number of removed SMILES: 0
Number of valid SMILES: 3079
Number of positive samples: 59
Number of negative samples: 3020
Percentage of positive samples: 1.916206560571614 %
Percentage of negative samples: 98.08379343942839 %
saving  tox21 dataset to CSV file...
saving  tox21 dataset SMILES to TXT file...


In [5]:
# Mc fragger tokenization
macFragger = MacFragGenerator()
# clintoxt dataset
input_file = './datasets/pre_processed/clintox.smi'
dir = './datasets/mac_fragments/'
asMols = False
maxBlocks = 6
maxSR = 8
minFragAtoms = 1
name = 'clintox'
macFragger.write_file(input_file, dir, maxBlocks, maxSR, asMols, minFragAtoms, name)
# tox21 dataset
input_file = './datasets/pre_processed/tox21.smi'
dir = './datasets/mac_fragments/'
asMols = False
maxBlocks = 6
maxSR = 8
minFragAtoms = 1
name = 'tox21'
macFragger.write_file(input_file, dir, maxBlocks, maxSR, asMols, minFragAtoms, name)

# zinc dataset
input_file = './datasets/zinc.smi'
dir = './datasets/mac_fragments/'
asMols = False
maxBlocks = 6
maxSR = 8
minFragAtoms = 1
name = 'zinc'
macFragger.write_file(input_file, dir, maxBlocks, maxSR, asMols, minFragAtoms, name)


# smilesDB dataset
input_file = './datasets/smilesDB.smi'
dir = './datasets/mac_fragments/'
asMols = False
maxBlocks = 6
maxSR = 8
minFragAtoms = 1
name = 'smilesDB'
macFragger.write_file(input_file, dir, maxBlocks, maxSR, asMols, minFragAtoms, name)
        

non_mols:  0
processed_mols:  1476
non_mols:  0
processed_mols:  3079
non_mols:  0
processed_mols:  249455
Error: the input file contains non-mols
Error: the input file contains non-mols
Error: the input file contains non-mols
Error: the input file contains non-mols
Error: the input file contains non-mols
Error: the input file contains non-mols
non_mols:  6
processed_mols:  258764


In [6]:
#build vocabulary for Mac Fragger fragments
special_tokens = ['[PAD]', 
                '[unused1]', '[unused2]', '[unused3]', '[unused4]','[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', 
                '[UNK]', '[CLS]', '[SEP]', '[MASK]']
 # clintox dataset
input= './datasets/mac_fragments/clintox_fragments.smi'
output = './vocabs/macFrag_vocab/clintox_vocab.smi'
build_vocab(input,output, special_tokens=special_tokens)
# tox21 dataset
input= './datasets/mac_fragments/tox21_fragments.smi'
output = './vocabs/macFrag_vocab/tox21_vocab.smi'
build_vocab(input,output, special_tokens=special_tokens)
# zinc dataset
input= './datasets/mac_fragments/zinc_fragments.smi'
output = './vocabs/macFrag_vocab/zinc_vocab.smi'
build_vocab(input,output, special_tokens=special_tokens)
# smilesDB dataset
input= './datasets/mac_fragments/smilesDB_fragments.smi'
output = './vocabs/macFrag_vocab/smilesDB_vocab.smi'
build_vocab(input,output, special_tokens=special_tokens)

In [8]:
# train spe , BPE encoder on smiles
vocab_size = 1_000
input_file = './datasets/cleaned_smilesDB.smi'
output_file = './vocabs/spe_vocab/cleaned_smilesDB' + '_' + str(vocab_size) + '.smi'
spe_learner = SMILESLearner(file_name=input_file, output_file=output_file,num_symbols= vocab_size, min_frequency=1, augmentation=1, verbose=True, total_symbols=True)
spe_smi ,spe_smi_vocab = spe_learner.learn_SMILES()
bpe_smi , bpe_smi_vocab= spe_learner.learn_BPE()
# save model
save_model(bpe_smi, './models/tokenizers/bpe_DBsmi_1000.bin') # already saved while training
# train spe encoder on selfies
input_file = "./datasets/cleaned_selfiesDB.self"
output_file = "./vocab/spe_vocab/cleaned_selfiesDB" + '_' + str(vocab_size) + '.self'
spe_learner = SMILESLearner(file_name=input_file, output_file=output_file,num_symbols= vocab_size, min_frequency=1)
bpe_self , bpe_self_vocab = spe_learner.learn_BPE()
# save picle models
save_model(bpe_self, './models/tokenizers/bpe_DBself_1000.bin')
# train ByteLevelBPETokenizer on smiles
from tokenizers.implementations import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files='./datasets/smilesDB.smi', vocab_size=vocab_size, min_frequency=1)
tokenizer.save_model('./models/tokenizers/ByteLevelBPETokenizer')
save_model(tokenizer, './models/tokenizers/ByteLevelBPETokenizer.bin')






['./models/tokenizers/ByteLevelBPETokenizer/vocab.json',
 './models/tokenizers/ByteLevelBPETokenizer/merges.txt']

In [26]:
print(tokenizer.encode('COC(=O)c1c(N2C(=O)[C@@H]3[C@@H]4CCC[NH+]4[C@@]4(C(=O)Nc5c(C)cc(C)cc54)[C@@H]3C2=O)sc(C)c1C').tokens)
# load Bpe tokenizers 
bpe_smi = load_model('models/tokenizers/bpe_DBsmi_1000.bin')
bpe_self = load_model('models/tokenizers/bpe_DBself_1000.bin')
print(bpe_smi.encode('COC(=O)c1c(N2C(=O)[C@@H]3[C@@H]4CCC[NH+]4[C@@]4(C(=O)Nc5c(C)cc(C)cc54)[C@@H]3C2=O)sc(C)c1C').tokens)
# Load ByteLevelBPETokenizer's tokenizer
byteLevelBPETokenizer = load_model('models/tokenizers/ByteLevelBPETokenizer.bin')
print(byteLevelBPETokenizer.encode('COC(=O)c1c(N2C(=O)[C@@H]3[C@@H]4CCC[NH+]4[C@@]4(C(=O)Nc5c(C)cc(C)cc54)[C@@H]3C2=O)sc(C)c1C').tokens)
# load spe 
spe_vob= codecs.open('./vocabs/spe_vocab/cleaned_smilesDB_1000.smi')
spe = SPE_Tokenizer(spe_vob)
print(spe.tokenize('COC(=O)c1c(N2C(=O)[C@@H]3[C@@H]4CCC[NH+]4[C@@]4(C(=O)Nc5c(C)cc(C)cc54)[C@@H]3C2=O)sc(C)c1C'))

['COC', '(=', 'O', ')', 'c', '1', 'c', '(', 'N', '2', 'C', '(=', 'O', ')[', 'C', '@@', 'H', ']', '3', '[', 'C', '@@', 'H', ']', '4', 'CCC', '[', 'NH', '+]', '4', '[', 'C', '@@]', '4', '(', 'C', '(=', 'O', ')', 'Nc', '5', 'c', '(', 'C', ')', 'cc', '(', 'C', ')', 'cc', '54', ')[', 'C', '@@', 'H', ']', '3', 'C', '2', '=', 'O', ')', 'sc', '(', 'C', ')', 'c', '1', 'C']
['COC(=O)', 'c1c(N', '2C(=O)', '[C@@H]3', '[C@@H]4CCC', '[NH+]4', '[C@@]4(C(=O)N', 'c5', 'c(C)cc(C)cc', '5', '4)[C@@H]3C', '2=O)', 's', 'c(C)c1C']
['COC', '(=', 'O', ')', 'c', '1', 'c', '(', 'N', '2', 'C', '(=', 'O', ')[', 'C', '@@', 'H', ']', '3', '[', 'C', '@@', 'H', ']', '4', 'CCC', '[', 'NH', '+]', '4', '[', 'C', '@@]', '4', '(', 'C', '(=', 'O', ')', 'Nc', '5', 'c', '(', 'C', ')', 'cc', '(', 'C', ')', 'cc', '54', ')[', 'C', '@@', 'H', ']', '3', 'C', '2', '=', 'O', ')', 'sc', '(', 'C', ')', 'c', '1', 'C']
COC(=O) c1c( N2C(=O) [C@@H]3 [C@@H] 4 CCC [NH+] 4 [C@@] 4 (C(=O)N c5 c(C)cc (C)cc 5 4) [C@@H]3 C2=O) s c(C) c1C


In [28]:
# save tokenizers models
# bpe_smi2.save('./models/tokenizers/bpe_smi2.bin')
# bpe_self.save('./models/tokenizers/bpe_self.bin')
# some default tokens from huggingface
default_toks = ['[PAD]', 
                '[unused1]', '[unused2]', '[unused3]', '[unused4]','[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', 
                '[UNK]', '[CLS]', '[SEP]', '[MASK]']

# atom-level tokens used for trained the spe vocabulary
atom_toks = ['[c-]', '[SeH]', '[N]', '[C@@]', '[Te]', '[OH+]', 'n', '[AsH]', '[B]', 'b', 
             '[S@@]', 'o', ')', '[NH+]', '[SH]', 'O', 'I', '[C@]', '-', '[As+]', '[Cl+2]', 
             '[P+]', '[o+]', '[C]', '[C@H]', '[CH2]', '\\', 'P', '[O-]', '[NH-]', '[S@@+]', 
             '[te]', '[s+]', 's', '[B-]', 'B', 'F', '=', '[te+]', '[H]', '[C@@H]', '[Na]', 
             '[Si]', '[CH2-]', '[S@+]', 'C', '[se+]', '[cH-]', '6', 'N', '[IH2]', '[As]', 
             '[Si@]', '[BH3-]', '[Se]', 'Br', '[C+]', '[I+3]', '[b-]', '[P@+]', '[SH2]', '[I+2]', 
             '%11', '[Ag-3]', '[O]', '9', 'c', '[N-]', '[BH-]', '4', '[N@+]', '[SiH]', '[Cl+3]', '#', 
             '(', '[O+]', '[S-]', '[Br+2]', '[nH]', '[N+]', '[n-]', '3', '[Se+]', '[P@@]', '[Zn]', '2', 
             '[NH2+]', '%10', '[SiH2]', '[nH+]', '[Si@@]', '[P@@+]', '/', '1', '[c+]', '[S@]', '[S+]', 
             '[SH+]', '[B@@-]', '8', '[B@-]', '[C-]', '7', '[P@]', '[se]', 'S', '[n+]', '[PH]', '[I+]', '5', 'p', '[BH2-]', '[N@@+]', '[CH]', 'Cl']

# build spe vocab 

# spe tokens
with open('./vocabs/spe_vocab/cleaned_smilesDB' + '_' + str(vocab_size) + '.smi', "r") as ins:
    spe_toks = []
    for line in ins:
        spe_toks.append(line.split('\n')[0])
        

spe_tokens = []
for s in spe_toks:
    spe_tokens.append(''.join(s.split(' ')))
print('Number of SMILES:', len(spe_toks))
# build the vocabulary for the spe tokenizer
print(spe_tokens[:10])
spe_vocab = default_toks + atom_toks + spe_tokens
print('Number of tokens:', len(spe_vocab))
# save the vocabulary
with open('./vocabs/spe_vocab/vocab_speSMILESDB' + '_' + str(vocab_size) + '.txt', 'w') as f:
    for voc in spe_vocab:
        f.write(f'{voc}\n')

print("################################################################")
# build vocab for Bpe tokenizers
# build Bpe vocab
vocab_path = './vocabs/bpe_vocab/'
# write bpe_smi2 vocab file
vocs = list(default_toks) + list(bpe_smi_vocab.keys()) 
with open(vocab_path + 'bpe_DBsmi_vocab_' + str(vocab_size) + '.txt', 'w') as f:
    for voc in vocs:
        f.write(f'{voc.strip()}\n')


# write bpe_self vocab file
vocs = list(default_toks) + list(bpe_self_vocab.keys())
with open(vocab_path + 'bpe_DBself_vocab_' + str(vocab_size) + '.txt', 'w') as f:
    for voc in bpe_self_vocab:
        f.write(f'{voc.strip()}\n')



Number of SMILES: 851
['cc', 'CC', '(C', 'c1', 'O)', '=O)', '(=O)', 'c2', '(C)', 'c(']
Number of tokens: 981
################################################################


In [7]:
# build vocab for morfessor tokenizers
# some default tokens from huggingface
morf_model_pt='models/tokenizers/morf_smilesDB_1000.bin' #'models/tokenizers/morf_zinc_1000.bin'
morf_vocab_pt = './vocabs/morf_vocab/morf_smilesDB_vocab1000.txt' # './vocabs/morf_vocab/morf_zinc250k_vocab1000.txt'

default_toks = ['[PAD]', 
                '[unused1]', '[unused2]', '[unused3]', '[unused4]','[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', 
                '[UNK]', '[CLS]', '[SEP]', '[MASK]']
chem_morfessor= pickle.load(open(morf_model_pt, 'rb'))
print(chem_morfessor.viterbi_segment('C([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@]2([C@H]([C@@H]([C@H](O2)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)COS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O')[0])
print(len(chem_morfessor.get_constructions()))
constructions  = list(map(lambda t: t[0], chem_morfessor.get_constructions()))
# buil  vocab file
vocs = list(default_toks) + list(constructions)
with open(morf_vocab_pt, 'w') as f:
    for voc in vocs:
        f.write(f'{voc}\n')

['C', '(', '[C@@H]1', '[C@H](', '[C@@H](', '[C@H](', '[C@@H](', 'O', '1)', 'O', '[C@]2(', '[C@H](', '[C@@H](', '[C@H](', 'O', '2)', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'C', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O', ')', 'O', 'S(=O)(=O)', 'O', '[', 'A', 'l', ']', '(O)', 'O']
1002


In [5]:
chem_morfessor1= pickle.load(open('models/tokenizers/morf_zinc_1000.bin', 'rb'))
chem_morfessor2= pickle.load(open('models/tokenizers/morf_smilesDB_1000.bin', 'rb'))

print(chem_morfessor1.viterbi_segment('C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O[C@]2([C@H]([C@@H]([C@H](O2)CO)O)O)CO)O)O)O)O'))
print(chem_morfessor2.viterbi_segment('C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O[C@]2([C@H]([C@@H]([C@H](O2)CO)O)O)CO)O)O)O)O'))

(['C', '(', '[C@@H]1', '[C@H](', '[C@@H](', '[C@H](', '[C@H](', 'O', '1)', 'O', '[C@]2(', '[C@H](', '[C@@H](', '[C@H](', 'O', '2)', 'C', 'O', ')', 'O', ')', 'O', ')', 'C', 'O', ')', 'O', ')', 'O', ')', 'O', ')', 'O'], 135.78457592359314)
(['C', '(', '[C@@H]1', '[C@H](', '[C@@H](', '[C@H](', '[C@H](', 'O', '1)', 'O', '[C@]2(', '[C@H](', '[C@@H](', '[C@H](', 'O', '2)', 'C', 'O', ')', 'O', ')', 'O', ')', 'C', 'O', ')', 'O', ')', 'O', ')', 'O', ')', 'O'], 134.99606192059179)


In [34]:
chem_morfessor= pickle.load(open('models/tokenizers/morf_tox21_1000.bin', 'rb'))
segments1 = chem_morfessor.viterbi_segment('C([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@]2([C@H]([C@@H]([C@H](O2)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)COS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O)OS(=O)(=O)O[Al](O)O')[0]
segments2 = chem_morfessor.viterbi_segment('NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1')[0]
constructions = chem_morfessor.get_constructions()
# check if segments are in the constructions
for seg in segments2:
    if seg not in constructions:
        print(seg)
        print('not in constructions')
        break

NC(=O)
not in constructions


In [None]:
# checked later
# Get compounds once instead of calling it in every loop iteration
# compounds = chem_morfessor.get_compounds()
# clintox_df = pd.read_csv('./datasets/post_processed/clintox_processed_macFrag.csv')
# compounds = clintox_df['smiles'].tolist()
# print(len(compounds))
# # Use list comprehension for faster execution
# morfessor_smi = [chem_morfessor.viterbi_segment(compound)[0] for compound in compounds]

# # save segments in one list and add default tokens , then write to file
# import itertools
# morfessor_segs_1= list(itertools.chain.from_iterable(morfessor_smi))
# morfessor_segs_2 = list(itertools.chain(*morfessor_smi))
# print(len(morfessor_segs_1))
# print(len(morfessor_segs_2))
# print('constructions: ', len(chem_morfessor.get_constructions()))
# # remove redundancy from segments
# morfessor_segs = list(set(morfessor_segs_1))
# print(len(morfessor_segs))
# morfessor_segs = list(set(morfessor_segs_2))
# print(len(morfessor_segs))

# # write segments to file (build morfessor tokenizer)
# total_tokens = list(default_toks) + morfessor_segs
# with open('./vocabs/morf_vocab/morf_tox21_vocab.txt', 'w') as f:
#     for voc in total_tokens:
#         f.write(f'{voc}\n')

# morfessor_segs_1


In [35]:
#clintoxt dataset
fragmented_mols_smi = read_segmented_mols('./datasets/mac_fragments/clintox_fragments.smi')
# fragmented_mols_selfies = smiles_fragments_to_selfies_fragments(fragmented_mols_smi)
# add new column of fragments to dataframe before CT_TOX column
df_clin= pd.read_csv('datasets/pre_processed/clintox.csv')#clintox_data.copy()
# insert new colum to df_clin with name 'smi_fragments' and values from fragmented_mols_smi
df_clin['smi_mac_fragments'] = fragmented_mols_smi
# insert new colum to df_clin with name 'selfies_fragments' and values from fragmented_mols_selfies
# df_clin['selfies_fragments'] = fragmented_mols_selfies
# rearange columns order to smiles, fragments, CT_TOX
df_clin = df_clin[['smiles', 'smi_mac_fragments', 'selfies', 'CT_TOX']]
# save dataframe to csv file
df_clin.to_csv('./datasets/post_processed/clintox_processed.csv', index=False)
# tox21 dataset
fragmented_mols_smi = read_segmented_mols('datasets/mac_fragments/tox21_fragments.smi')
# fragmented_mols_selfies = smiles_fragments_to_selfies_fragments(fragmented_mols_smi)
# add new column of fragments to dataframe before NR-AR column
df_tox =  pd.read_csv('datasets/pre_processed/tox21.csv')#tox21_data.copy()
df_tox['smi_mac_fragments'] = fragmented_mols_smi
# df_tox['selfies_fragments'] = fragmented_mols_selfies
# rearange columns order to smiles, fragments, NR-AR
df_tox = df_tox[['smiles', 'smi_mac_fragments', 'selfies', 'NR-AR']]
# save dataframe to csv file
df_tox.to_csv('./datasets/post_processed/tox21_processed.csv', index=False)

In [27]:
df_clin['smi_mac_fragments'].tolist()

['Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl',
 'O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-]',
 '[5*]=CC=c1ccc(=C(N)[NH3+])cc1,[5*]=C1C=CC(/C(N)=[NH+]/[H])=CC1=O',
 '[2*]O[2*],[14*]c1ccc(/C(N)=[NH+]/[H])cc1,[3*]CCCCC[3*]',
 'O=[N+]([O-])[O-]',
 '[N]=O',
 'O=[99Tc](=O)(=O)[O-]',
 'O=P([O-])([O-])F',
 'O=S(=O)([O-])[O-]',
 'O=S([O-])([O-])=S',
 '[Se]',
 '[3*][C@@H](CC(C)C)B(O)O,[12*]c1cnccn1,[1*]C(=O)[C@@H]([3*])C[6*],[14*]c1ccccc1,[1*]C([1*])=O,[4*]N[4*]',
 '[3*][C@@H](CC(C)C)B(O)O,[12*]c1cnccn1,[1*]C(=O)[C@@H]([3*])C[6*],[14*]c1ccccc1,[1*]C([1*])=O,[4*]N[4*]',
 '[1*]C(=O)C[3*],[3*][C@@H](CC(C)C)B(O)O,[1*]C([1*])=O,[4*]N[4*],[14*]c1cc(Cl)ccc1Cl',
 '[2*]O[2*],[1*]C(N)=O,[3*]C1(C#C)CCCCC1',
 '[4*][NH2+][4*],[3*]CC#C,[3*][C@@H]1CCc2ccccc21',
 '[12*]c1cnc2nc(N)nc(N)c2n1,[3*][C@@H](CCC(=O)[O-])C(=O)[O-],[1*]C([1*])=O,[4*]N[4*],[6*]CC([6*])CC#C,[14*]c1ccc([14*])cc1',
 'C#N',
 'N#C[Fe-2](C#N)(C#N)(C#N)(C#N)N=O',
 '[6*][C@H](O)CO,[11*][C@H]1OC(=O)C(O)=C1[O-]',
 '[NH3+][C@@

In [None]:
import codecs
from SmilesPE.tokenizer import *
clintox_smiles = df_clin['smiles'].tolist()
clintox_selfies = df_clin['selfies'].tolist()
spe_vob= codecs.open('./spe_vocab/SPE_ChEMBL.txt')
spe_vocab_smi1 = codecs.open('./spe_vocab/clin_tox21_zinc.smi')
print("spe vocab size of spe: ", len(spe_vob.readlines()))
print("spe vocab size of spe_smi1: ",len(spe_vocab_smi1.readlines()))
print("spe vocab size of spe_smi2: ",spe_smi2.get_vocab_size())
spe = SPE_Tokenizer(spe_vob)
# check validity of all fragments
for smile  in clintox_smiles:
    print(smile)
    print(spe.tokenize(smile).split(' '))
    print(spe_smi1.tokenize(smile).split(' '))
    print(spe_smi2.encode(smile).tokens)
    print('-----------------')

In [None]:
spe_smi2.get_vocab()
list(spe_smi2.get_vocab().keys())