In [1]:
import deepsmiles as ds

In [2]:
import pandas as pd

# Load the dataset from a Parquet file; the path is relative to the
# notebook's working directory.
df = pd.read_parquet('data/RNN_dataset_ECFP.parquet')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,smiles,fps
0,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,"[94, 389, 456, 650, 807, 875, 883, 935, 1039, ..."
1,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,"[24, 41, 80, 92, 197, 252, 255, 314, 325, 333,..."
2,O=C(c1csnn1)N1CCCC2(CCN(c3ncccn3)C2)C1,"[18, 180, 216, 338, 369, 378, 395, 399, 407, 4..."
3,CCC(C(=O)NCc1ccco1)n1nc(C)c2c(C)n(-c3ccc(C)cc3...,"[1, 80, 92, 122, 197, 233, 235, 255, 294, 302,..."
4,O=S(=O)(NCc1ccc2c(c1)OCO2)c1c[nH]cn1,"[29, 80, 84, 255, 310, 350, 378, 441, 502, 625..."


In [4]:
from profis.utils.vectorizer import SMILESVectorizer

# DeepSMILES converter with both ring and branch rewriting enabled, plus the
# project tokenizer used to split each encoded string into tokens.
converter = ds.Converter(rings=True, branches=True)
vectorizer = SMILESVectorizer()

# Add two derived columns: the DeepSMILES encoding of every SMILES string,
# and the token list obtained by splitting that encoding.
df['deepsmiles'] = df['smiles'].apply(converter.encode)
df['split'] = df['deepsmiles'].apply(vectorizer.split)

In [5]:
# Preview the frame again, now including the `deepsmiles` and `split` columns.
df.head()

Unnamed: 0,smiles,fps,deepsmiles,split
0,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,"[94, 389, 456, 650, 807, 875, 883, 935, 1039, ...",CccccC)n5-cccccC=O)O))c6,"[C, c, c, c, c, C, ), n, 5, -, c, c, c, c, c, ..."
1,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,"[24, 41, 80, 92, 197, 252, 255, 314, 325, 333,...",CcncccC)c6CC=O)NCcccco5)))))))))))c=O)[nH]n5C,"[C, c, n, c, c, c, C, ), c, 6, C, C, =, O, ), ..."
2,O=C(c1csnn1)N1CCCC2(CCN(c3ncccn3)C2)C1,"[18, 180, 216, 338, 369, 378, 395, 399, 407, 4...",O=Cccsnn5)))))NCCCCCCNcncccn6))))))C5))))C6,"[O, =, C, c, c, s, n, n, 5, ), ), ), ), ), N, ..."
3,CCC(C(=O)NCc1ccco1)n1nc(C)c2c(C)n(-c3ccc(C)cc3...,"[1, 80, 92, 122, 197, 233, 235, 255, 294, 302,...",CCCC=O)NCcccco5))))))))nncC)ccC)n-ccccC)cc6)))...,"[C, C, C, C, =, O, ), N, C, c, c, c, c, o, 5, ..."
4,O=S(=O)(NCc1ccc2c(c1)OCO2)c1c[nH]cn1,"[29, 80, 84, 255, 310, 350, 378, 441, 502, 625...",O=S=O)NCcccccc6)OCO5))))))))))cc[nH]cn5,"[O, =, S, =, O, ), N, C, c, c, c, c, c, c, 6, ..."


In [6]:
# Pull the `split` column out into a plain Python list (one entry per row)
# so the loops below iterate over it directly.
splits = df['split'].tolist()

In [8]:
from tqdm import tqdm

# Build the token alphabet: every distinct token, in order of first appearance.
# A dict serves as an insertion-ordered set, so each membership check is a
# hash lookup rather than a linear scan of a growing list. Updating with keys
# that are already present leaves their original position untouched.
alphabet_index = {}
for split in tqdm(splits):
    alphabet_index.update(dict.fromkeys(split))
alphabet = list(alphabet_index)

100%|██████████| 1126085/1126085 [00:05<00:00, 212922.74it/s]


In [10]:
from collections import Counter

# Tally how often each token occurs across all tokenised strings.
# Counter takes its keys from the data itself, so this cell does not rely on
# `alphabet` being in sync with `splits` (pre-seeding from a stale alphabet
# would raise KeyError on an unseen token). Keys are kept in first-appearance
# order, and the result is exposed as a plain dict.
token_counter = Counter()
for split in tqdm(splits):
    token_counter.update(split)
counts = dict(token_counter)

100%|██████████| 1126085/1126085 [00:05<00:00, 215217.92it/s]


In [17]:
# Show the collected token alphabet (tokens appear in first-seen order).
alphabet

['C',
 'c',
 ')',
 'n',
 '5',
 '-',
 '=',
 'O',
 '6',
 'N',
 'o',
 '[nH]',
 's',
 '9',
 'S',
 '%10',
 'F',
 'Cl',
 'Br',
 '#',
 '3',
 '4',
 '%18',
 '%13',
 '[N+]',
 '7',
 '%20',
 '%15',
 '%11',
 '8',
 '%19',
 '%21',
 'P',
 'I',
 '%14',
 '%12',
 '%17',
 '[n+]',
 '%16',
 '%22',
 '%24',
 '%23',
 '%28',
 '%27',
 '%26',
 '%25']

In [16]:
from itertools import islice

# Collect the first 10 DeepSMILES strings that contain a '%' character, i.e.
# the multi-digit `%NN` tokens seen in the alphabet. islice stops the scan as
# soon as 10 matches are found instead of filtering the whole column first.
percents = list(islice((s for s in df['deepsmiles'] if '%' in s), 10))

# Decode each example back to SMILES and pair it with its DeepSMILES form so
# the two notations can be compared side by side.
smiles = [converter.decode(p) for p in percents]
pairs = list(zip(percents, smiles))
pairs

[('O=CcccO)ncccccc%106))))))))))NCCNcccccCF)F)F))c6))))))CC6',
  'O=C(c1cc(O)nc2ccccc12)N4CCN(c3cccc(C(F)(F)F)c3)CC4'),
 ('O=CNCCCCcncNCCCCC6))))))ncc6%10)))))))))))CCCCC5',
  'O=C(NC3CCCc2nc(N1CCCCC1)ncc23)C4CCCC4'),
 ('COC=O)CNC=O)CSccccS=O)=O)NccccCC)=O))cc6))))))))cc6%10',
  'COC(=O)CN3C(=O)CSc2ccc(S(=O)(=O)Nc1ccc(C(C)=O)cc1)cc23'),
 ('O=ccCl)cocccccc6nc-%10cccccc%186', 'O=c3c(Cl)c2oc1ccccc1nc-2c4ccccc34'),
 ('COC=CNCCOCC6))))))C=O)cccccc6C%10=O', 'COC3=C(N1CCOCC1)C(=O)c2ccccc2C3=O'),
 ('Ccccc[nH]cc=O)nCC=O)NccccBr)cc6C))))))))))ncc6c9c%13',
  'Cc4ccc3[nH]c2c(=O)n(CC(=O)Nc1ccc(Br)cc1C)ncc2c3c4'),
 ('Ccncccccc6nc%10OCC=O)NCCNcccccc6))))))CC6',
  'Cc2nc1ccccc1nc2OCC(=O)N4CCN(c3ccccc3)CC4'),
 ('COccccS=O)=O)NCC)C)C))))cc6-cnnccccccc6cC)nn%13%10',
  'COc1ccc(S(=O)(=O)NC(C)(C)C)cc1-c3nnc4c2ccccc2c(C)nn34'),
 ('CS=O)=O)NccccC=NNSC)=O)=O))Cccccnccnc6c%10))))))))))C5)))))cc6',
  'CS(=O)(=O)Nc4ccc(C3=NN(S(C)(=O)=O)C(c2ccc1nccnc1c2)C3)cc4'),
 ('N=CN)cccccOCccccccCOcccccC=N)N))c6))))))))ccc6c

In [18]:
import selfies as sf

In [19]:
# Example SMILES string used to try out the SELFIES encoder; it matches one of
# the decoded DeepSMILES examples shown in the preceding cell's output.
test_smile = 'CS(=O)(=O)Nc4ccc(C3=NN(S(C)(=O)=O)C(c2ccc1nccnc1c2)C3)cc4'

In [20]:
# Run the example SMILES through the selfies encoder.
sf_selfie = sf.encoder(test_smile)

In [21]:
# Display the encoded string for inspection.
sf_selfie

'[C][S][=Branch1][C][=O][=Branch1][C][=O][N][C][=C][C][=C][Branch2][Ring2][Ring1][C][=N][N][Branch1][=Branch2][S][Branch1][C][C][=Branch1][C][=O][=O][C][Branch1][#C][C][=C][C][=C][N][=C][C][=N][C][Ring1][=Branch1][=C][Ring1][#Branch2][C][Ring2][Ring1][Ring1][C][=C][Ring2][Ring1][=Branch2]'