In [78]:
import pickle as pkl
import pandas as pd 
import numpy as np
from tqdm import tqdm 
from rdkit.Chem import MolFromSmiles, rdFingerprintGenerator
from rdkit.DataStructs import TanimotoSimilarity
np.random.seed(seed=42)

In [51]:
msgdf = pd.read_csv('../data/msg/split.tsv', sep = '\t', index_col='name')
train, val, test = pd.value_counts(msgdf['split'])
df_msg_len = train + val + test
train_p  = train / df_msg_len
val_p = val / df_msg_len
test_p = test / df_msg_len
print(f'train: {train_p* 100:2f} %\ntest: {test_p* 100:2f} %\nval: {val_p* 100:2f} %')

train: 83.996383 %
test: 7.596580 %
val: 8.407038 %


In [52]:
with open('../data/neims/df_neims_gecko.pkl', 'rb') as file:
    df = pkl.load(file)
df

Unnamed: 0,SMILES,spec
0,C(=O),"[[14, 15, 16, 18, 19, 25, 26, 27, 28, 29, 30, ..."
1,C(=O)(N(=O)(=O)),"[[14, 16, 17, 18, 19, 27, 28, 29, 30, 31, 32, ..."
2,C(=O)(N(=O)(=O))C(=O),"[[14, 15, 16, 17, 18, 19, 25, 27, 28, 29, 30, ..."
3,C(=O)(N(=O)(=O))C(=O)(O),"[[14, 15, 16, 17, 18, 19, 25, 27, 28, 29, 30, ..."
4,C(=O)(N(=O)(=O))C(=O)(OO),"[[14, 15, 16, 17, 18, 19, 25, 26, 27, 28, 29, ..."
...,...,...
166429,CC1(O)C(O)(C(=O)(OON(=O)(=O)))OOC1C(O)C(=O)(OO...,"[[14, 15, 16, 17, 18, 19, 26, 27, 28, 29, 30, ..."
166430,CC1(O)C(O)(C(=O)(OON(=O)(=O)))OOC1C(O)C(=O),"[[14, 15, 16, 17, 18, 19, 25, 26, 27, 28, 29, ..."
166431,CC1(O)C(O)(C(=O)(OON(=O)(=O)))OOC1C(=O)C(=O),"[[14, 15, 16, 17, 18, 19, 25, 26, 27, 28, 29, ..."
166432,CC1(O)C(O)(C(=O)(OON(=O)(=O)))OOC1C(=O)(OON(=O...,"[[14, 15, 16, 26, 27, 28, 29, 30, 31, 32, 38, ..."


In [56]:
df['mol'] = df['SMILES'].apply(MolFromSmiles)

In [64]:
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

In [80]:
mols = df['mol'].values
fps = [morgan_generator.GetFingerprint(mol) for mol in tqdm(mols, desc="Computing fingerprints")]

num_dupes = 0
n = len(fps)
for i in tqdm(range(n), desc="Checking for duplicates"):
    for j in range(i + 1, n):
        if TanimotoSimilarity(fps[i], fps[j]) == 1.0:
            num_dupes += 1  

Computing fingerprints: 100%|██████████| 166434/166434 [00:02<00:00, 73354.61it/s]
Checking for duplicates:   1%|          | 1593/166434 [03:23<5:51:18,  7.82it/s]


KeyboardInterrupt: 

In [None]:
print(num_dupes)

In [53]:
df.index.name = 'name'

In [54]:
df_len = len(df)
num_train_val_test = (round(train_p * df_len), round(val_p * df_len), round(test_p * df_len))
assert sum(num_train_val_test) == len(df)
trains = np.full(num_train_val_test[0], 'train')
vals = np.full(num_train_val_test[1], 'val')
tests = np.full(num_train_val_test[2], 'test')
split = np.hstack([trains, vals, tests])
np.random.shuffle(split)
df['split'] = split

In [55]:
df.drop(columns=['spec', 'SMILES']).to_csv('../data/neims/split.tsv', sep='\t')