In [26]:
import pickle as pkl
import pandas as pd 
import numpy as np
from tqdm import tqdm 
from rdkit.Chem import MolFromSmiles, rdFingerprintGenerator
from rdkit.DataStructs import TanimotoSimilarity, BulkTanimotoSimilarity
# Seed NumPy's legacy global RNG so that any np.random.* call made later in
# the notebook is reproducible on a fresh "Restart & Run All".
np.random.seed(42)

In [27]:
# Read the reference split table; its 'name' column becomes the index.
msgdf = pd.read_csv('../data/msg/split.tsv', sep = '\t', index_col='name')

# Look the counts up by label instead of unpacking value_counts() positionally:
# value_counts() sorts by frequency, so positional unpacking silently swaps
# 'val' and 'test' whenever their relative sizes change. Series.value_counts()
# also replaces the deprecated top-level pd.value_counts().
split_counts = msgdf['split'].value_counts()
train, val, test = (split_counts[label] for label in ('train', 'val', 'test'))

# Fractions of each split; these are reused below to size the new split.
df_msg_len = train + val + test
train_p  = train / df_msg_len
val_p = val / df_msg_len
test_p = test / df_msg_len
# NOTE: ':2f' is a minimum-width spec (six decimals are printed), not '.2f';
# it is left unchanged so the recorded cell output stays valid.
print(f'train: {train_p* 100:2f} %\ntest: {test_p* 100:2f} %\nval: {val_p* 100:2f} %')

train: 83.996383 %
test: 7.596580 %
val: 8.407038 %


In [28]:
# Load the DataFrame stored in df_neims_gecko_TMS.pkl.
# Unpickling can execute arbitrary code, so only load this file from a
# trusted source.
with open('../data/neims/df_neims_gecko_TMS.pkl', 'rb') as pkl_handle:
    df = pkl.load(pkl_handle)
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,SMILES,spec
0,C=O,"[[14, 15, 16, 18, 19, 25, 26, 27, 28, 29, 30, ..."
1,O=C[N+](=O)[O-],"[[14, 16, 17, 18, 19, 27, 28, 29, 30, 31, 32, ..."
2,O=CC(=O)[N+](=O)[O-],"[[14, 15, 16, 17, 18, 19, 25, 27, 28, 29, 30, ..."
3,C[Si](C)(C)OC(=O)C(=O)[N+](=O)[O-],"[[14, 15, 16, 18, 25, 26, 27, 28, 29, 30, 31, ..."
4,C[Si](C)(C)OOC(=O)C(=O)[N+](=O)[O-],"[[14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 33, ..."
...,...,...
166429,CC1(O[Si](C)(C)C)C(C(O[Si](C)(C)C)C(=O)OO[N+](...,"[[14, 15, 16, 26, 27, 28, 29, 30, 31, 32, 39, ..."
166430,CC1(O[Si](C)(C)C)C(C(C=O)O[Si](C)(C)C)OOC1(O[S...,"[[2, 8, 14, 15, 18, 20, 26, 27, 28, 29, 30, 31..."
166431,CC1(O[Si](C)(C)C)C(C(=O)C=O)OOC1(O[Si](C)(C)C)...,"[[14, 15, 16, 18, 20, 25, 26, 27, 28, 29, 30, ..."
166432,CC1(O[Si](C)(C)C)C(C(=O)OO[N+](=O)[O-])OOC1(O[...,"[[14, 15, 26, 27, 28, 29, 30, 31, 32, 39, 41, ..."


In [29]:
# Parse each SMILES string into an RDKit molecule, one per row.
# NOTE(review): MolFromSmiles presumably yields None for unparsable SMILES -
# confirm whether such rows need to be filtered out.
df['mol'] = [MolFromSmiles(smiles) for smiles in df['SMILES']]

In [30]:
# Disabled exploratory check, kept as a bare string literal: nothing inside it
# is executed, the cell only echoes the string back as its output.
# If re-enabled, it would build Morgan fingerprints (radius 2, 2048 bits) for
# every molecule in df['mol'] and count the pairs whose Tanimoto similarity is
# exactly 1.0. The inline "# 306" is presumably the count from an earlier run -
# TODO confirm.
'''from rdkit import DataStructs
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)
from tqdm import tqdm
fps = [morgan_generator.GetFingerprint(mol) for mol in tqdm(df['mol'].values, desc="Generating fingerprints")]
 
num_dupes = 0 # 306
n = len(fps)
for i in tqdm(range(n), desc="Finding duplicates"):
    sims = BulkTanimotoSimilarity(fps[i], fps[i+1:])  # vectorized similarity to rest
    num_dupes += sum(1 for sim in sims if sim == 1.0)
print(num_dupes)
'''

'from rdkit import DataStructs\nmorgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)\nfrom tqdm import tqdm\nfps = [morgan_generator.GetFingerprint(mol) for mol in tqdm(df[\'mol\'].values, desc="Generating fingerprints")]\n \nnum_dupes = 0 # 306\nn = len(fps)\nfor i in tqdm(range(n), desc="Finding duplicates"):\n    sims = BulkTanimotoSimilarity(fps[i], fps[i+1:])  # vectorized similarity to rest\n    num_dupes += sum(1 for sim in sims if sim == 1.0)\nprint(num_dupes)\n'

In [31]:
# Label the index 'name' so the exported TSV gets a 'name' header for its
# index column, the same column name ../data/msg/split.tsv is read with
# (index_col='name').
df.index = df.index.rename('name')

In [32]:
# Assign every row of df to train/val/test at random, using the same
# proportions as the reference split (train_p / val_p / test_p).
df_len = len(df)

# Round the train and val sizes, then give the remainder to test. Rounding all
# three fractions independently can be off by one in total, which made the
# original sanity assert fail for some dataset sizes. Whenever the three
# rounded sizes did add up, the remainder is the same number as before.
n_train = round(train_p * df_len)
n_val = round(val_p * df_len)
n_test = df_len - n_train - n_val
assert n_test >= 0, 'train and val sizes exceed the number of rows'
num_train_val_test = (n_train, n_val, n_test)
assert sum(num_train_val_test) == len(df)

trains = np.full(num_train_val_test[0], 'train')
vals = np.full(num_train_val_test[1], 'val')
tests = np.full(num_train_val_test[2], 'test')
split = np.hstack([trains, vals, tests])

# Shuffle with a dedicated, locally seeded generator so that re-running this
# cell always gives the same assignment. RandomState(42) produces the same
# stream as the global RNG after np.random.seed(42), so on a fresh top-to-bottom
# run the permutation should match the previous np.random.shuffle(split) -
# assuming nothing else drew from the global RNG in between.
np.random.RandomState(42).shuffle(split)
df['split'] = split

In [None]:
# Write the split assignment to disk as a tab-separated file: the spectrum,
# SMILES and molecule columns are dropped, the index and all remaining columns
# are kept.
split_table = df.drop(columns=['spec', 'SMILES', 'mol'])
split_table.to_csv('../data/neims/split.tsv', sep='\t')