In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the FreeSolv table (comma-separated, which is read_csv's default),
# report how many rows it has, and preview the first few records.
data = pd.read_csv('./source/freesolv.csv')
print(data.shape[0])
data.head()

642


Unnamed: 0,iupac,smiles,expt,calc
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,3-methylbut-1-ene,CC(C)C=C,1.83,2.452
3,2-ethylpyrazine,CCc1cnccn1,-5.45,-5.809
4,heptan-1-ol,CCCCCCCO,-4.21,-2.917


In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of molecule sizes (mol.GetNumAtoms()) in buckets of 10 atoms,
# plus the largest atom count seen across the dataset.
data_smiles = data['smiles'].values.tolist()
data_labels = data['expt'].values.tolist()

bin_width = 10
num_bins = 11  # ten buckets covering 0-99 atoms plus one overflow bucket
distributions = np.zeros(num_bins)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles returns None when the SMILES cannot be parsed; skip those
    # explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Sanitization failed: leave the molecule out of the statistics.
        # `except Exception` (rather than a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit propagate.
        continue
    num_atoms = mol.GetNumAtoms()
    # Molecules with 100 or more atoms all land in the final overflow bucket.
    distributions[min(num_atoms // bin_width, num_bins - 1)] += 1
    max_length = max(max_length, num_atoms)

for i in range(num_bins):
    if i < num_bins - 1:
        print(f'{i * bin_width} - {(i + 1) * bin_width}: {distributions[i]}')
    else:
        print(f'> {i * bin_width}: {distributions[i]}')
print(f'max length: {max_length}')
# The histogram total equals the number of molecules that were not skipped.
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 441.0
10 - 20: 188.0
20 - 30: 13.0
30 - 40: 0.0
40 - 50: 0.0
50 - 60: 0.0
60 - 70: 0.0
70 - 80: 0.0
80 - 90: 0.0
90 - 100: 0.0
> 100: 0.0
max length: 24
source: 642, after sanitize: 642.0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset: molecules that parse and sanitize, paired with
# their 'expt' labels, plus a symbol -> atomic number map of every element
# that occurs in the retained molecules.
element_dict = {}
data_smile = data['smiles'].values.tolist()
data_label = data['expt'].values.tolist()

max_atoms = 100  # molecules with this many atoms or more are dropped

data_san_mol, data_san_label = [], []
for smile, label in tqdm_notebook(list(zip(data_smile, data_label))):
    mol = Chem.MolFromSmiles(smile)

    # MolFromSmiles returns None when the SMILES cannot be parsed; skip those
    # explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue

    # check the sanitizemol
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Drop molecules that fail sanitization. `except Exception` (rather
        # than a bare `except:`) still lets KeyboardInterrupt / SystemExit
        # propagate, so the loop can be interrupted.
        continue

    # Drop molecules with max_atoms or more atoms.
    if mol.GetNumAtoms() >= max_atoms:
        continue

    # Molecule and label are appended together so the two lists stay aligned.
    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=642), HTML(value='')))




In [5]:
# Mean and standard deviation of the retained labels (np.std with its default
# arguments), followed by a size check of the two lists and the element map.
data_mean, data_std = np.mean(data_san_label), np.std(data_san_label)
print(
    len(data_san_mol),
    len(data_san_label),
    data_mean,
    data_std,
)
print(element_dict)

642 642 -3.8030062305295944 3.844822204602953
{'C': 6, 'N': 7, 'O': 8, 'S': 16, 'Cl': 17, 'Br': 35, 'P': 15, 'F': 9, 'I': 53}


In [6]:
# Check list order: print index, RDKit-regenerated SMILES and label for the
# first 101 retained molecules so they can be compared by eye with the table.
for idx, (mol, label) in enumerate(zip(data_san_mol[:101], data_san_label[:101])):
    print(idx, Chem.MolToSmiles(mol), label)

print()
# Largest atom count among the retained molecules.
max_length = max(mol.GetNumAtoms() for mol in data_san_mol)
print("max length is {}".format(max_length))

0 COc1ccc(C(=O)N(C)C)cc1 -11.01
1 CS(=O)(=O)Cl -4.87
2 C=CC(C)C 1.83
3 CCc1cnccn1 -5.45
4 CCCCCCCO -4.21
5 Cc1cc(C)cc(O)c1 -6.27
6 CC(C)C(C)C 2.34
7 CCCC(C)(C)O -3.92
8 C[C@H]1CCCC[C@H]1C 1.58
9 CC[C@H](C)O -4.62
10 BrCBr -1.96
11 CC[C@@H](O)C(C)C -3.88
12 CCc1ccccn1 -4.33
13 CCCCC(=O)OCC -2.49
14 Sc1ccccc1 -2.55
15 CC(C)=CCC/C(C)=C\CO -4.78
16 c1ccc2c(c1)CCC2 -1.46
17 CCOc1ccccc1 -2.22
18 Oc1ccc(Br)cc1 -5.85
19 CCCC(C)(C)C 2.88
20 CC(=O)OCCOC(C)=O -6.34
21 CCOP(=S)(OCC)SCSP(=S)(OCC)OCC -6.1
22 OC1CCCCCC1 -5.48
23 COC(=O)C1CC1 -4.1
24 N#Cc1ccccc1 -4.1
25 CCCCC#N -3.52
26 CC(C)(C)O -4.47
27 CC(C)C(=O)C(C)C -2.74
28 CCC=O -3.43
29 CN(C)C=O -7.81
30 Cc1ccc(C)cc1 -0.8
31 C=CCC=C 0.93
32 Cc1cccc(Nc2ccccc2C(=O)O)c1C -6.78
33 CN(C)C(=O)c1ccccc1 -9.29
34 CCNCC -4.07
35 CC(C)(C)c1ccc(O)cc1 -5.91
36 CC(C)CCOC=O -2.13
37 CCCCCCCCCCO -3.64
38 CCOC(=O)CC -2.68
39 CCCCCCCCC 3.13
40 CNC(C)=O -10.0
41 C=CCCCCCCC 2.06
42 Oc1ccc2ccccc2c1 -8.11
43 Clc1ccc(Cl)c(Cl)c1 -1.12
44 OC[C@@H](O)[C@@H](O)[C@H](O)[

In [7]:
# store as a pickle file
import os
import pickle as pkl

# Make sure the output directory exists; otherwise open(..., 'wb') raises
# FileNotFoundError when ./preprocess has not been created yet.
os.makedirs('./preprocess', exist_ok=True)

# Saved as one list: [molecules, labels, label mean, label std].
with open('./preprocess/freesolv.pickle','wb') as fw:
    pkl.dump([data_san_mol, data_san_label, data_mean, data_std],fw)

In [8]:
# Reload the preprocessed dataset from the pickle file and print the same
# size/mean/std summary as a sanity check.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
import pickle as pkl

with open('./preprocess/freesolv.pickle', 'rb') as f:
    data_san_mol, data_san_label, data_mean, data_std = pkl.load(f)

print(len(data_san_mol), len(data_san_label), data_mean, data_std)

642 642 -3.8030062305295944 3.844822204602953
