In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the ESOL solubility table from disk, report how many rows it holds,
# and leave a five-row preview as the cell's displayed result.
data = pd.read_csv(
    './source/esol.csv',
    sep=',',
)
print(data.shape[0])
data.head()

1128


Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of molecule sizes: count how many molecules fall into each
# 10-atom bucket, using the atom count reported by mol.GetNumAtoms().
data_smiles = data['smiles'].values.tolist()
data_labels = data['measured log solubility in mols per litre'].values.tolist()
# Bins 0-9 hold molecules with [10*i, 10*(i+1)) atoms; bin 10 is the overflow
# bin and holds every molecule with 100 or more atoms.
distributions = np.zeros(11)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals an unparsable SMILES by returning None; skip it
    # explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Skip molecules that fail sanitization. Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit working.
        continue
    num_atoms = mol.GetNumAtoms()
    atom_num = num_atoms // 10
    # Anything at or beyond bucket 10 is folded into the last (overflow) bin.
    distributions[min(atom_num, 10)] += 1
    max_length = max(max_length, num_atoms)

for i in range(11):
    if i < 10:
        print(f'{i*10} - {(i+1) * 10}: {distributions[i]}')
    else:
        # NOTE: this bin also contains molecules with exactly 100 atoms.
        print(f'> {i * 10}: {distributions[i]}')
print(f'max length: {max_length}')
# The histogram total equals the number of molecules that survived the checks.
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 409.0
10 - 20: 505.0
20 - 30: 196.0
30 - 40: 15.0
40 - 50: 1.0
50 - 60: 2.0
60 - 70: 0.0
70 - 80: 0.0
80 - 90: 0.0
90 - 100: 0.0
> 100: 0.0
max length: 55
source: 1128, after sanitize: 1128.0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset: keep only molecules that parse, sanitize, and
# have fewer than 100 atoms, together with their solubility labels. Also
# record every element symbol seen, mapped to its atomic number.
element_dict = {}
data_smile = data['smiles'].values.tolist()
data_label = data['measured log solubility in mols per litre'].values.tolist()

data_san_mol, data_san_label = [], []
# list(zip(...)) gives the progress bar a known length.
for smile, label in tqdm_notebook(list(zip(data_smile, data_label))):
    mol = Chem.MolFromSmiles(smile)

    # MolFromSmiles returns None for an unparsable SMILES; skip it explicitly
    # instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue

    # Skip molecules that fail sanitization. Catching Exception (not a bare
    # except) keeps KeyboardInterrupt/SystemExit working.
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        continue

    # Skip molecules with 100 or more atoms.
    if mol.GetNumAtoms() >= 100:
        continue

    # The two lists are appended in lockstep, so index i in one matches
    # index i in the other.
    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=1128), HTML(value='')))




In [5]:
# Mean and (population) standard deviation of the retained labels, followed by
# a size check of both lists and the element table collected earlier.
data_mean, data_std = np.mean(data_san_label), np.std(data_san_label)
print(len(data_san_mol), len(data_san_label), data_mean, data_std)
print(element_dict)

1128 1128 -3.05010195035461 2.0955117304559443
{'O': 8, 'C': 6, 'N': 7, 'S': 16, 'Cl': 17, 'P': 15, 'F': 9, 'I': 53, 'Br': 35}


In [6]:
# Eyeball the molecule/label pairing: print the first 101 entries
# (indices 0 through 100) as canonical SMILES next to their label.
for idx, (mol, label) in enumerate(zip(data_san_mol[:101], data_san_label[:101])):
    print(idx, Chem.MolToSmiles(mol), label)

print()
# Largest atom count among the retained molecules.
max_length = max(m.GetNumAtoms() for m in data_san_mol)
print(f"max length is {max_length}")

0 N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)c1ccccc1 -0.77
1 Cc1occc1C(=O)Nc1ccccc1 -3.3
2 CC(C)=CCCC(C)=CC=O -2.06
3 c1ccc2c(c1)ccc1c2ccc2c3ccccc3ccc21 -7.87
4 c1ccsc1 -1.33
5 c1ccc2scnc2c1 -1.5
6 Clc1cc(Cl)c(-c2c(Cl)cccc2Cl)c(Cl)c1 -7.32
7 CC12CCC3c4ccc(O)cc4CCC3C1CCC2O -5.03
8 ClC1=C(Cl)C2(Cl)C3C4CC(C5OC45)C3C1(Cl)C2(Cl)Cl -6.29
9 C=C(C)C1Cc2c(ccc3c2OC2COc4cc(OC)c(OC)cc4C2C3=O)O1 -4.42
10 O=C1CCCN1 1.07
11 Clc1ccc2ccccc2c1 -4.14
12 C=CCCC -2.68
13 CCC1(c2ccccc2)C(=O)NCNC1=O -2.64
14 CCCCCCCCCCCCCC -7.96
15 CC(C)Cl -1.41
16 CCC(C)CO -0.47
17 N#Cc1ccccc1 -1.0
18 CCOP(=S)(OCC)Oc1cc(C)nc(C(C)C)n1 -3.64
19 CCCCCCCCCC(C)O -2.94
20 Clc1ccc(-c2c(Cl)ccc(Cl)c2Cl)c(Cl)c1 -7.43
21 O=c1[nH]c2c(c(=O)n1C1CCCCC1)CCC2 -4.5939999999999985
22 CCOP(=S)(OCC)SCSCC -4.11
23 CCOc1ccc(NC(C)=O)cc1 -2.35
24 CCN(CC)c1c([N+](=O)[O-])cc(C(F)(F)F)c(N)c1[N+](=O)[O-] -5.47
25 CCCCCCCO -1.81
26 Cn1c(=O)c2[nH]cnc2n(C)c1=O -1.39
27 CCCCC1(CC)C(=O)NC(=O)NC1=O -1.661
28 ClC(Cl)=C(c1ccc(Cl)cc1)c1ccc(Cl)cc1 -6.9
29 CCC

In [7]:
# Store the cleaned molecules, their labels, and the label mean/std as a
# single pickled list so later steps can reload them without re-parsing.
import os
import pickle as pkl

# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./preprocess', exist_ok=True)
with open('./preprocess/esol.pickle','wb') as fw:
    pkl.dump([data_san_mol, data_san_label, data_mean, data_std],fw)

In [8]:
# Reload the preprocessed dataset written by the previous cell and print the
# same size/statistics summary as a round-trip check.
# Only unpickle files you produced yourself: unpickling can execute code.
import pickle as pkl
with open('./preprocess/esol.pickle', mode='rb') as f:
    data_san_mol, data_san_label, data_mean, data_std = pkl.load(f)
print(len(data_san_mol), len(data_san_label), data_mean, data_std)

1128 1128 -3.05010195035461 2.0955117304559443
