In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the lipophilicity table (comma-separated, which is read_csv's default
# delimiter), report how many rows it has, and preview the first few records.
data = pd.read_csv('./source/lipophilicity.csv')
print(data.shape[0])
data.head()

4200


Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of per-molecule atom counts (as reported by GetNumAtoms), in
# buckets of 10 atoms. Indices 0-9 cover [0, 10), [10, 20), ..., [90, 100);
# index 10 collects every molecule with 100 atoms or more.
data_smiles = data['smiles'].values.tolist()
data_labels = data['exp'].values.tolist()
distributions = np.zeros(11)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # RDKit reports an unparsable SMILES by returning None instead of raising,
    # so skip those explicitly rather than relying on SanitizeMol(None) failing.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Skip molecules that fail sanitization. Catching Exception (not a
        # bare except) lets KeyboardInterrupt / SystemExit still stop the cell.
        continue
    num_atoms = mol.GetNumAtoms()
    atom_num = num_atoms // 10
    # Everything at or beyond bucket 10 is folded into the final overflow bucket.
    distributions[min(atom_num, 10)] += 1
    max_length = max(max_length, num_atoms)

for i in range(11):
    if i < 10:
        print(f'{i*10} - {(i+1) * 10}: {distributions[i]}')
    else:
        # NOTE: the label reads "> 100", but this bucket also contains
        # molecules with exactly 100 atoms (see the bucketing above).
        print(f'> {i * 10}: {distributions[i]}')
print(f'max length: {max_length}')
# The histogram total equals the number of molecules that survived the checks.
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 8.0
10 - 20: 729.0
20 - 30: 1819.0
30 - 40: 1516.0
40 - 50: 117.0
50 - 60: 6.0
60 - 70: 2.0
70 - 80: 1.0
80 - 90: 0.0
90 - 100: 0.0
> 100: 2.0
max length: 115
source: 4200, after sanitize: 4200.0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset used by the following cells:
#   data_san_mol   - RDKit molecules that parsed, sanitized, and have < 100 atoms
#   data_san_label - the matching 'exp' values, index-aligned with data_san_mol
#   element_dict   - element symbol -> atomic number for every atom kept
element_dict = {}
data_smile = data['smiles'].values.tolist()
data_label = data['exp'].values.tolist()

data_san_mol,data_san_label = [],[]
for smile,label in tqdm_notebook(list(zip(data_smile, data_label))):
    mol = Chem.MolFromSmiles(smile)

    # RDKit returns None (rather than raising) for a SMILES it cannot parse;
    # skip it explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue

    # check the sanitizemol
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Drop molecules that fail sanitization. Catching Exception (not a
        # bare except) keeps KeyboardInterrupt / SystemExit able to stop the loop.
        continue

    # drop molecules with 100 or more atoms
    if mol.GetNumAtoms() >= 100:
        continue

    data_san_mol.append(mol)
    data_san_label.append(label)

    # Record every element that occurs in a retained molecule.
    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=4200), HTML(value='')))




In [5]:
# Mean and (population) standard deviation of the retained labels, followed by
# a one-line size/statistics summary and the element table collected earlier.
label_array = np.asarray(data_san_label)
data_mean = label_array.mean()
data_std = label_array.std()
summary = (len(data_san_mol), len(data_san_label), data_mean, data_std)
print(*summary)
print(element_dict)

4198 4198 2.1877108146736544 1.201484223942737
{'C': 6, 'N': 7, 'Cl': 17, 'O': 8, 'S': 16, 'F': 9, 'B': 5, 'Br': 35, 'P': 15, 'I': 53, 'Si': 14, 'Se': 34}


In [6]:
# Preview the first 101 retained molecules (indices 0..100) as canonical
# SMILES next to their labels.
preview_pairs = zip(data_san_mol[:101], data_san_label[:101])
for idx, (mol, label) in enumerate(preview_pairs):
    print(idx, Chem.MolToSmiles(mol), label)

print()
# Largest atom count among the retained molecules.
max_length = max(kept.GetNumAtoms() for kept in data_san_mol)
print("max length is {}".format(max_length))

0 Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21 3.54
1 COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)O -1.18
2 COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1 3.69
3 O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1 3.37
4 Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)NCC#N)c1 3.1
5 OC1(C#Cc2ccc(-c3ccccc3)cc2)CN2CCC1CC2 3.14
6 COc1cc(OC)c(S(=O)(=O)NCc2ccccc2N2CCCCC2)cc1NC(=O)CCC(=O)O -0.72
7 CNc1cccc(CCOc2ccc(C[C@H](NC(=O)c3c(Cl)cccc3Cl)C(=O)O)cc2C)n1 0.34
8 COc1ccc(-c2coc3cc(OC)cc(OC)c3c2=O)cc1 3.05
9 Oc1ncnc2scc(-c3ccsc3)c12 2.25
10 CS(=O)(=O)c1ccc(Oc2ccc(C#C[C@]3(O)CN4CCC3CC4)cc2)cc1 1.51
11 Cc1cc(Nc2nc(N[C@@H](C)c3ccc(F)cn3)c(C#N)nc2C)n[nH]1 2.61
12 O=C1CCCCCN1 -0.08
13 CCCSc1ncccc1C(=O)N1CCCC1c1ccncc1 1.95
14 Cc1ccc(S(=O)(=O)Nc2c(C(=O)NC3CCCCC3C)cnn2-c2ccccc2)cc1 1.34
15 Nc1ccc(-c2nc3ccc(O)cc3s2)cc1 3.2
16 COc1ccc(N2CCN(C(=O)[C@@H]3CCCC[C@H]3C(=O)NCC#N)CC2)cc1 1.6
17 CCC(COC(=O)c1cc(OC)c(OC)c(OC)c1)(c1ccccc1)N(C)C 3.77
18 COc1cc(-n2nnc3cc(-c4ccc(Cl)cc4)sc3c2=O)ccc1N1CC[C@@H](O)C1 3.1

In [7]:
# Persist the cleaned molecules, their labels, and the label statistics as a
# single pickled list: [molecules, labels, mean, std].
import pickle as pkl

payload = [data_san_mol, data_san_label, data_mean, data_std]
with open('./preprocess/lipophilicity.pickle', 'wb') as sink:
    pkl.dump(payload, sink)

In [8]:
# Reload the pickled [molecules, labels, mean, std] list and print the same
# summary line as before so the round trip can be checked by eye.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
import pickle as pkl

with open('./preprocess/lipophilicity.pickle', 'rb') as source:
    restored = pkl.load(source)
data_san_mol, data_san_label, data_mean, data_std = restored
print(len(data_san_mol), len(data_san_label), data_mean, data_std)

4198 4198 2.1877108146736544 1.201484223942737
