In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the bbbp.csv table; a comma is already read_csv's default delimiter.
data = pd.read_csv('./source/bbbp.csv')
# Report the number of rows, then let the notebook render the frame itself.
print(data.shape[0])
data

2050


Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...
5,6,cefoperazone,1,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...
6,7,rolitetracycline,1,CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@...
7,8,ondansetron,1,Cn1c2CCC(Cn3ccnc3C)C(=O)c2c4ccccc14
8,9,diltiazem,1,COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...
9,10,Amiloride,1,NC(N)=NC(=O)c1nc(Cl)c(N)nc1N


In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of atom counts per molecule, in buckets of BUCKET_WIDTH atoms.
# Buckets 0..9 cover [0, 10), [10, 20), ..., [90, 100); the last bucket is an
# overflow bin holding every molecule with 100 atoms or more (it is printed
# under the "> 100" label).
BUCKET_WIDTH = 10
NUM_BUCKETS = 11

data_smiles = data['smiles'].values.tolist()
distributions = np.zeros(NUM_BUCKETS)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles hands back None when it cannot build a molecule; skip
    # those explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Skip molecules RDKit rejects. A bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make the loop uninterruptible.
        continue
    num_atoms = mol.GetNumAtoms()
    # Clamp the bucket index so large molecules land in the overflow bin.
    atom_num = min(num_atoms // BUCKET_WIDTH, NUM_BUCKETS - 1)
    distributions[atom_num] += 1
    max_length = max(max_length, num_atoms)

# Report one line per bucket, then the overall maximum and how many SMILES
# survived parsing/sanitization (the sum over all buckets).
for i in range(NUM_BUCKETS):
    if i < NUM_BUCKETS - 1:
        print(f'{i * BUCKET_WIDTH} - {(i + 1) * BUCKET_WIDTH}: {distributions[i]}')
    else:
        print(f'> {i * BUCKET_WIDTH}: {distributions[i]}')
print(f'max length: {max_length}')
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 78.0
10 - 20: 557.0
20 - 30: 961.0
30 - 40: 345.0
40 - 50: 46.0
50 - 60: 30.0
60 - 70: 11.0
70 - 80: 3.0
80 - 90: 3.0
90 - 100: 1.0
> 100: 4.0
max length: 132
source: 2050, after sanitize: 2039.0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset: keep every molecule that RDKit can parse and
# sanitize and that has fewer than MAX_ATOMS atoms, together with its label.
# element_dict collects symbol -> atomic number for atoms of kept molecules.
MAX_ATOMS = 100

element_dict = {}

data_smiles = data['smiles'].values.tolist()
data_labels = data['p_np'].values.tolist()
data_san_mol, data_san_label = [], []
for smile, label in tqdm_notebook(zip(data_smiles, data_labels), total=len(data_smiles)):
    mol = Chem.MolFromSmiles(smile)

    # MolFromSmiles hands back None when it cannot build a molecule; skip
    # those explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue

    # check the sanitizemol
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Skip molecules RDKit rejects. A bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make the loop uninterruptible.
        continue

    # Drop molecules with MAX_ATOMS atoms or more.
    if mol.GetNumAtoms() >= MAX_ATOMS:
        continue

    # Append molecule and label together so the two lists stay index-aligned.
    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=2050), HTML(value='')))




In [5]:
# Show how many molecules and how many labels were kept.
n_mols, n_labels = len(data_san_mol), len(data_san_label)
print(n_mols, n_labels)
# Show the collected symbol -> atomic-number mapping.
print(element_dict)

2035 2035
{'Cl': 17, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'S': 16, 'Br': 35, 'I': 53, 'H': 1, 'Na': 11, 'P': 15, 'Ca': 20, 'B': 5}


In [6]:
# Eyeball the first 101 (molecule, label) pairs (indices 0..100) to confirm
# that the molecule list and the label list are still aligned.
preview_pairs = zip(data_san_mol[:101], data_san_label[:101])
for idx, (mol, label) in enumerate(preview_pairs):
    print(idx, Chem.MolToSmiles(mol), label)
print()
# Largest atom count among the kept molecules.
atom_counts = [kept_mol.GetNumAtoms() for kept_mol in data_san_mol]
max_length = max(atom_counts)
print("max length is {}".format(max_length))

0 CC(C)NCC(O)COc1cccc2ccccc12.[Cl] 1
1 CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1 1
2 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 1
3 CC(=O)NCCCOc1cccc(CN2CCCCC2)c1 1
4 Cc1onc(-c2ccccc2Cl)c1C(=O)N[C@@H]1C(=O)N2[C@@H](C(=O)O)C(C)(C)S[C@H]12 1
5 CCN1CCN(C(=O)N[C@@H](C(=O)N[C@@H]2C(=O)N3C(C(=O)O)=C(CSc4nnnn4C)CS[C@H]23)c2ccc(O)cc2)C(=O)C1=O 1
6 CN(C)[C@@H]1C(=O)/C(=C(/O)NCN2CCCC2)C(=O)[C@@]2(O)C(=O)C3=C(O)c4c(O)cccc4[C@@](C)(O)[C@H]3C[C@@H]12 1
7 Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O 1
8 COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@@H]2OC(C)=O)cc1 1
9 NC(N)=NC(=O)c1nc(Cl)c(N)nc1N 1
10 CN1Cc2c(-c3noc(C(C)(O)CO)n3)ncn2-c2cccc(Cl)c2C1=O 0
11 Cc1cn([C@H]2C[C@H](F)[C@@H](CO)O2)c(=O)[nH]c1=O 1
12 ClCCl 1
13 CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)NC(=O)C(CC(N)=O)NC(=O)c1ccc2ccccc2n1 1
14 CCC(=O)C(CC(C)N(C)C)(c1ccccc1)c1ccccc1 1
15 CCC(=O)N(c1ccccc1)C1(COC)CCN(CCn2nnn(CC)c2=O)CC1 1
16 CN(C)C(=O)C(CCN1CCC(O)(c2ccc(Cl)cc2)CC1)(c1ccccc1)c1ccccc1 1
17 CN1C2CCC1CC(OC(=O)[C@H](CO)c1ccccc1)C2 1
18 COc1c

In [7]:
# store as a pickle file
import os
import pickle as pkl

pickle_path = './preprocess/bbbp.pickle'
# open() does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
# Persist molecules and labels together as a two-element list.
with open(pickle_path, 'wb') as fw:
    pkl.dump([data_san_mol, data_san_label], fw)

In [8]:
# Reload the [molecules, labels] pair from the pickle file.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
import pickle as pkl
with open('./preprocess/bbbp.pickle','rb') as fin:
    loaded = pkl.load(fin)
data_san_mol, data_san_label = loaded
print(len(data_san_mol), len(data_san_label))

2035 2035
