In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the ClinTox table from disk, report how many rows it has,
# and display the first rows as the cell output.
data = pd.read_csv(
    filepath_or_buffer='./source/clintox.csv',
    sep=',',
)
print(len(data.index))
data.head()

1484


Unnamed: 0,smiles,FDA_APPROVED,CT_TOX
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,1,0
1,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0
2,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0
3,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0
4,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0


In [3]:
# Replace every missing value in the table with -1.0, then preview the result.
data = data.fillna(value=-1.0)
data.head()

Unnamed: 0,smiles,FDA_APPROVED,CT_TOX
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,1,0
1,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0
2,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0
3,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0
4,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Compute the distribution of atom counts (mol.GetNumAtoms()) over the SMILES column.
# Molecules are bucketed by tens: bucket i counts molecules with
# 10*i <= atoms < 10*(i+1); the last bucket collects everything from
# (split_num - 1) * 10 atoms upwards.
data_smiles = data['smiles'].values.tolist()
split_num = 15
distributions = np.zeros(split_num)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals an unparsable SMILES by returning None; skip it
    # explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Molecules that fail sanitization are excluded from the statistics.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so the loop can still be interrupted.
        continue
    num_atoms = mol.GetNumAtoms()
    # Clamp to the last bucket so oversized molecules land in the "> ..." bin.
    bucket = min(num_atoms // 10, split_num - 1)
    distributions[bucket] += 1
    max_length = max(max_length, num_atoms)

for i in range(split_num):
    if i < split_num - 1:
        print(f'{i*10} - {(i+1) * 10}: {distributions[i]}')
    else:
        print(f'> {i * 10}: {distributions[i]}')
print(f'max length: {max_length}')
# The bucket total equals the number of molecules that survived sanitization.
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 78.0
10 - 20: 406.0
20 - 30: 574.0
30 - 40: 265.0
40 - 50: 67.0
50 - 60: 27.0
60 - 70: 19.0
70 - 80: 13.0
80 - 90: 8.0
90 - 100: 11.0
100 - 110: 4.0
110 - 120: 3.0
120 - 130: 2.0
130 - 140: 1.0
> 140: 0.0
max length: 136
source: 1484, after sanitize: 1478.0


In [5]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset: keep only molecules that RDKit can parse and
# sanitize and that have fewer than MAX_ATOMS atoms, paired with their labels.
MAX_ATOMS = 100  # molecules with this many atoms or more are dropped

# Maps element symbol -> atomic number for every atom in the kept molecules.
element_dict = {}

data_smiles = data['smiles'].values.tolist()
# Every column after the first one is used as a label
# (assumes 'smiles' is the first column of `data` — TODO confirm).
data_labels = data.iloc[:,1:].values.tolist()

data_san_mol, data_san_label = [],[]
for smile,label in tqdm_notebook(zip(data_smiles, data_labels), total=len(data_smiles)):
    mol = Chem.MolFromSmiles(smile)

    # MolFromSmiles signals an unparsable SMILES by returning None; skip it
    # explicitly instead of relying on SanitizeMol(None) raising.
    if mol is None:
        continue

    # Drop molecules that fail sanitization. `Exception` (not a bare except)
    # lets KeyboardInterrupt/SystemExit propagate.
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        continue

    # Drop molecules with MAX_ATOMS or more atoms.
    if mol.GetNumAtoms() >= MAX_ATOMS:
        continue

    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=1484), HTML(value='')))




In [6]:
# Summarize the filtered dataset: number of molecules, shape of the label
# matrix, and the element-symbol -> atomic-number mapping collected so far.
label_shape = np.shape(data_san_label)
print(len(data_san_mol), label_shape)
print(element_dict)

1468 (1468, 2)
{'*': 0, 'C': 6, 'O': 8, 'N': 7, 'Cl': 17, 'H': 1, 'Tc': 43, 'P': 15, 'F': 9, 'S': 16, 'Se': 34, 'B': 5, 'Fe': 26, 'Al': 13, 'Br': 35, 'I': 53, 'Ca': 20, 'Pt': 78, 'Bi': 83, 'Au': 79, 'Tl': 81, 'Cr': 24, 'Cu': 29, 'Mn': 25, 'Zn': 30, 'Si': 14, 'Hg': 80, 'As': 33, 'Ti': 22}


In [7]:
# Print the first entries (indices 0 through 100) as canonical SMILES next to
# their labels, to eyeball that molecules and labels are still aligned.
PREVIEW_COUNT = 101
preview_pairs = zip(data_san_mol[:PREVIEW_COUNT], data_san_label[:PREVIEW_COUNT])
for idx, (mol, label) in enumerate(preview_pairs):
    print(idx, Chem.MolToSmiles(mol), label)

# Largest atom count among the kept molecules.
max_length = max(m.GetNumAtoms() for m in data_san_mol)
print(max_length)

0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC [1, 0]
1 Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl [1, 0]
2 O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-] [1, 0]
3 [H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH3+])cc2)C=C1 [1, 0]
4 [H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/[H])cc2)cc1 [1, 0]
5 O=[N+]([O-])[O-] [1, 0]
6 [N]=O [1, 0]
7 O=[99Tc](=O)(=O)[O-] [1, 0]
8 O=P([O-])([O-])F [1, 0]
9 O=S(=O)([O-])[O-] [1, 0]
10 O=S([O-])([O-])=S [1, 0]
11 [Se] [0, 1]
12 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)c1cnccn1)B(O)O [1, 0]
13 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)c1cnccn1)B(O)O [0, 1]
14 CC(C)C[C@H](NC(=O)CNC(=O)c1cc(Cl)ccc1Cl)B(O)O [0, 1]
15 C#CC1(OC(N)=O)CCCCC1 [1, 0]
16 C#CC[NH2+][C@@H]1CCc2ccccc21 [1, 0]
17 C#CCC(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)N[C@@H](CCC(=O)[O-])C(=O)[O-])cc1 [1, 0]
18 C#N [1, 0]
19 N#C[Fe-2](C#N)(C#N)(C#N)(C#N)N=O [1, 0]
20 O=C1O[C@H]([C@@H](O)CO)C([O-])=C1O [1, 0]
21 [NH3+][C@@H](CS)C(=O)[O-] [1, 0]
22 OC[C@H]1O[C@](O)(CO)[C@@H](O)[C@@H]1O[C@@H

In [8]:
# Store the sanitized molecules and their labels together as one pickle file.
import os
import pickle as pkl

# Create the output directory first so this cell also works when
# ./preprocess does not exist yet (open(..., 'wb') would otherwise raise
# FileNotFoundError).
os.makedirs('./preprocess', exist_ok=True)
with open('./preprocess/clintox.pickle','wb') as fw:
    pkl.dump([data_san_mol, data_san_label],fw)

In [9]:
# Reload the pickled [molecules, labels] pair and print the molecule count
# and label-matrix shape as a sanity check.
import pickle as pkl
with open('./preprocess/clintox.pickle','rb') as f:
    loaded = pkl.load(f)
data_san_mol, data_san_label = loaded
print(len(data_san_mol), np.array(data_san_label).shape)

1468 (1468, 2)
