In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Tox21 table from disk, report its row count, and preview the first rows.
data = pd.read_csv('./source/tox21.csv', sep=',')
print(data.shape[0])
data.head()

7831


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [3]:
# Replace every missing (NaN) entry in the frame with -1.0.
# No column subset is given, so the fill applies to all columns.
data = data.fillna(value=-1.0)
data.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,-1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of per-molecule atom counts (mol.GetNumAtoms()) in buckets of
# ten atoms, plus the largest atom count seen. Molecules that fail to parse
# or fail sanitization are left out of the histogram.
data_smiles = data['smiles'].values.tolist()
distributions = np.zeros(11)  # buckets 0..9 cover 0-99 atoms; bucket 10 is the overflow
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # Skip unparsable SMILES explicitly instead of relying on
    # SanitizeMol(None) raising and being swallowed.
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still stop the loop.
        continue
    num_atoms = mol.GetNumAtoms()
    # Everything with 100 or more atoms lands in the final bucket.
    distributions[min(num_atoms // 10, 10)] += 1
    max_length = max(max_length, num_atoms)

# NOTE: the final bucket also holds molecules with exactly 100 atoms even
# though its label prints as "> 100".
for i, count in enumerate(distributions):
    bucket_label = f'{i*10} - {(i+1) * 10}' if i < 10 else f'> {i * 10}'
    print(f'{bucket_label}: {count}')
print(f'max length: {max_length}')
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 1270.0
10 - 20: 3625.0
20 - 30: 1991.0
30 - 40: 618.0
40 - 50: 166.0
50 - 60: 72.0
60 - 70: 39.0
70 - 80: 13.0
80 - 90: 20.0
90 - 100: 7.0
> 100: 10.0
max length: 132
source: 7831, after sanitize: 7831.0


In [5]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset: parse each SMILES, keep only molecules that
# sanitize and have fewer than 100 atoms, and keep their label rows aligned.
# Also records every element symbol seen, mapped to its atomic number.
element_dict = {}

data_smiles = data['smiles'].values.tolist()
# Label rows are all columns except the last two (mol_id and smiles,
# judging by the head() preview of the frame).
data_labels = data.iloc[:,0:-2].values.tolist()
data_san_mol, data_san_label = [],[]
for smile,label in tqdm_notebook(zip(data_smiles, data_labels), total=len(data_smiles)):
    mol = Chem.MolFromSmiles(smile)

    # Skip unparsable SMILES explicitly instead of relying on
    # SanitizeMol(None) raising and being swallowed.
    if mol is None:
        continue

    # Drop molecules that fail sanitization. Catch Exception (not a bare
    # except) so KeyboardInterrupt and SystemExit still stop the loop.
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        continue

    # Drop molecules with 100 or more atoms.
    if mol.GetNumAtoms() >= 100:
        continue

    # Append to both lists together so index i in one matches index i in the other.
    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()

HBox(children=(IntProgress(value=0, max=7831), HTML(value='')))




In [6]:
# Report how many molecules were kept, the shape of the label matrix,
# and the symbol -> atomic number map collected during filtering.
print(len(data_san_mol), np.asarray(data_san_label).shape)
print(element_dict)

7821 (7821, 12)
{'C': 6, 'O': 8, 'N': 7, 'S': 16, 'P': 15, 'Cl': 17, 'I': 53, 'Zn': 30, 'F': 9, 'Ca': 20, 'As': 33, 'Br': 35, 'B': 5, 'H': 1, 'K': 19, 'Si': 14, 'Cu': 29, 'Mg': 12, 'Hg': 80, 'Cr': 24, 'Zr': 40, 'Sn': 50, 'Na': 11, 'Ba': 56, 'Au': 79, 'Pd': 46, 'Tl': 81, 'Fe': 26, 'Al': 13, 'Gd': 64, 'Ag': 47, 'Mo': 42, 'V': 23, 'Nd': 60, 'Co': 27, 'Yb': 70, 'Pb': 82, 'Sb': 51, 'In': 49, 'Li': 3, 'Ni': 28, 'Bi': 83, 'Cd': 48, 'Ti': 22, 'Se': 34, 'Dy': 66, 'Mn': 25, 'Sr': 38, 'Be': 4, 'Pt': 78, 'Ge': 32}


In [7]:
# Spot-check that molecules and labels are still aligned by printing the
# first 101 pairs (indices 0 through 100).
for idx,(mol,label) in enumerate(zip(data_san_mol[:101], data_san_label[:101])):
    print(idx, Chem.MolToSmiles(mol), label)

# Largest atom count among the kept molecules.
max_length = max(mol.GetNumAtoms() for mol in data_san_mol)
print(max_length)

0 CCOc1ccc2nc(S(N)(=O)=O)sc2c1 [0.0, 0.0, 1.0, -1.0, -1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
1 CCN1C(=O)NC(c2ccccc2)C1=O [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, -1.0, 0.0, 0.0]
2 CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3CC[C@@]21C [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, -1.0, 0.0, -1.0, -1.0]
3 CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, -1.0, 0.0, 0.0]
4 CC(O)(P(=O)(O)O)P(=O)(O)O [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
5 CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0]
6 O=S(=O)(Cl)c1ccccc1 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
7 O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1 [0.0, -1.0, 0.0, -1.0, 1.0, -1.0, -1.0, 1.0, 0.0, 1.0, 0.0, 1.0]
8 OC[C@H](O)[C@@H](O)[C@H](O)CO [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0]
9 CCCCCCCC(=O)[O-].CCCCCCCC(=O)[O-].[Zn+2] [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, -1.0, 0.0, -1.0, -1.0]


In [8]:
# Persist the kept molecules and their label rows as one pickle file.
import os
import pickle as pkl

# open(..., 'wb') does not create parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('./preprocess', exist_ok=True)
with open('./preprocess/tox21.pickle','wb') as fw:
    pkl.dump([data_san_mol, data_san_label],fw)

In [9]:
# Reload the molecules and label rows from the pickle file and confirm
# their sizes.
# Caution: pickle.load can execute arbitrary code; only load files that
# you produced yourself.
import pickle as pkl
with open('./preprocess/tox21.pickle','rb') as fr:
    data_san_mol, data_san_label = pkl.load(fr)
print(len(data_san_mol), np.array(data_san_label).shape)

7821 (7821, 12)
