In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load ./source/sider.csv (comma-separated), report the row count, preview the top rows.
data = pd.read_csv('./source/sider.csv', sep=',')
print(data.shape[0])
data.head()

1427


Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,C(CNCCNCCNCCN)N,1,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,0
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0,1,0,0,1,1,1,0,0,...,0,1,1,0,0,0,1,0,1,0
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,0,1,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,1
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,1,1,0,1,1,1,1,0,1,...,0,1,1,1,0,0,1,0,1,0


In [3]:
# Replace every missing value in the table with -1.0
data = data.fillna(value=-1.0)
data.head()

Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,C(CNCCNCCNCCN)N,1,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,0
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0,1,0,0,1,1,1,0,0,...,0,1,1,0,0,0,1,0,1,0
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,0,1,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,1
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,1,1,0,1,1,1,1,0,1,...,0,1,1,1,0,0,1,0,1,0


In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Histogram of per-molecule atom counts (mol.GetNumAtoms()) in bins of 10 atoms.
# Bin i covers [i*10, (i+1)*10); the last bin collects everything at or above
# (split_num - 1) * 10 atoms. Also tracks the largest atom count seen.
data_smiles = data['smiles'].values.tolist()
split_num = 15
distributions = np.zeros(split_num)
max_length = -1
for smile in data_smiles:
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals an unparseable SMILES by returning None, not by raising
    if mol is None:
        continue
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Skip molecules that fail sanitization, but let KeyboardInterrupt /
        # SystemExit propagate (a bare `except:` would swallow those too).
        continue
    num_atoms = mol.GetNumAtoms()
    # Clamp to the last index so oversized molecules land in the overflow bin
    distributions[min(num_atoms // 10, split_num - 1)] += 1
    max_length = max(max_length, num_atoms)

for i in range(split_num):
    if i < split_num - 1:
        print(f'{i*10} - {(i+1) * 10}: {distributions[i]}')
    else:
        print(f'> {i * 10}: {distributions[i]}')
print(f'max length: {max_length}')
# The two numbers differ by the count of molecules skipped above
print(f'source: {len(data_smiles)}, after sanitize: {np.sum(distributions)}')

0 - 10: 96.0
10 - 20: 350.0
20 - 30: 504.0
30 - 40: 267.0
40 - 50: 62.0
50 - 60: 37.0
60 - 70: 26.0
70 - 80: 12.0
80 - 90: 14.0
90 - 100: 11.0
100 - 110: 8.0
110 - 120: 4.0
120 - 130: 4.0
130 - 140: 2.0
> 140: 30.0
max length: 492
source: 1427, after sanitize: 1427.0


In [5]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm_notebook

# Build the cleaned dataset as two parallel lists (RDKit molecules and their
# label rows) and record every element symbol seen -> its atomic number.
element_dict = {}

data_smiles = data['smiles'].values.tolist()
# NOTE(review): positional slice - assumes column 0 is 'smiles' and every
# remaining column is a label; confirm against the CSV layout.
data_labels = data.iloc[:,1:].values.tolist()

data_san_mol, data_san_label = [],[]
max_length = -1  # reset only; recomputed from data_san_mol in a later cell
atom_limit = 100  # molecules with this many atoms or more are dropped
for smile,label in tqdm_notebook(zip(data_smiles, data_labels), total=len(data_smiles)):
    mol = Chem.MolFromSmiles(smile)

    # MolFromSmiles signals an unparseable SMILES by returning None, not by raising
    if mol is None:
        continue

    # Drop molecules that fail sanitization; `except Exception` (not a bare
    # `except:`) so KeyboardInterrupt / SystemExit still stop the loop.
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        continue

    # Drop molecules with >= atom_limit (100) atoms
    if mol.GetNumAtoms() >= atom_limit:
        continue

    data_san_mol.append(mol)
    data_san_label.append(label)

    for atom in mol.GetAtoms():
        element_dict[atom.GetSymbol()] = atom.GetAtomicNum()


HBox(children=(IntProgress(value=0, max=1427), HTML(value='')))




In [6]:
# Number of kept molecules, shape of the label matrix, and the element table
print(len(data_san_mol), np.shape(data_san_label))
print(element_dict)

1379 (1379, 27)
{'C': 6, 'N': 7, 'O': 8, 'Cl': 17, 'F': 9, 'S': 16, 'Tl': 81, 'I': 53, 'Ca': 20, 'P': 15, 'H': 1, 'Gd': 64, 'K': 19, 'Mg': 12, 'Na': 11, 'Ge': 32, 'Br': 35, 'Fe': 26, 'Au': 79, 'Ba': 56, 'Sr': 38, 'As': 33, 'Se': 34, 'Pt': 78, 'Co': 27, 'Li': 3, 'B': 5, 'Ra': 88, 'In': 49, 'Mn': 25, 'La': 57, 'Ag': 47, 'Zn': 30, 'Tc': 43, 'Cf': 98, 'Ga': 31, 'Sm': 62, 'Cr': 24, 'Cu': 29, 'Y': 39}


In [7]:
# Spot-check that molecules and label rows are still aligned: print the
# first 101 (index 0..100) canonical SMILES next to their labels.
for idx, (mol, label) in enumerate(zip(data_san_mol[:101], data_san_label[:101])):
    print(idx, Chem.MolToSmiles(mol), label)

# Largest atom count among the kept molecules
max_length = max(m.GetNumAtoms() for m in data_san_mol)
print(max_length)

0 NCCNCCNCCNCCN [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0]
1 CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c2=O)cc1O [0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0]
2 C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3C(=C)C[C@@]21CC [0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
3 C#C[C@]1(O)CCC2C3CCC4=CC(=O)CCC4C3C(=C)CC21CC [1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1]
4 NC(=O)N1c2ccccc2CC(O)c2ccccc21 [1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]
5 CC1CC2C3CCC4=CC(=O)C=CC4(C)[C@@]3(F)C(O)CC2(C)[C@@]1(O)C(=O)CCl [0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1]
6 CCCCCCOC(=O)N=C(N)c1ccc(NCc2nc3cc(C(=O)N(CCC(=O)OCC)c4ccccn4)ccc3n2C)cc1 [1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1]
7 CSCCC(NC(=O)C(Cc1ccc(OS(=O)(=O)O)cc1)NC(=O)C(N)CC(=O)O)C(=O)NCC

In [8]:
# Persist the kept molecules and their labels as one pickled [mols, labels] pair
import os
import pickle as pkl

# open(..., 'wb') does not create missing parent directories, so make sure
# ./preprocess exists before writing (no-op when it is already there).
os.makedirs('./preprocess', exist_ok=True)
with open('./preprocess/sider.pickle','wb') as fw:
    pkl.dump([data_san_mol, data_san_label],fw)

In [9]:
# Reload the [molecules, labels] pair from ./preprocess/sider.pickle and
# print its size as a round-trip check.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
import pickle as pkl
with open('./preprocess/sider.pickle','rb') as f:
    data_san_mol, data_san_label = pkl.load(f)
print(len(data_san_mol), np.shape(data_san_label))

1379 (1379, 27)
