In [30]:
from tqdm import tqdm
from rdkit import Chem
from feature import mol_to_feature
import numpy as np
import pandas as pd
import pickle
from mordred import Calculator, descriptors

# --- Notebook configuration -------------------------------------------------
# Length value for the SMILES featurisation.
# NOTE(review): presumably the size argument expected by mol_to_feature — confirm.
MAX_LEN = 150
# CSV file that is loaded into `df` with pd.read_csv.
INPUT_SMILES = 'egfr.csv'
# NOTE(review): presumably the destination for the pickled result — confirm where it is written.
OUTPUT = 'egfr.pickle'
# Name of the SMILES column; used as the key when dropping duplicate rows.
SMILE = 'smiles'
# Name of the activity-label column in the input CSV.
ACTIVE = 'active'
# NOTE(review): presumably an identifier column name; confirm the input CSV actually has it.
ID = 'id'
# When True, only the first 10 rows of `df` are kept (quick smoke run).
DEBUG = False

In [31]:
# Load the raw SMILES / activity table.
df = pd.read_csv(INPUT_SMILES)
print(df.head())
print('Shape before dropping duplicates: ', df.shape)

# Keep one row per SMILES string. drop_duplicates keeps the original index
# labels, so reset to a clean 0..n-1 index: anything later aligned against
# `df` by index label then cannot silently pick up the wrong row or NaN.
df = df.drop_duplicates(subset=[SMILE]).reset_index(drop=True)
print('Shape after dropping duplicates: ', df.shape)

# Debug mode: work on a small head slice so the slow steps finish quickly.
if DEBUG:
    df = df.head(10)

                                              smiles  active
0  c12c(ncnc1Cc1cccc(c1)I)cc1c(c2)O[C@@H]([C@H](O...       1
1  c12c(cc(c(c1)CN1CCC[C@@H]1C(=O)N)OC)ncnc2Nc1cc...       1
2  C1(CCN(CC1)C)(N(Cc1c(cc2c(c1)c(ncn2)Nc1cccc(c1...       1
3          c12c(cnc(n1)NCCN1CCOCC1)ncnc2Nc1cccc(c1)C       1
4   c12c(cc(c(c1)OC)OC)ncc(c2Nc1cccc(c1)C(F)(F)F)C#N       0
Shape before dropping duplicates:  (3492, 2)
Shape after dropping duplicates:  (3492, 2)


In [32]:
# Parse every SMILES string into an RDKit molecule, preserving df row order.
# The column is looked up through the SMILE config constant, the same key
# used for de-duplication.
mols = [Chem.MolFromSmiles(smiles_str) for smiles_str in df[SMILE]]

# RDKit returns None for a SMILES string it cannot parse; surface that here
# rather than handing None to the descriptor / featurisation steps.
assert all(mol is not None for mol in mols), 'Some SMILES strings could not be parsed by RDKit'

In [35]:
# Build a mordred Calculator from the `descriptors` module with 3D descriptors
# disabled (ignore_3D=True), then compute a descriptor table for `mols`.
# NOTE(review): slow step — the recorded run took about 11 minutes for 3492 molecules.
calc = Calculator(descriptors, ignore_3D=True)
md = calc.pandas(mols)

  8%|▊         | 277/3492 [01:16<14:12,  3.77it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  8%|▊         | 288/3492 [01:20<20:56,  2.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 58%|█████▊    | 2026/3492 [07:27<06:45,  3.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3492/3492 [11:06<00:00,  5.24it/s]


In [43]:
# Build one feature vector per molecule with the project-local mol_to_feature.
# The size argument comes from the MAX_LEN config constant (150) instead of a
# repeated literal, so the two cannot drift apart.
# NOTE(review): the meaning of the second argument (-1) is defined in feature.py — confirm.
smile_ft = [mol_to_feature(mol, -1, MAX_LEN) for mol in mols]

In [47]:
# Attach the label, the raw SMILES string and the feature vector to the
# descriptor table. Rows of `md` follow the order of `mols`, which follows the
# row order of `df`, so the values are assigned by position (.to_numpy())
# rather than as Series: Series assignment aligns on index labels and would
# yield shifted or NaN values whenever df's index differs from md's.
# Source columns are looked up through the ACTIVE / SMILE config constants.
md['active'] = df[ACTIVE].to_numpy()
md['smile'] = df[SMILE].to_numpy()
md['smile_ft'] = smile_ft

In [48]:
# Preview the first rows of the combined descriptor / label / feature table.
md.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,active,smile,smile_ft
0,21.86794,16.082486,0,0,36.5002,2.48437,4.96874,36.5002,1.30358,4.26828,...,10.041931,2083,46,148.0,175.0,8.0,6.277778,1,c12c(ncnc1Cc1cccc(c1)I)cc1c(c2)O[C@@H]([C@H](O...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,..."
1,23.529442,18.570548,0,1,38.8182,2.46021,4.91238,38.8182,1.29394,4.33701,...,8.414447,2503,49,160.0,190.0,9.72222,6.583333,1,c12c(cc(c(c1)CN1CCC[C@@H]1C(=O)N)OC)ncnc2Nc1cc...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,..."
2,26.49387,20.693006,0,2,43.49,2.48122,4.96244,43.49,1.27912,4.45683,...,7.841849,3438,62,182.0,219.0,12.0347,7.472222,1,C1(CCN(CC1)C)(N(Cc1c(cc2c(c1)c(ncn2)Nc1cccc(c1...,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,..."
3,21.241713,15.127499,0,1,35.8767,2.41424,4.82848,35.8767,1.32877,4.22902,...,7.303928,2088,39,140.0,161.0,6.52778,5.972222,1,c12c(cnc(n1)NCCN1CCOCC1)ncnc2Nc1cccc(c1)C,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,..."
4,20.759483,16.820563,0,0,34.2001,2.4737,4.94741,34.2001,1.26667,4.22028,...,9.100093,1802,46,142.0,168.0,9.95139,6.027778,0,c12c(cc(c(c1)OC)OC)ncc(c2Nc1cccc(c1)C(F)(F)F)C#N,"[0, 1, 0, 0, 0, 0.375, 1.0, 0.0, 0.5, 0, 0, 0,..."
