Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1005 lines (332 sloc) 16 KB
%matplotlib inline
from matplotlib.pyplot import figure, imshow, axis
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from math import log10
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from IPython.display import display, Markdown, HTML

**Requires RDKit 2018.09.1 or later**

import rdkit
rdkit.__version__
'2018.09.1'
# SD file to model; this name is passed to sdf_to_desc() further down.
# The commented-out assignments look like alternative data sets (presumably
# other ChEMBL exports -- confirm); activate exactly one of the three lines.
#sdf_file = "CHEMBL1827733_5HT2A.sdf"
sdf_file = "CHEMBL930273_GSK3.sdf"
#sdf_file = "CHEMBL952131_EGFR.sdf"
def sdf_to_desc(sdf_file):
    """Read an SD file and turn it into fingerprints and activity targets.

    Every readable molecule is encoded as a radius-2 Morgan bit-vector
    fingerprint, and its "ACTIVITY" property is converted to a target value
    with ``9.0 - log10(activity)``.

    Args:
        sdf_file: Path of the SD file to read. Each record is expected to
            carry a numeric "ACTIVITY" property.

    Returns:
        A 4-tuple ``(x, y, mols, bis)`` whose items are index-aligned:
            x: numpy array with one fingerprint row per molecule.
            y: numpy array of target values (pIC50).
            mols: list of the RDKit molecules that were used.
            bis: list of the ``bitInfo`` dicts filled in while computing
                each fingerprint (keyed by bit position).

    Raises:
        ValueError: If an "ACTIVITY" value is not numeric or is not
            strictly positive (``log10`` is undefined there).
    """
    nfps = []
    targets = []
    mols = []
    bis = []

    for mol in Chem.SDMolSupplier(sdf_file):
        # SDMolSupplier yields None for records it cannot parse; skip those
        # instead of crashing on the first malformed entry.
        if mol is None:
            continue

        bi = {}
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bi)
        # pIC50; the 9.0 offset presumes ACTIVITY is given in nM -- TODO confirm.
        target = 9.0 - log10(float(mol.GetProp("ACTIVITY")))

        # ConvertToNumpyArray fills the array it is handed, so start from a
        # placeholder array.
        nfp = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, nfp)

        # Append to all lists together so they stay index-aligned.
        mols.append(mol)
        bis.append(bi)
        nfps.append(nfp)
        targets.append(target)

    return (np.array(nfps), np.array(targets), mols, bis)

Random Forest Regression Analysis

def rf(x_train, x_test, y_train, y_test):
    """Fit a random forest regressor and report its hold-out performance.

    The model is trained on ``(x_train, y_train)`` with scikit-learn's
    default hyper-parameters, then scored on ``(x_test, y_test)``. R2 and
    RMSE are printed to stdout.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    model = RandomForestRegressor()
    model.fit(x_train, y_train)

    predicted = model.predict(x_test)
    # RMSE is the square root of the mean squared error.
    rmse = sqrt(mean_squared_error(y_test, predicted))
    r2 = r2_score(y_test, predicted)

    print(' RF: R2: {0:f}, RMSE:{1:f}'.format(r2, rmse))
    return model
# Featurise the data set: fingerprints, targets, molecules and bit info.
x, y, mols, bis = sdf_to_desc(sdf_file)

# Hold out 10% of the records for testing. All four sequences go through the
# same call so that their train/test partitions stay index-aligned.
(
    x_train, x_test,
    y_train, y_test,
    mols_train, mols_test,
    bis_train, bis_test,
) = train_test_split(x, y, mols, bis, test_size=0.1)

# Fit the random forest and print its R2 / RMSE on the held-out part.
r = rf(x_train, x_test, y_train, y_test)
 RF: R2: 0.481589, RMSE:0.802244

Sorting bit positions by feature importances

# Pair each fingerprint bit index with its importance in the fitted model,
# ordered from most to least important.
sorted_pos = sorted(
    enumerate(r.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
# Bit indices of the 20 highest-ranked features.
important_posisions = [pos for pos, _ in sorted_pos[:20]]

Visualizing important fragments

# For every held-out molecule, show the molecule itself and then draw each
# top-ranked fingerprint bit that the molecule actually sets.
for i, (bi, mol) in enumerate(zip(bis_test, mols_test), start=1):
    display(Markdown("## Test molecule #{}".format(i)))
    display(mol)
    display(Markdown("#### Important fragments"))
    for pos in important_posisions:
        # bi is keyed by bit position, so a bit missing from it cannot be
        # looked up or drawn for this molecule.
        if pos in bi:
            print("Bit position: {}".format(pos))
            display(Draw.DrawMorganBit(mol, pos, bi))

Test molecule #1

png

Important fragments

Bit position: 1152

png

Bit position: 155

png

Bit position: 650

png

Test molecule #2

png

Important fragments

Bit position: 328

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1490

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 898

png

Bit position: 650

png

Bit position: 896

png

Bit position: 980

png

Test molecule #3

png

Important fragments

Bit position: 1602

png

Bit position: 1152

png

Bit position: 155

png

Bit position: 650

png

Test molecule #4

png

Important fragments

Bit position: 328

png

Bit position: 1602

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1313

png

Bit position: 1490

png

Bit position: 786

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 898

png

Bit position: 650

png

Bit position: 1121

png

Bit position: 896

png

Test molecule #5

png

Important fragments

Bit position: 328

png

Bit position: 591

png

Bit position: 1452

png

Bit position: 786

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 1535

png

Bit position: 650

png

Test molecule #6

png

Important fragments

Bit position: 1452

png

Bit position: 1535

png

Test molecule #7

png

Important fragments

Bit position: 1602

png

Bit position: 1313

png

Bit position: 1152

png

Bit position: 155

png

Bit position: 650

png

Test molecule #8

png

Important fragments

Bit position: 926

png

Bit position: 650

png

Bit position: 896

png

Test molecule #9

png

Important fragments

Bit position: 926

png

Bit position: 650

png

Test molecule #10

png

Important fragments

Bit position: 328

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1490

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 898

png

Bit position: 650

png

Bit position: 980

png

Test molecule #11

png

Important fragments

Bit position: 926

png

Bit position: 366

png

Bit position: 650

png

Test molecule #12

png

Important fragments

Bit position: 1152

png

Bit position: 155

png

Bit position: 650

png

Bit position: 896

png

Test molecule #13

png

Important fragments

Bit position: 328

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1490

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 898

png

Bit position: 650

png

Bit position: 980

png

Test molecule #14

png

Important fragments

Bit position: 1602

png

Bit position: 366

png

Bit position: 1152

png

Bit position: 155

png

Bit position: 650

png

Test molecule #15

png

Important fragments

Bit position: 328

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1490

png

Bit position: 786

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 650

png

Bit position: 1121

png

Test molecule #16

png

Important fragments

Bit position: 328

png

Bit position: 926

png

Bit position: 591

png

Bit position: 1490

png

Bit position: 786

png

Bit position: 1816

png

Bit position: 1152

png

Bit position: 650

png

Bit position: 1121

png