In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
import gzip
import joblib

from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import permutation_test_score, StratifiedKFold

import warnings


def warn(*args, **kwargs):
    """Accept any arguments, do nothing, and return None.

    Installed below as a replacement for ``warnings.warn`` so that code
    calling it no longer emits anything.
    """
    return None


# Two layers of suppression: ignore everything via the filter list, and swap
# the module-level ``warnings.warn`` for the no-op defined above.
warnings.filterwarnings("ignore")
warnings.warn = warn

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [13]:
# Read the rdkit consensus workbook, report its dimensions, and preview the
# first five rows.
consensus_rdkit = pd.read_excel(
    io='../dataset/screened/pubchem_hits_qsar_rdkit_consensus.xlsx'
)
print(consensus_rdkit.shape)
consensus_rdkit.head(n=5)

(101097, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0,2
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.55,1.0,1.0,0.76,0.0,,0.44,0.0,1.0,1
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0,2
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.6,0.0,0.0,0.71,1.0,1.0,0.97,,,0
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.91,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2


In [14]:
# Read the morgan consensus workbook, report its dimensions, and preview the
# first five rows.
consensus_morgan = pd.read_excel(
    io='../dataset/screened/pubchem_hits_qsar_morgan_consensus.xlsx'
)
print(consensus_morgan.shape)
consensus_morgan.head(n=5)

(101097, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.62,0,0.0,0.78,0,,0.2,0.0,0.0,0
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.99,1,1.0,1.0,1,1.0,0.94,1.0,1.0,2
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.66,0,,0.65,0,,0.41,0.0,0.0,0
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.95,1,1.0,0.98,1,1.0,0.98,1.0,1.0,2
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.85,1,1.0,0.85,0,,0.52,,1.0,1


In [15]:
# Read the sirms consensus workbook, report its dimensions, and preview the
# first five rows.
consensus_sirms = pd.read_excel(
    io='../dataset/screened/pubchem_hits_qsar_sirms_consensus.xlsx'
)
print(consensus_sirms.shape)
consensus_sirms.head(n=5)

(101097, 26)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,IsomericSMILES,HitFreq,Include,...,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.9,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,C1=CC=C2C(=C1)C(=CN2CC3=CC=C(C=C3)Cl)CCC(=O)NC...,9,1,...,0.5,1,1.0,0.87,1,1.0,0.78,1.0,1.0,2
1,14,4782931,2,1,326.8,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,C1=CC=C2C(=C1)C(=CN2)CCC(=O)NCCC3=CC=C(C=C3)Cl,9,1,...,0.56,1,,0.68,0,,0.63,,0.0,0
2,39,145950271,4,3,450.0,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,C1CN(CCC1NC(=O)[C@H](CC2=CNC3=CC=CC=C32)N)CC4=...,11,1,...,0.51,1,1.0,0.98,1,1.0,0.93,1.0,1.0,2
3,46,118705964,2,4,448.0,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,CN1CCN(CC1)C(=O)[C@H](CC2=CNC3=CC=CC=C32)NC4=C...,12,1,...,0.51,1,1.0,1.0,1,1.0,0.9,1.0,1.0,2
4,55,42743586,2,3,381.9,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,C1=CC=C(C=C1)CCC(=O)NCCCCNC2=C3C=CC(=CC3=NC=C2)Cl,5,1,...,0.58,1,1.0,1.0,1,1.0,0.86,1.0,1.0,2


In [16]:
# Zero-based positions of the columns removed from each freshly loaded frame.
DROP_POSITIONS = list(range(7, 16))

# 'IsomericSMILES' is listed in the loaded frames' previews and is absent from
# this cell's output, so its presence marks a frame that has not been trimmed
# yet.  Guarding on it keeps the cell safe to re-run: the drop is positional
# and in place, so without the guard a second execution would remove whatever
# columns now occupy those positions (model scores and consensus flags), and
# a further one would index past the end of the column list.
for frame in (consensus_morgan, consensus_sirms, consensus_rdkit):
    if 'IsomericSMILES' in frame.columns:
        frame.drop(columns=frame.columns[DROP_POSITIONS], inplace=True)

consensus_rdkit

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,svm_score,mlp,mlp_ad,mlp_score,tf,tf_ad,tf_score,consensus,consensus_ad,count
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,0.79,1.0,1.0,0.95,1.0,1.0,0.97,1.0,1.0,2
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,0.55,1.0,1.0,0.76,0.0,,0.44,0.0,1.0,1
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,0.53,1.0,1.0,0.97,1.0,1.0,0.99,1.0,1.0,2
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,0.60,0.0,0.0,0.71,1.0,1.0,0.97,,,0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,0.91,1.0,1.0,1.00,1.0,1.0,1.00,1.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,0.92,0.0,0.0,1.00,0.0,,0.22,0.0,0.0,0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,0.99,1.0,1.0,0.99,0.0,,0.42,0.0,0.0,0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,0.71,1.0,1.0,1.00,1.0,1.0,0.91,1.0,1.0,2
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,0.81,0.0,0.0,1.00,0.0,,0.24,0.0,0.0,0


In [17]:
# Suffix each frame's `consensus` / `consensus_ad` columns with the frame's
# descriptor name so they remain distinguishable once the frames are merged.
for descriptor, frame in (
    ('morgan', consensus_morgan),
    ('sirms', consensus_sirms),
    ('rdkit', consensus_rdkit),
):
    frame.rename(
        columns={
            'consensus': f'consensus_{descriptor}',
            'consensus_ad': f'consensus_{descriptor}_ad',
        },
        inplace=True,
    )

In [18]:
# Start from the full morgan frame and attach only the consensus columns of
# the sirms and rdkit frames, matching rows on CID (inner join).
sirms_consensus = consensus_sirms[['CID', 'consensus_sirms', 'consensus_sirms_ad']]
rdkit_consensus = consensus_rdkit[['CID', 'consensus_rdkit', 'consensus_rdkit_ad']]
merged = (
    consensus_morgan
    .merge(sirms_consensus, how='inner', on='CID')
    .merge(rdkit_consensus, how='inner', on='CID')
)
merged

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,svm_score,mlp,mlp_ad,...,tf,tf_ad,tf_score,consensus_morgan,consensus_morgan_ad,count,consensus_sirms,consensus_sirms_ad,consensus_rdkit,consensus_rdkit_ad
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,0.62,0,0.0,...,0,,0.20,0.0,0.0,0,1.0,1.0,1.0,1.0
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,0.99,1,1.0,...,1,1.0,0.94,1.0,1.0,2,,0.0,0.0,1.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,0.66,0,,...,0,,0.41,0.0,0.0,0,1.0,1.0,1.0,1.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,0.95,1,1.0,...,1,1.0,0.98,1.0,1.0,2,1.0,1.0,,
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,0.85,1,1.0,...,0,,0.52,,1.0,1,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,0.78,0,,...,0,,0.13,0.0,1.0,1,,1.0,0.0,0.0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,0.86,1,1.0,...,0,,0.51,,1.0,1,,1.0,0.0,0.0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,0.57,0,0.0,...,0,,0.19,0.0,0.0,0,,1.0,1.0,1.0
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,0.97,1,1.0,...,0,,0.67,,1.0,1,1.0,1.0,0.0,0.0


In [19]:
# Row-wise totals over the three descriptor sets.  `sum` skips NaN by default,
# so a missing consensus value contributes nothing to a row's total.  The
# existing `count` column is overwritten.
consensus_columns = ['consensus_morgan', 'consensus_sirms', 'consensus_rdkit']
consensus_ad_columns = ['consensus_morgan_ad', 'consensus_sirms_ad', 'consensus_rdkit_ad']
merged['count'] = merged[consensus_columns].sum(axis=1)
merged['count_ad'] = merged[consensus_ad_columns].sum(axis=1)
merged

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,svm_score,mlp,mlp_ad,...,tf_ad,tf_score,consensus_morgan,consensus_morgan_ad,count,consensus_sirms,consensus_sirms_ad,consensus_rdkit,consensus_rdkit_ad,count_ad
0,6,16723801,1,2,389.90,4.2,O=C(CCc1cn(Cc2ccc(Cl)cc2)c2ccccc12)Nc1ccncc1,0.62,0,0.0,...,,0.20,0.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0
1,14,4782931,2,1,326.80,4.1,O=C(CCc1c[nH]c2ccccc12)NCCc1ccc(Cl)cc1,0.99,1,1.0,...,1.0,0.94,1.0,1.0,1.0,,0.0,0.0,1.0,2.0
2,39,145950271,4,3,450.00,3.6,NC(Cc1c[nH]c2ccccc12)C(=O)NC1CCN(Cc2c[nH]c3cc(...,0.66,0,,...,,0.41,0.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0
3,46,118705964,2,4,448.00,4.3,CN1CCN(C(=O)C(Cc2c[nH]c3ccccc23)Nc2ccnc3cc(Cl)...,0.95,1,1.0,...,1.0,0.98,1.0,1.0,2.0,1.0,1.0,,,2.0
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,0.85,1,1.0,...,,0.52,,1.0,2.0,1.0,1.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101092,158588,140116108,1,4,261.39,3.5,c1ccc2c(NCCN3CCCCC3)nsc2c1,0.78,0,,...,,0.13,0.0,1.0,0.0,,1.0,0.0,0.0,2.0
101093,158589,140732880,0,5,232.33,2.6,[CH2-]N1CCN(c2nsc3ccccc23)CC1,0.86,1,1.0,...,,0.51,,1.0,0.0,,1.0,0.0,0.0,2.0
101094,158590,143285344,2,4,221.32,3.1,CNCCCNc1nsc2ccccc12,0.57,0,0.0,...,,0.19,0.0,0.0,1.0,,1.0,1.0,1.0,2.0
101095,158591,144116719,1,5,346.50,4.0,CCC(CN)C(CC)CN1CCN(c2nsc3ccccc23)CC1,0.97,1,1.0,...,,0.67,,1.0,1.0,1.0,1.0,0.0,0.0,2.0


In [20]:
# Order all merged rows from highest to lowest `count` and write them to a
# single-sheet workbook (sheet "consensus", without the index column).
hits = merged.sort_values('count', ascending=False)
hits.to_excel(
    '../dataset/screened/pubchem_hits_descriptors_consensus.xlsx',
    sheet_name='consensus',
    index=False,
)
hits

Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,svm_score,mlp,mlp_ad,...,tf_ad,tf_score,consensus_morgan,consensus_morgan_ad,count,consensus_sirms,consensus_sirms_ad,consensus_rdkit,consensus_rdkit_ad,count_ad
57083,90626,88374080,3,3,309.40,3.1,O=C(NCNC1CCCNC1c1ccccc1)c1ccccc1,0.94,1,1.0,...,1.0,0.96,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
79200,127922,46555856,1,2,286.37,2.4,CNC(=O)c1ccc(C=CC(=O)N2CCCC(C)C2)cc1,0.72,1,1.0,...,1.0,0.98,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
79160,127881,46465375,2,3,393.50,3.5,Cc1ccccc1C(=O)NCC(=O)NCc1ccc(CN2CCCC(C)C2)cc1,0.93,1,1.0,...,1.0,0.91,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
13100,26036,51192106,0,3,328.40,2.3,Cc1ccc2c(c1)C(=O)N(CCCC(=O)N1CCCC(C)C1)C2=O,0.54,1,1.0,...,1.0,0.71,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
34155,55594,24547858,2,3,381.50,3.3,CCCC(NC(=O)CCn1c(=S)[nH]c2ccccc2c1=O)c1ccccc1,0.97,1,1.0,...,1.0,0.89,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40284,64335,3591937,0,2,232.36,2.7,CC(C)(C)N1CCN(Cc2ccccc2)CC1,0.58,0,0.0,...,,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55145,88270,79156496,1,2,289.40,2.5,CCCNC1CCN(C(=O)N(CC)CC)c2ccccc21,0.64,0,,...,,0.24,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
40285,64336,3466189,0,1,177.29,2.9,CCN(CC)C(C)c1ccccc1,0.73,0,0.0,...,,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74474,121238,113167583,1,3,312.40,2.5,COc1cccc(NC(=O)CN(C(C)=O)c2cccc(C)c2)c1,0.79,1,,...,,0.54,,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# selecting rows based on condition 
hits = merged[merged['count_ad'] == 3.0] 
print(hits.shape)
hits

(12745, 22)


Unnamed: 0.1,Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,MolecularWeight,XLogP,CanonicalSMILES,svm_score,mlp,mlp_ad,...,tf_ad,tf_score,consensus_morgan,consensus_morgan_ad,count,consensus_sirms,consensus_sirms_ad,consensus_rdkit,consensus_rdkit_ad,count_ad
4,55,42743586,2,3,381.90,4.7,O=C(CCc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,0.85,1,1.0,...,,0.52,,1.0,2.0,1.0,1.0,1.0,1.0,3.0
5,56,42743569,2,3,367.90,4.4,O=C(Cc1ccccc1)NCCCCNc1ccnc2cc(Cl)ccc12,0.82,1,1.0,...,,0.68,,1.0,2.0,1.0,1.0,1.0,1.0,3.0
10,64,168278591,3,4,429.90,4.5,Cc1nc(CNC(=O)CNc2ccnc3cc(Cl)ccc23)cc2c1[nH]c1c...,0.86,1,1.0,...,,0.39,,1.0,2.0,1.0,1.0,1.0,1.0,3.0
14,86,142055963,1,2,395.90,4.3,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)CCc2c[nH]c3ccccc23)C1,0.99,1,1.0,...,1.0,1.00,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
15,88,141588898,2,3,410.90,3.4,CN(C)CC1Cc2c(Cl)cccc2N(C(=O)C(N)Cc2c[nH]c3cccc...,0.95,1,1.0,...,1.0,0.99,1.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101051,158475,10902956,0,4,309.40,4.1,c1ccc(CN2CCN(c3nsc4ccccc34)CC2)cc1,0.80,1,1.0,...,,0.52,,1.0,0.0,,1.0,,1.0,3.0
101057,158481,7831611,2,3,222.33,2.7,C[NH+](C)CCNc1nsc2ccccc12,0.77,0,,...,,0.42,0.0,1.0,0.0,,1.0,,1.0,3.0
101062,158494,71636811,2,4,247.36,3.1,c1ccc2c(NCC3CCCCN3)nsc2c1,0.75,1,1.0,...,1.0,0.89,1.0,1.0,1.0,,1.0,,1.0,3.0
101063,158496,71636815,2,4,247.36,3.0,c1ccc2c(NCC3CCCNC3)nsc2c1,0.61,0,,...,1.0,0.71,,1.0,1.0,1.0,1.0,,1.0,3.0
