In [2]:
from rdkit import Chem
from rdkit import RDLogger
import matplotlib.pyplot as plt
import pandas as pd
import gzip
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
import seaborn as sb
from itertools import chain
from rdkit.Chem import MACCSkeys
import numpy as np

# Turn off RDKit's 'rdApp.error' log channel for the rest of the session.
# NOTE(review): presumably this keeps per-record parse errors from the SDF
# loads below out of the cell output — confirm that hiding them is intended.
RDLogger.DisableLog('rdApp.error')

In [3]:
def _labelled_mols(supplier, source):
    """Pair every truthy entry yielded by ``supplier`` with the ``source`` label.

    Returns a list of ``[mol, source]`` two-item lists, in supplier order.
    Falsy entries are dropped (presumably records RDKit could not parse).
    """
    return [[entry, source] for entry in supplier if entry]


def _as_frame(records):
    """Wrap ``[mol, source]`` records in a DataFrame with 'Mol' and 'Source' columns."""
    return pd.DataFrame(records, columns=['Mol', 'Source'])


# DrugBank is read from an uncompressed SDF, addressed by path.
supp = Chem.SDMolSupplier('../data/drugbank.sdf')
drug_bank = _labelled_mols(supp, 'DrugBank')

# The actives/decoys sets are gzip-compressed: they are streamed through a
# forward supplier and consumed inside the ``with`` block, while the file
# handle is still open.
with gzip.open('actives_final.sdf.gz') as sdf:
    supp_actives = Chem.ForwardSDMolSupplier(sdf)
    actives = _labelled_mols(supp_actives, 'actives')

with gzip.open('decoys_final.sdf.gz') as sdf:
    supp_decoys = Chem.ForwardSDMolSupplier(sdf)
    decoys = _labelled_mols(supp_decoys, 'decoys')

df_db = _as_frame(drug_bank)
df_ac = _as_frame(actives)
df_de = _as_frame(decoys)

# Stack the three sets into one table with a fresh 0..n-1 index.
df = pd.concat((df_db, df_ac, df_de), ignore_index=True)
df



Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec10>,DrugBank
1,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec60>,DrugBank
2,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ecb0>,DrugBank
3,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed00>,DrugBank
4,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed50>,DrugBank
...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913030>,decoys
36237,<rdkit.Chem.rdchem.Mol object at 0x7fac1f9130d0>,decoys
36238,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913170>,decoys
36239,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913210>,decoys


In [4]:
def _maccs_as_bools(mol):
    """Return the MACCS key fingerprint of ``mol`` as a plain list of booleans."""
    return list(map(bool, MACCSkeys.GenMACCSKeys(mol)))


# One list of booleans per molecule, stored as an object column.
df['MACCS_keys'] = df['Mol'].apply(_maccs_as_bools)
df

Unnamed: 0,Mol,Source,MACCS_keys
0,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec10>,DrugBank,"[False, False, False, False, False, False, Fal..."
1,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec60>,DrugBank,"[False, False, False, False, False, False, Fal..."
2,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ecb0>,DrugBank,"[False, False, False, False, False, False, Fal..."
3,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed00>,DrugBank,"[False, False, False, False, False, False, Fal..."
4,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed50>,DrugBank,"[False, False, False, False, False, False, Fal..."
...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913030>,decoys,"[False, False, False, False, False, False, Fal..."
36237,<rdkit.Chem.rdchem.Mol object at 0x7fac1f9130d0>,decoys,"[False, False, False, False, False, False, Fal..."
36238,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913170>,decoys,"[False, False, False, False, False, False, Fal..."
36239,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913210>,decoys,"[False, False, False, False, False, False, Fal..."


In [5]:
def tanimoto(fp1, fp2):
    """Tanimoto (Jaccard) similarity between two binary fingerprints.

    Parameters
    ----------
    fp1, fp2 : sequence
        Equal-length sequences of bits. Each element is interpreted by
        truthiness, so ``True``/``False`` and ``1``/``0`` are equivalent.

    Returns
    -------
    float
        ``|fp1 AND fp2| / |fp1 OR fp2|``, a value in ``[0, 1]``. When neither
        fingerprint has any on-bit (including two empty sequences) the two
        are identical, so ``1.0`` is returned instead of dividing by zero.

    Raises
    ------
    ValueError
        If the two fingerprints do not have the same shape.
    """
    # Force a boolean dtype so that empty input (which numpy would otherwise
    # type as float) and integer input both get bitwise AND/OR semantics.
    bits1 = np.asarray(fp1, dtype=bool)
    bits2 = np.asarray(fp2, dtype=bool)

    # Refuse mismatched lengths explicitly rather than letting numpy broadcast
    # a length-1 vector against a longer one.
    if bits1.shape != bits2.shape:
        raise ValueError(
            f"fingerprints must have the same shape, got {bits1.shape} and {bits2.shape}"
        )

    intersection = np.count_nonzero(bits1 & bits2)
    union = np.count_nonzero(bits1 | bits2)

    # No on-bits anywhere: the fingerprints are identical, avoid 0 / 0.
    if union == 0:
        return 1.0
    return intersection / union

In [6]:
# Hand-checkable example: the two vectors share one on-bit (the last position)
# and have three distinct on-bits between them, so the expected score is 1/3.
tanimoto(
    [False, True, False, True],
    [True, False, False, True],
)

0.3333333333333333

In [7]:
from rdkit.Chem import AllChem

# Morgan fingerprint of every molecule as a bit vector (radius 2, 1024 bits).
# The column key must be the lowercase 'fp': the similarity cell further down
# reads df['fp'], and pandas column labels are case-sensitive, so writing to
# 'FP' makes that lookup fail with a KeyError on a fresh Restart & Run All.
df['fp'] = df['Mol'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024))
df

Unnamed: 0,Mol,Source,MACCS_keys,fp
0,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec10>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ec60>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ecb0>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed00>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ed50>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913030>,decoys,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36237,<rdkit.Chem.rdchem.Mol object at 0x7fac1f9130d0>,decoys,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36238,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913170>,decoys,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36239,<rdkit.Chem.rdchem.Mol object at 0x7fac1f913210>,decoys,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [8]:
# Query compound for the similarity search: paracetamol (acetaminophen).
para_smiles = 'CC(=O)Nc1ccc(O)cc1'
para = Chem.MolFromSmiles(para_smiles)

# Fingerprint the query with the same arguments (radius 2, 1024 bits) used for
# the per-molecule fingerprint column, so the two are directly comparable.
para_fp = AllChem.GetMorganFingerprintAsBitVect(para, 2, nBits=1024)

In [9]:
from rdkit import DataStructs


def _similarity_to_para(fp):
    """Tanimoto similarity between ``fp`` and the module-level query ``para_fp``."""
    return DataStructs.TanimotoSimilarity(fp, para_fp)


# Score every stored fingerprint against the query molecule.
df['tanimoto'] = df['fp'].apply(_similarity_to_para)

# The ranked copy is not assigned to a name; it is only displayed as this
# cell's output, most similar molecule first, and `df` keeps its row order.
df.sort_values(by='tanimoto', ascending=False, ignore_index=True)

Unnamed: 0,Mol,Source,MACCS_keys,fp,tanimoto
0,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff5ad50>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.000000
1,<rdkit.Chem.rdchem.Mol object at 0x7fac1fe8a850>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.600000
2,<rdkit.Chem.rdchem.Mol object at 0x7fac1feed710>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.517241
3,<rdkit.Chem.rdchem.Mol object at 0x7fac1fe10b70>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.421053
4,<rdkit.Chem.rdchem.Mol object at 0x7fac1fe0e170>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.416667
...,...,...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7fac1feccd50>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000
36237,<rdkit.Chem.rdchem.Mol object at 0x7fac1ff242b0>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000
36238,<rdkit.Chem.rdchem.Mol object at 0x7fac1fef1170>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000
36239,<rdkit.Chem.rdchem.Mol object at 0x7fac1fe9d0d0>,DrugBank,"[False, False, False, False, False, False, Fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.000000


In [6]:
# Load a precomputed pairwise matrix from disk and wrap it in a DataFrame.
# NOTE(review): the file is named "dist_matrix", but the displayed diagonal is
# 1.0, which looks like similarities rather than distances — confirm.
# NOTE(review): this cell's execution count (In [6]) is out of sequence with
# the cells above it; confirm it still runs in order on a fresh kernel.
dist_values = np.load('dist_matrix.npy')
D = pd.DataFrame(dist_values)
D

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7903,7904,7905,7906,7907,7908,7909,7910,7911,7912
0,1.000000,0.371069,0.407143,0.492754,0.115385,0.314685,0.201258,0.078740,0.115578,0.182482,...,0.098592,0.106383,0.105634,0.099291,0.127820,0.102941,0.084615,0.175182,0.182482,0.076389
1,0.371069,1.000000,0.306250,0.415584,0.120482,0.265823,0.260870,0.095588,0.125000,0.160000,...,0.127517,0.127517,0.126667,0.128378,0.148936,0.132867,0.125000,0.153333,0.167785,0.105960
2,0.407143,0.306250,1.000000,0.358621,0.130137,0.721154,0.281690,0.075630,0.157609,0.176923,...,0.122137,0.113636,0.121212,0.114504,0.137097,0.128000,0.109244,0.151515,0.159091,0.081481
3,0.492754,0.415584,0.358621,1.000000,0.101266,0.278912,0.201258,0.087302,0.115578,0.140845,...,0.106383,0.090909,0.105634,0.091549,0.119403,0.119403,0.101562,0.219697,0.227273,0.091549
4,0.115385,0.120482,0.130137,0.101266,1.000000,0.138686,0.152174,0.060606,0.124260,0.111111,...,0.078261,0.087719,0.068376,0.098214,0.072727,0.082569,0.079208,0.112069,0.130435,0.078947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7908,0.102941,0.132867,0.128000,0.119403,0.082569,0.118644,0.106557,0.125000,0.099338,0.152174,...,0.428571,0.351351,0.422535,0.356164,0.678571,1.000000,0.133333,0.141304,0.152174,0.207317
7909,0.084615,0.125000,0.109244,0.101562,0.079208,0.118182,0.095652,0.180328,0.075342,0.127907,...,0.109756,0.109756,0.108434,0.111111,0.133333,0.133333,1.000000,0.142857,0.141176,0.058824
7910,0.175182,0.153333,0.151515,0.219697,0.112069,0.144000,0.140625,0.082353,0.099379,0.147059,...,0.132653,0.132653,0.131313,0.134021,0.166667,0.141304,0.142857,1.000000,0.746269,0.134021
7911,0.182482,0.167785,0.159091,0.227273,0.130435,0.142857,0.148438,0.081395,0.105590,0.156863,...,0.131313,0.131313,0.118812,0.144330,0.152174,0.152174,0.141176,0.746269,1.000000,0.121212
