# Structural similarity

In [1]:
from rdkit import Chem
from rdkit import RDLogger
import matplotlib.pyplot as plt
import pandas as pd
import gzip
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
import seaborn as sb
from itertools import chain
from rdkit.Chem import MACCSkeys
import numpy as np

# Turn off RDKit's 'rdApp.error' log channel for the rest of the session
# (presumably to keep per-record parse errors out of the cell outputs - confirm).
RDLogger.DisableLog('rdApp.error')

In [2]:
# Load the three molecule sets and tag every molecule with the set it came from.
# Entries for which a supplier yields None (records it could not turn into a
# molecule) are skipped.
suppl = Chem.SDMolSupplier("../data/drugbank.sdf")
drug_bank = [[mol, 'drugbank'] for mol in suppl if mol is not None]

with gzip.open("../data/actives_final.sdf.gz") as sdf:
    supp_actives = Chem.ForwardSDMolSupplier(sdf)
    # NOTE(review): the label is spelled 'acives' (sic). It is kept verbatim here
    # because other cells may filter on this exact value; rename it everywhere at once.
    actives = [[mol, 'acives'] for mol in supp_actives if mol is not None]

with gzip.open("../data/decoys_final.sdf.gz") as sdf:
    supp_decoys = Chem.ForwardSDMolSupplier(sdf)
    decoys = [[mol, 'decoys'] for mol in supp_decoys if mol is not None]

df_db = pd.DataFrame(drug_bank, columns=['Mol', 'Source'])
df_ac = pd.DataFrame(actives, columns=['Mol', 'Source'])
df_dc = pd.DataFrame(decoys, columns=['Mol', 'Source'])

# Down-sample the decoys to exactly as many molecules as there are actives so
# both sets have the same size. This is a uniform random draw with a fixed
# seed (reproducible); it does NOT optimise for structural diversity.
df_dc_small = df_dc.sample(n=len(df_ac), random_state=42)

# Single frame with a fresh 0..N-1 index: drugbank first, then actives, then decoys.
df = pd.concat([df_db, df_ac, df_dc_small], ignore_index=True)
df

Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7fba675f95b0>,drugbank
1,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9770>,drugbank
2,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9690>,drugbank
3,<rdkit.Chem.rdchem.Mol object at 0x7fba675f97e0>,drugbank
4,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9850>,drugbank
...,...,...
8866,<rdkit.Chem.rdchem.Mol object at 0x7fba66fea5e0>,decoys
8867,<rdkit.Chem.rdchem.Mol object at 0x7fba670b3610>,decoys
8868,<rdkit.Chem.rdchem.Mol object at 0x7fba66d8b5a0>,decoys
8869,<rdkit.Chem.rdchem.Mol object at 0x7fba66e8ee30>,decoys


In [3]:
from rdkit.Chem import AllChem


def _morgan_bitvect(mol):
    """Return the Morgan bit-vector fingerprint of ``mol`` (radius 2, 1024 bits)."""
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)


# One fingerprint per molecule, stored in a new 'FP' column.
df['FP'] = df['Mol'].apply(_morgan_bitvect)
df

Unnamed: 0,Mol,Source,FP
0,<rdkit.Chem.rdchem.Mol object at 0x7fba675f95b0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9770>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9690>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,<rdkit.Chem.rdchem.Mol object at 0x7fba675f97e0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9850>,drugbank,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...,...
8866,<rdkit.Chem.rdchem.Mol object at 0x7fba66fea5e0>,decoys,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
8867,<rdkit.Chem.rdchem.Mol object at 0x7fba670b3610>,decoys,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8868,<rdkit.Chem.rdchem.Mol object at 0x7fba66d8b5a0>,decoys,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8869,<rdkit.Chem.rdchem.Mol object at 0x7fba66e8ee30>,decoys,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
# Write the current frame (columns Mol, Source and FP at this point) to ../data/df.npy.
# NOTE(review): np.save stores a plain NumPy array, so the DataFrame's column names are
# not kept, and an array of Python objects is written via pickle - reading it back
# presumably needs np.load(..., allow_pickle=True). Confirm what the consumer of this
# file expects before changing the format.
np.save("../data/df.npy", df)

In [5]:
def _maccs_flags(mol):
    """Return the MACCS keys of ``mol`` as a plain list of booleans, one per bit."""
    return list(map(bool, MACCSkeys.GenMACCSKeys(mol)))


# One list of boolean flags per molecule, stored in a new 'MACCS_keys' column.
df['MACCS_keys'] = df['Mol'].apply(_maccs_flags)
df

Unnamed: 0,Mol,Source,FP,MACCS_keys
0,<rdkit.Chem.rdchem.Mol object at 0x7fba675f95b0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
1,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9770>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
2,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9690>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
3,<rdkit.Chem.rdchem.Mol object at 0x7fba675f97e0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
4,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9850>,drugbank,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
...,...,...,...,...
8866,<rdkit.Chem.rdchem.Mol object at 0x7fba66fea5e0>,decoys,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8867,<rdkit.Chem.rdchem.Mol object at 0x7fba670b3610>,decoys,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8868,<rdkit.Chem.rdchem.Mol object at 0x7fba66d8b5a0>,decoys,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8869,<rdkit.Chem.rdchem.Mol object at 0x7fba66e8ee30>,decoys,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."


In [6]:
# Display the frame again; this cell performs no computation.
df

Unnamed: 0,Mol,Source,FP,MACCS_keys
0,<rdkit.Chem.rdchem.Mol object at 0x7fba675f95b0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
1,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9770>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
2,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9690>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
3,<rdkit.Chem.rdchem.Mol object at 0x7fba675f97e0>,drugbank,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
4,<rdkit.Chem.rdchem.Mol object at 0x7fba675f9850>,drugbank,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
...,...,...,...,...
8866,<rdkit.Chem.rdchem.Mol object at 0x7fba66fea5e0>,decoys,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8867,<rdkit.Chem.rdchem.Mol object at 0x7fba670b3610>,decoys,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8868,<rdkit.Chem.rdchem.Mol object at 0x7fba66d8b5a0>,decoys,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
8869,<rdkit.Chem.rdchem.Mol object at 0x7fba66e8ee30>,decoys,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."


In [7]:
# Number of molecules per source: one count for each distinct label in the 'Source' column.
df['Source'].value_counts()

drugbank    7117
acives       877
decoys       877
Name: Source, dtype: int64

In [8]:
# Query molecule given as SMILES (paracetamol / acetaminophen).
PARACETAMOL_SMILES = "CC(=O)Nc1ccc(O)cc1"
para = Chem.MolFromSmiles(PARACETAMOL_SMILES)

# Same fingerprint call and settings as used for the 'FP' column (radius 2, 1024 bits),
# so the query can be compared against those fingerprints directly.
para_fp = AllChem.GetMorganFingerprintAsBitVect(para, 2, nBits=1024)

In [9]:
from rdkit import DataStructs


def _similarity_to_query(fp):
    """Return the Tanimoto similarity between ``fp`` and the query fingerprint ``para_fp``."""
    return DataStructs.TanimotoSimilarity(fp, para_fp)


# Similarity of every molecule to the query, stored in a new 'tanimoto' column.
df['tanimoto'] = df['FP'].apply(_similarity_to_query)

# Most similar molecules first. Only the sorted copy is displayed; `df` keeps its row order.
df.sort_values(by='tanimoto', ascending=False, ignore_index=True)

Unnamed: 0,Mol,Source,FP,MACCS_keys,tanimoto
0,<rdkit.Chem.rdchem.Mol object at 0x7fba674076f0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",1.000000
1,<rdkit.Chem.rdchem.Mol object at 0x7fba674bc3c0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.600000
2,<rdkit.Chem.rdchem.Mol object at 0x7fba6749e3b0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.517241
3,<rdkit.Chem.rdchem.Mol object at 0x7fba6734c820>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.421053
4,<rdkit.Chem.rdchem.Mol object at 0x7fba67301770>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.416667
...,...,...,...,...,...
8866,<rdkit.Chem.rdchem.Mol object at 0x7fba6733ece0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
8867,<rdkit.Chem.rdchem.Mol object at 0x7fba6733f680>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
8868,<rdkit.Chem.rdchem.Mol object at 0x7fba674a0120>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
8869,<rdkit.Chem.rdchem.Mol object at 0x7fba6745d7e0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[False, False, False, False, False, False, Fal...",0.000000


In [21]:
# Goal of this part: from a large set of structures, select the ones that are least
# similar to each other so the selection covers as much chemical diversity as possible.
# The original plan noted here is to keep the 5% most diverse structures.
# Entry [i, j] of the matrix is the pairwise value for the i-th and j-th structures.
# Size reference for 8871 structures: 1% is 88, 5% is 444, 10% is 887.
#
# NOTE(review): the file is called a distance matrix, but the values shown when it is
# displayed have 1.0 on the diagonal, which looks like a similarity matrix - confirm
# against the code that wrote the file.
pairwise_path = "../data/dist_matrix.npy"
dist_matrix = pd.DataFrame(np.load(pairwise_path))
dist_matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8861,8862,8863,8864,8865,8866,8867,8868,8869,8870
0,1.000000,0.371069,0.407143,0.492754,0.115385,0.314685,0.201258,0.078740,0.115000,0.182482,...,0.117647,0.121429,0.111940,0.094203,0.124138,0.138889,0.156716,0.101351,0.155556,0.058824
1,0.371069,1.000000,0.306250,0.415584,0.120482,0.265823,0.260870,0.095588,0.124402,0.160000,...,0.108108,0.134228,0.102740,0.086667,0.143791,0.142857,0.151724,0.143791,0.150685,0.083333
2,0.407143,0.306250,1.000000,0.358621,0.130137,0.721154,0.281690,0.075630,0.156757,0.176923,...,0.117188,0.165354,0.102362,0.109375,0.132353,0.139706,0.168000,0.107914,0.176000,0.054688
3,0.492754,0.415584,0.358621,1.000000,0.101266,0.278912,0.201258,0.087302,0.120603,0.140845,...,0.134328,0.137681,0.103704,0.102190,0.234848,0.138889,0.156716,0.108844,0.147059,0.099237
4,0.115385,0.120482,0.130137,0.101266,1.000000,0.138686,0.152174,0.060606,0.130178,0.111111,...,0.111111,0.126126,0.083333,0.081818,0.082645,0.118644,0.088496,0.100840,0.097345,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8866,0.138889,0.142857,0.139706,0.138889,0.118644,0.140625,0.146154,0.117647,0.096970,0.142857,...,0.100000,0.084906,0.150538,0.123711,0.120370,1.000000,0.141414,0.130841,0.151515,0.120879
8867,0.156716,0.151724,0.168000,0.156716,0.088496,0.161017,0.166667,0.102564,0.075000,0.193548,...,0.160920,0.164835,0.166667,0.123596,0.166667,0.141414,1.000000,0.131313,0.179775,0.094118
8868,0.101351,0.143791,0.107914,0.108844,0.100840,0.106870,0.112782,0.105882,0.104294,0.122642,...,0.112245,0.117647,0.115789,0.113402,0.224490,0.130841,0.131313,1.000000,0.141414,0.063158
8869,0.155556,0.150685,0.176000,0.147059,0.097345,0.200000,0.137097,0.129870,0.101911,0.204301,...,0.159091,0.188889,0.137931,0.122222,0.153061,0.151515,0.179775,0.141414,1.000000,0.105882


In [22]:
# Zero the diagonal so the value of each structure against itself does not
# contribute to any later row-wise aggregation.
#
# The work is done on an explicit float64 copy of the data instead of writing
# through `dist_matrix.values`: that attribute is not guaranteed to be a writable
# view of the frame (it is read-only when pandas Copy-on-Write is active), so
# mutating it in place can either raise or silently leave the frame unchanged.
pairwise_values = dist_matrix.to_numpy(dtype=np.float64, copy=True)
np.fill_diagonal(pairwise_values, 0.0)

# Rebuild the frame with the original labels. The result is float64 with a zero
# diagonal. Note that nothing here checks or enforces that the values are positive.
dist_matrix = pd.DataFrame(
    pairwise_values,
    index=dist_matrix.index,
    columns=dist_matrix.columns,
)
dist_matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8861,8862,8863,8864,8865,8866,8867,8868,8869,8870
0,0.000000,0.371069,0.407143,0.492754,0.115385,0.314685,0.201258,0.078740,0.115000,0.182482,...,0.117647,0.121429,0.111940,0.094203,0.124138,0.138889,0.156716,0.101351,0.155556,0.058824
1,0.371069,0.000000,0.306250,0.415584,0.120482,0.265823,0.260870,0.095588,0.124402,0.160000,...,0.108108,0.134228,0.102740,0.086667,0.143791,0.142857,0.151724,0.143791,0.150685,0.083333
2,0.407143,0.306250,0.000000,0.358621,0.130137,0.721154,0.281690,0.075630,0.156757,0.176923,...,0.117188,0.165354,0.102362,0.109375,0.132353,0.139706,0.168000,0.107914,0.176000,0.054688
3,0.492754,0.415584,0.358621,0.000000,0.101266,0.278912,0.201258,0.087302,0.120603,0.140845,...,0.134328,0.137681,0.103704,0.102190,0.234848,0.138889,0.156716,0.108844,0.147059,0.099237
4,0.115385,0.120482,0.130137,0.101266,0.000000,0.138686,0.152174,0.060606,0.130178,0.111111,...,0.111111,0.126126,0.083333,0.081818,0.082645,0.118644,0.088496,0.100840,0.097345,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8866,0.138889,0.142857,0.139706,0.138889,0.118644,0.140625,0.146154,0.117647,0.096970,0.142857,...,0.100000,0.084906,0.150538,0.123711,0.120370,0.000000,0.141414,0.130841,0.151515,0.120879
8867,0.156716,0.151724,0.168000,0.156716,0.088496,0.161017,0.166667,0.102564,0.075000,0.193548,...,0.160920,0.164835,0.166667,0.123596,0.166667,0.141414,0.000000,0.131313,0.179775,0.094118
8868,0.101351,0.143791,0.107914,0.108844,0.100840,0.106870,0.112782,0.105882,0.104294,0.122642,...,0.112245,0.117647,0.115789,0.113402,0.224490,0.130841,0.131313,0.000000,0.141414,0.063158
8869,0.155556,0.150685,0.176000,0.147059,0.097345,0.200000,0.137097,0.129870,0.101911,0.204301,...,0.159091,0.188889,0.137931,0.122222,0.153061,0.151515,0.179775,0.141414,0.000000,0.105882


 I have a large number of structures and want to select those that are least similar to each other,
 so that the selection covers as much chemical diversity as possible. The 5% most diverse structures are selected.
 D is a distance matrix, where D[i, j] is the distance between the i-th and j-th structures.
 There are 8871 structures, so 1% is 88, 5% is 444, and 10% is 887.

In [29]:
# Pick the structures that are, on average, least similar to everything else.
#
# `dist_matrix` holds one row and one column per structure. Its entries were computed
# with the Tanimoto similarity on fingerprints, and the matrix shows 1.0 on the diagonal
# when first loaded, so a HIGH value means two structures are alike. The row sum is
# therefore the total similarity of one structure to all others, and the structures
# with the LOWEST row sums are the most dissimilar ("most diverse") ones.
#
# NOTE(review): ranking by row sum favours structures far from the bulk of the data set;
# it does not guarantee that the selected structures are dissimilar to each other.
# A MaxMin-style picker would address that if it matters for the analysis.

# Fraction of the data set to keep (1% of 8871 structures -> 88).
DIVERSE_FRACTION = 0.01

# The selection below is purely positional, so the matrix must be square and must have
# exactly one row per row of `df`.
assert dist_matrix.shape[0] == dist_matrix.shape[1] == len(df), (
    "dist_matrix must be square with one row/column per row of df"
)

# Total similarity of each structure to all other structures (low = more diverse).
diversity = dist_matrix.sum(axis=1).to_numpy()

# Row positions ordered from lowest to highest total similarity.
sorted_diversity = diversity.argsort().tolist()

# Keep the requested fraction of structures, taken from the low-similarity end.
n_most_diverse = int(len(sorted_diversity) * DIVERSE_FRACTION)
df_most_diverse = df.iloc[sorted_diversity[:n_most_diverse]]
df_most_diverse


Unnamed: 0,Mol,Source,FP,MACCS_keys,tanimoto
6854,<rdkit.Chem.rdchem.Mol object at 0x7fba6733ed50>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
6984,<rdkit.Chem.rdchem.Mol object at 0x7fba673466c0>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
1199,<rdkit.Chem.rdchem.Mol object at 0x7fba67436d50>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
4665,<rdkit.Chem.rdchem.Mol object at 0x7fba674da730>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
7020,<rdkit.Chem.rdchem.Mol object at 0x7fba67347680>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.000000
...,...,...,...,...,...
4516,<rdkit.Chem.rdchem.Mol object at 0x7fba674d2570>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.033333
6707,<rdkit.Chem.rdchem.Mol object at 0x7fba6733ac70>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.037037
6840,<rdkit.Chem.rdchem.Mol object at 0x7fba6733e730>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.040000
6824,<rdkit.Chem.rdchem.Mol object at 0x7fba6733e030>,drugbank,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",0.038462
