# pdist vs pairwise_distances

In [11]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import pairwise_distances

# Generate a random set of points
points = np.random.rand(5, 2)

# Calculate the pairwise distances using pdist
condensed_distances = pdist(points)
square_distances = squareform(condensed_distances)

# Calculate the pairwise distances using pairwise_distances
square_distances_sklearn = pairwise_distances(points)

print("Square distances (pdist):")
print(square_distances)

print("Square distances (pairwise_distances):")
print(square_distances_sklearn)

Square distances (pdist):
[[0.         0.52084027 1.10653194 0.03936829 0.64664006]
 [0.52084027 0.         0.69551554 0.50929445 0.12581602]
 [1.10653194 0.69551554 0.         1.11368655 0.61952833]
 [0.03936829 0.50929445 1.11368655 0.         0.63495678]
 [0.64664006 0.12581602 0.61952833 0.63495678 0.        ]]
Square distances (pairwise_distances):
[[0.         0.52084027 1.10653194 0.03936829 0.64664006]
 [0.52084027 0.         0.69551554 0.50929445 0.12581602]
 [1.10653194 0.69551554 0.         1.11368655 0.61952833]
 [0.03936829 0.50929445 1.11368655 0.         0.63495678]
 [0.64664006 0.12581602 0.61952833 0.63495678 0.        ]]


## TANIMOTO VS JACCARD

In [2]:
import pandas as pd
import numpy as np

from thesis_work.clustering.utils import (
    efcp_distance_matrix,
    generic_distance_matrix,
)
from thesis_work.utils.utils import get_ecfp_descriptors

In [3]:
smiles_compounds = [
    "O=C(Cc1cccc2ccccc12)Nc1n[nH]c2ccc(N3CCCS3(=O)=O)cc12",
    "COC(=O)NC[C@@H](NC(=O)c1ccc(-c2nc(C3CCOCC3)cnc2N)cc1F)c1cccc(Br)c1",
    "COc1ccccc1Nc1cc(Oc2cc(C)c(C)nc2-c2ccccn2)ccn1",
    "O=C(/C=C/CN1CCCC1)N1CCOc2cc3ncnc(Nc4ccc(F)c(Cl)c4)c3cc21",
    "O=C(Nc1cccc(Nc2cc3c(=O)[nH][nH]c(=O)c3cc2Cl)c1)c1cccc(Cl)c1",
    "Cc1cc(CNc2nc(Nc3cc(C4CC4)[nH]n3)cc(NC3CC4CCC(C3)N4C)n2)on1",
    "Cc1cc(-c2cc(O)ccc2Cl)cc2nnc(Nc3ccc(S(N)(=O)=O)cc3)nc12",
    "NS(=O)(=O)c1cccc(N/C=C2\C(=O)Nc3ccccc32)c1",
    "CC(=O)Nc1ccc2cnn(-c3cc(NC4CC4)n4ncc(C#N)c4n3)c2c1",
    "CS(=O)(=O)c1cccc(Nc2nccc(N(CC#N)c3c(Cl)ccc4c3OCO4)n2)c1",
    "Cc1cnc(-c2ccnc(C(C)(C)O)n2)cc1-n1c(C)cc(OCc2ccc(F)cc2F)c(Cl)c1=O",
    "Cc1ccc(C(=O)Nc2cc(C(F)(F)F)ccn2)cc1/C=C/n1cnc2cncnc21",
    "CNC(=O)c1cnn2ccc(N3C[C@@H](O)C[C@@H]3c3cccc(F)c3)nc12",
    "COc1cc2c(cc1OC1CCOC1)Cc1c-2n[nH]c1-c1ccc(C#N)cc1",
]

smiles_series = pd.Series(smiles_compounds)

In [4]:
return_type = "original"  # BUTINA
# return_type = "numpy" # OTHERS

vector_embeddings = get_ecfp_descriptors(
    smiles_series=smiles_series,
    radius=2,
    nBits=2048,
    inner_return_type=return_type,
)

vector_embeddings

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39000>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39070>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c390e0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39150>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c391c0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39230>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c392a0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39310>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39380>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c393f0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39460>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c394d0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c39540>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f2e20c395b0>]

In [5]:
distance_matrix_rdkit = efcp_distance_matrix(
    ecfps=vector_embeddings, method="fast", return_upper_tringular=True
)

distance_matrix_rdkit

array([0.87826087, 0.90721649, 0.87155963, 0.85046729, 0.85245902,
       0.91666667, 0.85057471, 0.89622642, 0.89411765, 0.81914894,
       0.87254902, 0.87179487, 0.8877551 , 0.9137931 , 0.93814433,
       0.90909091, 0.88392857, 0.87912088, 0.83333333, 0.85714286,
       0.9009901 , 0.88505747, 0.90384615, 0.91666667, 0.90909091,
       0.83561644, 0.94736842, 0.79220779, 0.87      , 0.86956522,
       0.90816327, 0.88288288, 0.87777778, 0.87378641, 0.87628866,
       0.8988764 , 0.85849057, 0.85950413, 0.87254902, 0.83185841,
       0.82795699, 0.89285714, 0.79381443, 0.82222222, 0.85981308,
       0.88888889, 0.85833333, 0.83673469, 0.84070796, 0.90909091,
       0.91150442, 0.8627451 , 0.93939394, 0.9       , 0.86842105,
       0.8952381 , 0.88235294, 0.84210526, 0.84545455, 0.83146067,
       0.90825688, 0.85714286, 0.90217391, 0.85294118, 0.88392857,
       0.8411215 , 0.85858586, 0.80733945, 0.88541667, 0.87272727,
       0.87777778, 0.8952381 , 0.8989899 , 0.8988764 , 0.8    

In [4]:
# return_type = "original" # BUTINA
return_type = "numpy"  # OTHERS

vector_embeddings_numpy = get_ecfp_descriptors(
    smiles_series=smiles_series,
    radius=2,
    nBits=2048,
    inner_return_type=return_type,
)

vector_embeddings_numpy

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [5]:
distance_matrix_sklearn = generic_distance_matrix(
    x=vector_embeddings_numpy, metric="jaccard", return_upper_tringular=True
)

distance_matrix_sklearn

array([0.87826087, 0.90721649, 0.85046729, 0.85057471, 0.87254902,
       0.90909091, 0.88505747, 0.87      , 0.85849057, 0.88888889,
       0.8952381 , 0.85858586, 0.89320388, 0.87155963, 0.85245902,
       0.89622642, 0.87179487, 0.88392857, 0.90384615, 0.86956522,
       0.85950413, 0.85833333, 0.88235294, 0.80733945, 0.84070796,
       0.91666667, 0.89411765, 0.8877551 , 0.87912088, 0.91666667,
       0.90816327, 0.87254902, 0.83673469, 0.84210526, 0.88541667,
       0.85106383, 0.81914894, 0.9137931 , 0.83333333, 0.90909091,
       0.88288288, 0.83185841, 0.84070796, 0.84545455, 0.87272727,
       0.92241379, 0.93814433, 0.85714286, 0.83561644, 0.87777778,
       0.82795699, 0.90909091, 0.83146067, 0.87777778, 0.94845361,
       0.9009901 , 0.94736842, 0.87378641, 0.89285714, 0.91150442,
       0.90825688, 0.8952381 , 0.86407767, 0.79220779, 0.87628866,
       0.79381443, 0.8627451 , 0.85714286, 0.8989899 , 0.92156863,
       0.8988764 , 0.82222222, 0.93939394, 0.90217391, 0.89887

In [11]:
distance_matrix_sklearn == distance_matrix_rdkit

np.allclose(distance_matrix_sklearn, distance_matrix_rdkit)

False