In [None]:
# Install conda
# condacolab is installed with pip first, then used to set up conda in this runtime.
# NOTE(review): the next cell re-imports condacolab and calls check(), which
# suggests install() restarts the runtime - confirm against the condacolab docs.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Check conda installation
# (condacolab is imported again here, so this cell does not rely on the import
# made in the installation cell.)
import condacolab
condacolab.check()

# Install required packages
# All packages are taken from the conda-forge channel; no versions are pinned.
!mamba install -c conda-forge pandas numpy matplotlib rdkit seaborn scikit-learn

# Download required files
# NOTE(review): the archive presumably provides the BDB*.tsv files read further
# below - confirm by inspecting the tar listing printed by the -v flag.
!wget https://github.com/lillgroup/AIiDD/releases/download/v2.0.0/lab1.tar.gz
!tar -xzvf lab1.tar.gz

# Exploring the chemical space by Principal Component Analysis (PCA) and clustering

In this workflow, we will analyze compounds binding to different targets with the aim to cluster molecules with similar properties.

Can the compounds be separated using unsupervised learning (PCA & clustering) based on their target class?


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, Crippen, AllChem

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.cm as cm
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

### Databases of molecules

The library contains molecules in SMILES codes (and other representations and information) binding to 5 different protein targets.

In [None]:
# Read the five tab-separated ligand files, one per protein target, and tag
# every row with an integer TargetID so the target classes can be told apart.
target_files = {
    0: "BDBcomp_50025598.tsv",
    1: "BDBcomp_191.tsv",
    2: "BDBpoly_1201.tsv",
    3: "BDBpoly_1949.tsv",
    4: "BDBcomp_304.tsv",
}
frames = [
    pd.read_table(path, sep="\t").assign(TargetID=target_id)
    for target_id, path in target_files.items()
]

# Keep only the columns used in this lab and give them short names.
# Selecting with a list and taking an explicit copy makes `table` an
# independent frame, so the column assignments in later cells never act on a
# slice of a bigger frame (the situation behind pandas' SettingWithCopyWarning).
table = (
    pd.concat(frames, axis=0, ignore_index=True)
    .loc[
        :,
        [
            "Ligand SMILES",
            "Ligand InChI",
            "Target Name Assigned by Curator or DataSource",
            "TargetID",
            "IC50 (nM)",
        ],
    ]
    .rename(
        columns={
            "Ligand SMILES": "SMILES",
            "Target Name Assigned by Curator or DataSource": "Target",
            "Ligand InChI": "InChI",
        }
    )
    .copy()
)

# Row labels of the compounds belonging to each target class; later cells use
# them to colour the plots by target.
class0 = table.index[table.TargetID == 0]
class1 = table.index[table.TargetID == 1]
class2 = table.index[table.TargetID == 2]
class3 = table.index[table.TargetID == 3]
class4 = table.index[table.TargetID == 4]
print(len(class0), len(class1), len(class2), len(class3), len(class4))

Let's take a look at the first 10 rows of our table of compounds.

In [None]:
# Display the first ten rows of the compound table.
table.head(n=10)

### Calculation of molecular descriptors

We use RDKit to calculate a few molecular descriptors (1D & 2D).

A list of all descriptors that can be calculated with RDKit can be found at https://www.rdkit.org/docs/GettingStartedInPython.html.

=========================================================================================================================================================
## PLACE TO CHANGE:
Add or replace descriptors;
see http://rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html for other descriptors.

For addition: CalcNumAliphaticRings, CalcNumAromaticRings, CalcNumHeterocycles, CalcNumRings, CalcRadiusOfGyration, CalcSpherocityIndex

For replacement: GetUSR, GetUSRCAT, GetMorganFingerprint, GetMACCSKeysFingerprint, GetAtomPairFingerprint, CalcWHIM

Example:
```python
m2=Chem.AddHs(mol)
AllChem.EmbedMolecule(m2)
desc_vec = Chem.rdMolDescriptors.GetUSR(m2)
```
`desc_vec` is a vector; its elements need to be split into individual columns of the pandas table so that they can be used in the same manner as the individual descriptors.

=========================================================================================================================================================

In [None]:
# We calculate the descriptors and add them to our table
for i in table.index:
    mol = Chem.MolFromSmiles(table.loc[i, "SMILES"])
    table.loc[i, "MolWt"] = Descriptors.ExactMolWt(mol)
    table.loc[i, "TPSA"] = Chem.rdMolDescriptors.CalcTPSA(
        mol
    )  # Topological Polar Surface Area
    table.loc[i, "nRotB"] = Descriptors.NumRotatableBonds(
        mol
    )  # Number of rotable bonds
    table.loc[i, "HBD"] = Descriptors.NumHDonors(mol)  # Number of H bond donors
    table.loc[i, "HBA"] = Descriptors.NumHAcceptors(mol)  # Number of H bond acceptors
    table.loc[i, "LogP"] = Descriptors.MolLogP(mol)  # LogP

As a result, we will get a table with all descriptors for each molecule (SMILES code).

In [None]:
# Display the first ten rows, now including the descriptor columns.
table.head(n=10)

----------------------------------------------------------------------------------------------
### Principal Component Analysis of calculated molecular descriptors (PCA)



=========================================================================================================================================================
## PLACE TO CHANGE

other descriptors

=========================================================================================================================================================

### Descriptors

In [None]:
# Descriptor matrix as a plain NumPy array: one row per compound, one column
# per descriptor, in the order listed here.
descriptor_columns = ["MolWt", "TPSA", "nRotB", "HBD", "HBA", "LogP"]
descriptors = table[descriptor_columns].to_numpy()

### Standardization

A very important step is to standardize the scales of the descriptors. PCA is sensitive to scale: a descriptor measured on a larger numeric scale has a larger variance and would therefore dominate the principal components. More info about this topic can be found at https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html

In [None]:
# Fit the scaler on the descriptor matrix and apply it to the same data.
scaler = StandardScaler()
descriptors_std = scaler.fit(descriptors).transform(descriptors)
print(descriptors_std)

### PCA

Now, we are ready to calculate the PCA

In [None]:
# Fit the PCA on the standardized descriptors and project them in one step.
# n_components=None is the default, spelled out here to show that the number
# of components is not restricted.
pca = PCA(n_components=None)
descriptors_2d = pca.fit_transform(descriptors_std)

Let's add the PCA data to a new table.

In [None]:
# Wrap the projected coordinates in a DataFrame that shares the row index of
# `table` and whose columns are named PC1, PC2, ...
pc_names = ["PC{}".format(k) for k in range(1, descriptors_2d.shape[1] + 1)]
descriptors_pca = pd.DataFrame(descriptors_2d, index=table.index, columns=pc_names)
descriptors_pca.head(10)

### Explained variance

We can check the explained variance to see the variance explained by each component from PCA

In [None]:
# Fraction of the total variance carried by each component, followed by the
# sum over all components.
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
print(sum(variance_ratio))

And also, we can plot such data

In [None]:
plt.rcParams["axes.linewidth"] = 1.5
# A single subplots() call creates both the figure and its axes; an additional
# plt.figure() beforehand would only leave an empty figure in the output.
fig, ax = plt.subplots(figsize=(8, 6))

# Cumulative percentage of variance explained by the first k components.
var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3) * 100)
component_numbers = np.arange(1, len(var) + 1)

ax.plot(component_numbers, var, "k-", linewidth=2)
ax.set_xticks(component_numbers)
ax.set_ylabel("% Variance Explained", fontsize=16, fontweight="bold")
ax.set_xlabel("Principal Component (PC)", fontsize=16, fontweight="bold")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
# Style the ticks before tight_layout so the layout accounts for the final
# tick-label size.
ax.tick_params("both", width=2, labelsize=12)
fig.tight_layout()

As you can see, PC1 and PC2 together explain 89 % of the variance. So we can plot PC1 vs PC2 to see the distribution of our compounds.

In [None]:
# Scatter PC1 against PC2, drawing each target class in its own colour.
class_colors = [
    (class0, "k"),
    (class1, "b"),
    (class2, "g"),
    (class3, "r"),
    (class4, "y"),
]

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

for members, shade in class_colors:
    ax.plot(
        descriptors_pca["PC1"][members],
        descriptors_pca["PC2"][members],
        "o",
        color=shade,
    )

ax.set_title(
    "Principal Component Analysis", fontsize=16, fontweight="bold", family="sans-serif"
)
ax.set_xlabel("PC1", fontsize=14, fontweight="bold")
ax.set_ylabel("PC2", fontsize=14, fontweight="bold")

plt.tick_params("both", width=2, labelsize=12)

plt.tight_layout()
plt.show()

However, this plot is simple and we cannot identify compound clusters easily. For that purpose, we can perform a clustering analysis on the PCA values to identify groups of compounds mathematically. Moreover, a plot of PC1 vs PC2 (or any other combination) won't tell us which feature (descriptor) is most important for explaining the variance of our values.

For this example, we will identify the most important feature (descriptor), and we will identify compound clusters with the k-means clustering algorithm. For more info about k-means, you can look at the scikit-learn documentation.

### K-means clustering and main features identification

The first step for this analysis is to re-scale our PCA values so that they lie within the interval from -1 to 1 (each component is divided by its range, max − min). This is a typical procedure for distance-based clustering such as k-means clustering, where both components are then equally weighted in the distance calculation. It also allows us to analyze our data inside the correlation circle of the features (descriptors). For this purpose we type:

In [None]:
# This normalization will be performed just for PC1 and PC2, but can be done for all the components.
scale1 = 1.0 / (max(descriptors_pca["PC1"]) - min(descriptors_pca["PC1"]))
scale2 = 1.0 / (max(descriptors_pca["PC2"]) - min(descriptors_pca["PC2"]))

# And we add the new values to our PCA table
descriptors_pca["PC1_normalized"] = [i * scale1 for i in descriptors_pca["PC1"]]
descriptors_pca["PC2_normalized"] = [i * scale2 for i in descriptors_pca["PC2"]]

In [None]:
# Display the first ten rows, now including the normalized PC1/PC2 columns.
descriptors_pca.head(n=10)

In [None]:
# Same scatter as before, but on the normalized PC1/PC2 coordinates; each
# target class keeps its colour.
class_colors = [
    (class0, "k"),
    (class1, "b"),
    (class2, "g"),
    (class3, "r"),
    (class4, "y"),
]

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

for members, shade in class_colors:
    ax.plot(
        descriptors_pca["PC1_normalized"][members],
        descriptors_pca["PC2_normalized"][members],
        "o",
        color=shade,
    )

ax.set_title(
    "Principal Component Analysis", fontsize=16, fontweight="bold", family="sans-serif"
)
ax.set_xlabel("PC1", fontsize=14, fontweight="bold")
ax.set_ylabel("PC2", fontsize=14, fontweight="bold")

plt.tick_params("both", width=2, labelsize=12)

plt.tight_layout()
plt.show()

As you can see, the distribution of the points is the same as before; however, the values now lie within the range from -1 to 1.

### K-means clustering

K-means clustering is an algorithm in which the user must define the number of clusters. However, different methods can be applied to select the number of clusters mathematically, based on the distribution of the points. Here, we will use silhouette analysis to identify the best number of clusters for our distribution. More info about the silhouette algorithm can be found at https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html and here.

In [None]:
# For every candidate number of clusters, run k-means on the normalized
# PC1/PC2 coordinates and draw three panels side by side:
#   ax1 - silhouette values of the samples, grouped by cluster
#   ax2 - the compounds coloured by k-means cluster, with the cluster centres
#   ax3 - the same compounds coloured by their target class
cluster_input = descriptors_pca[["PC1_normalized", "PC2_normalized"]]
target_classes = [
    (class0, "k"),
    (class1, "b"),
    (class2, "g"),
    (class3, "r"),
    (class4, "y"),
]

range_n_clusters = [2, 3, 4, 5, 6, 7]
for n_clusters in range_n_clusters:
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
    fig.set_size_inches(8, 4)

    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = kmeans.fit_predict(cluster_input)
    silhouette_avg = silhouette_score(cluster_input, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    sample_silhouette_values = silhouette_samples(cluster_input, cluster_labels)

    # Bands are stacked bottom-up along y, leaving a gap of 10 between clusters.
    y_lower = 10
    for cluster_id in range(n_clusters):
        # Sorted silhouette values of the samples assigned to this cluster
        band_values = np.sort(sample_silhouette_values[cluster_labels == cluster_id])
        band_size = band_values.shape[0]
        y_upper = y_lower + band_size

        band_color = cm.nipy_spectral(float(cluster_id) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            band_values,
            facecolor=band_color,
            edgecolor=band_color,
            alpha=0.7,
        )

        # Cluster number, written at the vertical middle of its band
        ax1.text(-0.05, y_lower + 0.5 * band_size, str(cluster_id))

        # Start of the next band
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Dashed vertical line at the average silhouette score of all samples
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    # Second panel: the clusters that were actually formed
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        descriptors_pca["PC1_normalized"],
        descriptors_pca["PC2_normalized"],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=point_colors,
        edgecolor="k",
    )

    # White discs at the cluster centres, each overlaid with its cluster number
    centers = kmeans.cluster_centers_
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for center_id, center in enumerate(centers):
        ax2.scatter(
            center[0],
            center[1],
            marker="$%d$" % center_id,
            alpha=1,
            s=50,
            edgecolor="k",
        )

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("PC1")
    ax2.set_ylabel("PC2")

    # Third panel: the same points, coloured by target class
    for members, shade in target_classes:
        ax3.scatter(
            descriptors_pca["PC1_normalized"][members],
            descriptors_pca["PC2_normalized"][members],
            marker=".",
            c=shade,
            alpha=0.7,
            s=30,
            edgecolor=shade,
        )

    ax3.set_title("Target classes.")
    ax3.set_xlabel("PC1")
    ax3.set_ylabel("PC2")

    plt.suptitle(
        (
            "Silhouette analysis for KMeans clustering on sample data "
            "with n_clusters = %d" % n_clusters
        ),
        fontsize=14,
        fontweight="bold",
    )


plt.show()

The higher the silhouette score, the better the cluster distribution.

Despite this fact, let us here use 5 clusters for the following analysis (since we know that we have five different protein targets; although its use would be "supervising" the procedure).

In [None]:
# Cluster the compounds into five groups using PC1 vs PC2 (normalized values).
pc12_normalized = descriptors_pca[["PC1_normalized", "PC2_normalized"]]
kmeans = KMeans(n_clusters=5, random_state=10)  # We define the best number of clusters
clusters = kmeans.fit(pc12_normalized)

Once the calculation of clusters is done, we can add the result to our PCA table.

In [None]:
# Store each compound's cluster label in the PCA table, labelled with the row
# index of `table` so that the rows line up.
cluster_column = pd.Series(clusters.labels_, index=table.index)
descriptors_pca["Cluster_PC1_PC2"] = cluster_column

descriptors_pca.head(10)

Now everything together

We will plot PC1 vs PC2 data. Each cluster will have a different color, and we will find the main feature for each principal component.

=========================================================================================================================================================
## PLACE TO CHANGE (OR HIDE)
=========================================================================================================================================================

In [None]:
plt.rcParams["axes.linewidth"] = 1.5

# A single subplots() call creates both the figure and its axes; an additional
# plt.figure() beforehand would only leave an empty figure in the output.
fig, ax = plt.subplots(figsize=(7, 7))

# Colour assigned to each cluster label
color_code = {
    0: "magenta",
    1: "orange",
    2: "cyan",
    3: "green",
    4: "blue",
    5: "yellow",
    6: "red",
    7: "brown",
}

# Draw the compounds cluster by cluster: one marker-only line per cluster
# instead of one plot call (and one artist) per compound.
for cluster_id, members in descriptors_pca.groupby("Cluster_PC1_PC2"):
    ax.plot(
        members["PC1_normalized"],
        members["PC2_normalized"],
        linestyle="none",
        c=color_code[int(cluster_id)],
        marker="o",
        markersize=8,
        markeredgecolor="k",
        alpha=0.3,
    )

# Axis labels are placed near the ends of the axes, which cross at the origin.
ax.set_xlabel("PC1", fontsize=14, fontweight="bold")
ax.xaxis.set_label_coords(0.98, 0.45)
ax.set_ylabel("PC2", fontsize=14, fontweight="bold")
ax.yaxis.set_label_coords(0.45, 0.98)
ax.tick_params("both", width=2, labelsize=12)
ax.spines["left"].set_position(("data", 0))
ax.spines["bottom"].set_position(("data", 0))
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

feature_labels = ["MolWt", "TPSA", "nRotB", "HBD", "HBA", "LogP"]  # Feature labels

# Transposing the first two rows of pca.components_ gives one row per feature,
# holding that feature's coefficients for PC1 and PC2 (the eigenvector entries).
eigenvectors = np.transpose(pca.components_[0:2, :])

for i, (dx, dy) in enumerate(eigenvectors):
    # Arrow from the origin for this feature, with its name slightly beyond the tip
    ax.arrow(0, 0, dx, dy, color="k", alpha=0.6, linewidth=1.2, head_width=0.025)
    ax.text(
        dx * 1.25,
        dy * 1.25,
        feature_labels[i],
        color="k",
        va="center",
        ha="center",
        fontsize=11,
    )

# Dashed unit circle around the origin as a visual reference
circle = plt.Circle(
    (0, 0), 1, color="gray", fill=False, clip_on=True, linewidth=1.5, linestyle="--"
)
ax.add_artist(circle)
ax.set_xlim(-1.2, 1.2)
ax.set_ylim(-1.2, 1.2)
fig.tight_layout()
plt.show()

As a result, we can identify the features that correlate positively and negatively with PC1 and with PC2. Additionally, we can identify the "most important" feature (descriptor) from the length of its vector. We can also see the different clusters identified by k-means clustering (the silhouette analysis above was used to assess the number of clusters).

Finally, we can merge our tables to keep the data in a single table or file.

In [None]:
# Merge the PCA and cluster columns into the compound table; both frames share
# the same row index. Columns left over from a previous run of this cell are
# dropped first, which keeps the cell re-runnable: a plain join would raise on
# the overlapping column names the second time.
table = table.drop(columns=descriptors_pca.columns, errors="ignore").join(
    descriptors_pca
)

table.head(10)

Saving values from a pandas table to a .csv file is very easy. You just need to type:

In [None]:
# Write the merged table to a CSV file in the current working directory.
output_csv = "UnsupervisedML.csv"
table.to_csv(output_csv)