## PRIME Virtual Library Clustering
Task: Cluster the PRIME VL to select compounds to scale up for assessment of response factors.

n.b. this is a legacy notebook and was used to produce some informational materials for interim presentations.
None of this was used in the paper.

We proceed as follows:
- Import ECFP4 (generated with radius 4 and 2048 bits)
- Decompose with UMAP
- Clustering with HDBSCAN (n.b. the code cells below actually cluster with MiniBatchKMeans)


In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
%matplotlib inline


### Preprocessing

Import molecules from virtual library files

In [None]:
df = pd.read_csv("../data/prime_vl_with_ecfp.csv.bz2")

In [None]:
# spread the fingerprints into an array
fps = np.array([[int(n) for n in s] for s in df["ecfp4"]])

In [None]:
fps.shape

### UMAP

Dimensionality reduction with UMAP

In [None]:
reducer = umap.UMAP(
    n_neighbors=125,
    min_dist=0.3,
    n_components=2,
    metric='jaccard',
)

In [None]:
embedding = reducer.fit_transform(X=fps)

In [None]:
embedding.shape

Show UMAP embedding

In [None]:
# Colour palettes used for the figures in this notebook.
bode_colors = [
    '#161638',
    '#923c10',
    '#000000',
]
bode_colors_light = [
    '#e3e3f0',
    '#f0f0db',
]
bode_blues = [
    '#161638',
    '#4a4a68',
    '#7d7e96',
    '#b0b0be',
    '#e3e3f0',
]
bode_oranges = [
    '#923c10',
    '#ae5526',
    '#bb754d',
    '#c99470',
    '#e2d1b6',
    '#f0f0db',
]

# Two-entry lookup indexed by the 0/1 "synthesized" flag in the plotting cells:
# position 0 is the third blue, position 1 is the second orange.
colors = [palette[position] for palette, position in ((bode_blues, 2), (bode_oranges, 1))]

In [None]:
# Static scatter of the UMAP embedding, one dot per row of df,
# coloured through the `colors` lookup by the "synthesized" flag.
fig, ax = plt.subplots(figsize=(3, 3))
point_colors = [colors[flag] for flag in df["synthesized"]]
ax.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, s=.05)

# Bare canvas: no frame, no ticks.
ax.set_aspect('auto', 'datalim')
ax.set_frame_on(False)
ax.set(xticks=[], yticks=[])
#plt.title(f'UMAP projection of the SynFerm PRIME VL\n{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}', fontsize=12);

fig.tight_layout()
# The file name encodes the UMAP settings read back from `reducer`.
fig.savefig(f"../results/UMAP_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}.png", dpi=300)

In [None]:
# we don't need to repeat saving the pictures as long as the input stays the same

#from rdkit import Chem
#for i, row in df.iterrows():
#    Chem.Draw.MolToFile(Chem.MolFromSmiles(row['SMILES']), f"../results/mol_images/id{row['id']}.png")

In [None]:
df[["x", "y"]] = embedding

In [None]:
df['img_filename'] = df['id'].apply(lambda i: f"../results/mol_images/id{i}.png")

In [None]:
df["color"] = df["synthesized"].map({0: colors[0], 1: colors[1]})

In [None]:
df.head()

In [None]:
# Bokeh pieces for the interactive version of the UMAP plot.
# NOTE(review): CategoricalColorMapper, palettes and linear_cmap are not used in the
# cells visible in this notebook.
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import CategoricalColorMapper
from bokeh.io import output_notebook, output_file, export_png, export_svg
from bokeh import palettes
from bokeh.transform import linear_cmap
# Enable inline Bokeh output in the notebook.
output_notebook()

In [None]:
# Target HTML file for the interactive plot; the name encodes the UMAP settings.
output_file(f"../results/umap_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_2023-03-13.html")

# Hover template: shows the image referenced by the `img_filename` column
# together with the value of the `id` column.
TOOLTIPS = """
    <div>
        <div>
            <img
                src=@img_filename height="200" alt="molecule" width="200"
                style="float: left;"
                border="2"
            ></img>
        </div>
        <div>
            <span style="font-size: 10px; font-family: Helvetica, sans-serif;">@id</span>
        </div>
    </div>
"""

# Every column of df becomes available to the glyph and to the tooltip.
cds = ColumnDataSource(df)

p = figure(
    title="UMAP projection of the SynFerm PRIME VL",
    tooltips=TOOLTIPS,
    width=800,
    height=800,
)
p.circle(
    x='x',
    y='y',
    source=cds,
    size=3,
    color="color",
    fill_alpha=.6,
    line_alpha=.4,
    #line_color='line_color',
    legend_group="synthesized",
)

# Styling: no background or border fill, no axes, no grid.
p.background_fill_color = None
p.border_fill_color = None
p.xaxis.visible = False
p.yaxis.visible = False
p.grid.visible = False

# Styling: fonts for the title and the legend.
p.title.text_font = 'helvetica'
p.title.text_font_size = '20px'
p.legend.label_text_font = "helvetica"
p.legend.label_text_font_size = "18px"


show(p)

In [None]:
# Save a static PNG copy of the Bokeh figure `p`.
export_png(p, filename=f"../results/umap_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_2023-03-14.png")


In [None]:
# Switch the figure to the SVG backend before exporting it as SVG.
p.output_backend = "svg"

export_svg(p, filename=f"../results/umap_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_2023-03-14.svg")

In [None]:
scores = []

In [None]:
for k in range(2, 201):
    if k % 5 == 0:
        print(k)
    clusterer = MiniBatchKMeans(n_clusters=k, n_init="auto")
    clusterer.fit(embedding)
    scores.append(silhouette_score(embedding, clusterer.labels_, sample_size=30000))

In [None]:
# Silhouette score against n_clusters for the first 50 scanned values (k = 2..51).
plt.plot(list(range(2, 52)), scores[:50])

In [None]:
# Position 17 corresponds to n_clusters = 19 (positions are offset by 2).
scores[17]

In [None]:
# remember that index 0 means n_clusters 2.
# This is the position of the best score, not the number of clusters itself.
scores.index(max(scores))

In [None]:
# Report the five cluster counts with the highest silhouette scores.
# Positions are ranked instead of score values: looking a value up again with
# scores.index(v) returns the first match, so tied scores would print the same
# n_clusters twice.
print("Values of n_clusters with highest silhouette scores (top 5):")
ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
for position in ranked_positions[:5]:
    # position 0 in `scores` corresponds to n_clusters = 2
    print(position + 2)

In [None]:
# Cluster count used for the final fit.
# NOTE(review): presumably picked from the silhouette scan above - confirm.
n_clusters = 191
clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init="auto")

In [None]:
# Fit on the 2-D UMAP embedding (not on the raw fingerprints).
clusterer.fit(embedding)

In [None]:
# Cluster label assigned to each embedded point.
clusterer.labels_

In [None]:
# Static scatter of the UMAP embedding, coloured by the k-means cluster label.
fig, ax = plt.subplots(figsize=(3, 3))
ax.scatter(embedding[:, 0], embedding[:, 1], c=clusterer.labels_, cmap="tab20", s=.05)

# Bare canvas: no frame, no ticks.
ax.set_aspect('auto', 'datalim')
ax.set_frame_on(False)
ax.set(xticks=[], yticks=[])
#plt.title(f'UMAP projection of the SynFerm PRIME VL\n{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}', fontsize=12);

fig.tight_layout()
# The file name encodes the UMAP settings and the number of clusters.
fig.savefig(f"../results/UMAP_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_{n_clusters}clusters.png", dpi=300)

In [None]:
# we save the embedding and the labels for later use
df_results = pd.DataFrame(data=embedding, columns=["x", "y"])
df_results["labels"] = clusterer.labels_
df_cluster_centers = pd.DataFrame(clusterer.cluster_centers_, columns=["x", "y"])

In [None]:
df_results.to_csv(f"../results/UMAP_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_{n_clusters}clusters_embedding.csv")
df_cluster_centers.to_csv(f"../results/UMAP_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}_{n_clusters}clusters_centers.csv")


### Identify representative molecules to synthesize

We switch gears a bit and use the ideal clustering we obtained to identify representative molecules we want to synthesize on a bigger scale.

In [None]:
# re-load the embedding and clusters for the best clustering
n_neighbors = 125
min_dist = 0.3
n_components = 2
metric = "jaccard"
n_clusters = 15
emb = pd.read_csv(f"../results/UMAP_{n_neighbors}_{min_dist}_{n_components}_{metric}_{n_clusters}clusters_embedding.csv", index_col=0)
centers = pd.read_csv(f"../results/UMAP_{n_neighbors}_{min_dist}_{n_components}_{metric}_{n_clusters}clusters_centers.csv", index_col=0)


In [None]:
emb

In [None]:
centers

In [None]:
# Scatter of the re-loaded embedding coloured by cluster label, with the centroids on top.
fig, ax = plt.subplots(figsize=(3, 3))
ax.scatter(emb["x"], emb["y"], c=emb["labels"], cmap="tab20", s=.05)
ax.scatter(centers["x"], centers["y"], c="black", s=1)
# Optional centroid numbering (disabled for the "no_annot" figure):
#for i in range(len(centers)):
#    plt.annotate(i, (centers.at[i, "x"], centers.at[i, "y"] + 0.2))

# Bare canvas: no frame, no ticks.
ax.set_aspect('auto', 'datalim')
ax.set_frame_on(False)
ax.set(xticks=[], yticks=[])

fig.tight_layout()
fig.savefig(f"../results/UMAP_{n_neighbors}_{min_dist}_{n_components}_{metric}_{n_clusters}clusters_with_centroids_no_annot.png", dpi=300)

In [None]:
# calculate euclidian distance for all points with the cluster centroids
dist_arr = cdist(emb[["x", "y"]].values, centers[["x", "y"]].values)

In [None]:
# closest point for each cluster
closest = dist_arr.argmin(axis=0)
closest

In [None]:
closest_smiles = df["SMILES"][closest].values.tolist()

In [None]:
from rdkit.Chem import Draw
from rdkit import Chem

In [None]:
Draw.MolsToGridImage([Chem.MolFromSmiles(smi) for smi in closest_smiles], subImgSize=(400, 400), legends=[str(i) for i in range(0,16)])
                                                                                                         

In [None]:
for s in closest_smiles:
    print(s)

In [None]:
# Visualize 

In [None]:
df

In [None]:
df_synthesized = df.loc[df["synthesized"] == 1]
df_synthesized

In [None]:
idxs = df_synthesized["id"].values.tolist()
    

In [None]:
idxs

In [None]:
import sqlite3

In [None]:
con = sqlite3.connect("../data/50k_project.db")

In [None]:
cur = con.cursor()

In [None]:
res = cur.execute(f"SELECT vl_id, product_A_lcms_ratio FROM experiments").fetchall()

In [None]:
df_res =pd.DataFrame(res, columns=["id", "product_A_lcms_ratio"])

In [None]:
df_res["product_A_normalized"] = df_res["product_A_lcms_ratio"] / df_res["product_A_lcms_ratio"].max()

In [None]:
df_res["success"] = np.nan
df_res.loc[df_res["product_A_normalized"] > 5e-6, "success"] = 1.
df_res.loc[df_res["product_A_normalized"] < 1e-10, "success"] = 0.

In [None]:
df_res["success"].value_counts()

In [None]:
df_res["success"].isna().value_counts()

In [None]:
(df_res["product_A_normalized"] < 5e-6).value_counts()

In [None]:
df_all = df_synthesized.merge(df_res, how="left")

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
# plot clusters with reaction outcome
plt.figure(figsize=(3, 3))
plt.scatter(
    df_all["x"],
    df_all["y"],
    c=df_all["success"],
    cmap="Paired_r",
    s=.05,
)
 
ax = plt.gca()
ax.set_aspect('auto', 'datalim')
ax.set_frame_on(False)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
plt.savefig(f"../results/UMAP_{n_neighbors}_{min_dist}_{n_components}_{metric}_{n_clusters}clusters_with_reaction_outcome.png", dpi=300)

In [None]:
colors