## PRIME Virtual Library Decomposition
Task: Decompose PRIME VL along with other libraries (e.g. Enamine) to compare chemical space coverage 

Approach:
- Load precomputed Morgan fingerprints (radius 3, 1024 bits)
- Project them to two dimensions with UMAP

In [None]:
import random
import pathlib

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
%matplotlib inline


In [None]:
# Notebook-wide matplotlib style: 6 pt sans-serif text, black labels, only the
# bottom spine visible, and export settings for transparent, editable figures.
plt.rcParams.update(
    {
        # --- Export ---
        "savefig.transparent": True,  # saved figures get a transparent background
        "svg.fonttype": "none",  # keep SVG text as text so it stays editable
        # --- Axes frame ---
        "axes.grid": False,  # no grid lines
        "axes.spines.bottom": True,  # only the bottom spine is drawn ...
        "axes.spines.left": False,  # ... left,
        "axes.spines.right": False,  # ... right,
        "axes.spines.top": False,  # ... and top spines are hidden
        "axes.xmargin": 0.05,  # margin added around the data on the x-axis
        # --- Fonts ---
        "font.family": "sans-serif",
        "font.sans-serif": ["Helvetica", "Arial"],  # preferred fonts, in order
        "font.size": 6,  # base font size for the whole figure
        "axes.labelsize": 6,  # axis labels
        "axes.titlesize": 6,  # axes titles
        "xtick.labelsize": 6,  # x tick labels
        "ytick.labelsize": 6,  # y tick labels
        "legend.fontsize": 6,  # legend entries
        "axes.labelweight": "bold",  # bold axis labels
        # --- Colors ---
        "text.color": "black",  # default text color
        "axes.labelcolor": "black",  # axis labels
        "xtick.color": "black",  # x-axis ticks
        "ytick.color": "black",  # y-axis ticks
        # --- Spacing ---
        "axes.labelpad": 2.5,  # distance between axis and its label
        "xtick.major.pad": 0.0,  # distance between major x ticks and their labels
        "xtick.minor.pad": 0.0,  # distance between minor x ticks and their labels
        "ytick.major.pad": 0.0,  # distance between major y ticks and their labels
        "ytick.minor.pad": 0.0,  # distance between minor y ticks and their labels
    }
)

### Preprocessing

Import molecules from virtual library files

In [None]:
# Read the SynFerm table; its "MorganFP" column holds one fingerprint per row.
df = pd.read_csv("../data/dataS4_with_fps.csv.bz2")
# Expand every fingerprint element by element into integers, giving a 2D array
# with one row per molecule (presumably one column per fingerprint bit).
fps_synferm = np.array([list(map(int, fingerprint)) for fingerprint in df["MorganFP"]])
fps_synferm.shape

In [None]:
# Read the Enamine Hit Locator Library table; "MorganFP" holds one fingerprint per row.
df_enamine = pd.read_csv("../data/Enamine_Hit_Locator_Library_HLL-460_460160cmpds_20220221_with_fps.csv")
# Expand every fingerprint element by element into integers, giving a 2D array
# with one row per molecule (presumably one column per fingerprint bit).
fps_enamine = np.array([list(map(int, fingerprint)) for fingerprint in df_enamine["MorganFP"]])
fps_enamine.shape

In [None]:
# Read the Enamine FDA table; "MorganFP" holds one fingerprint per row.
df_fda = pd.read_csv("../data/Enamine_FDA_with_fps.csv.bz2")
# Expand every fingerprint element by element into integers, giving a 2D array
# with one row per molecule (presumably one column per fingerprint bit).
fps_fda = np.array([list(map(int, fingerprint)) for fingerprint in df_fda["MorganFP"]])
fps_fda.shape

In [None]:
# Read the ChEMBL random-subset table; "MorganFP" holds one fingerprint per row.
df_chembl = pd.read_csv("../data/chembl_34_50k-random_with_FP.csv")
# Expand every fingerprint element by element into integers, giving a 2D array
# with one row per molecule (presumably one column per fingerprint bit).
fps_chembl = np.array([list(map(int, fingerprint)) for fingerprint in df_chembl["MorganFP"]])
fps_chembl.shape

In [None]:
# Draw a seeded random subset of 50000 row indices for each of the SynFerm and
# Enamine fingerprint arrays. Only the SynFerm subset is stacked below, followed
# by the complete FDA array; the Enamine and ChEMBL entries are commented out.
random.seed(999)
random_nrs_synferm = random.sample(range(fps_synferm.shape[0]), 50000)
random_nrs_enamine = random.sample(range(fps_enamine.shape[0]), 50000)
random_fps = np.concatenate(
    (
        fps_synferm[random_nrs_synferm, :],
        # fps_enamine[random_nrs_enamine, :],
        # fps_chembl,
        fps_fda,
    ),
    axis=0,
)
random_fps.shape

### UMAP

Dimensionality reduction with UMAP

In [None]:
# Two-dimensional UMAP on the fingerprint rows, using the Jaccard metric.
# random_state fixes UMAP's random initialisation and optimisation so the
# embedding, and the figure saved from it, is the same on every run. 999
# matches the seed used for the row sampling. umap-learn gives up multi-threaded
# execution when a seed is set, so fitting is slower but repeatable.
reducer = umap.UMAP(
    n_neighbors=300,
    min_dist=0.5,
    n_components=2,
    metric='jaccard',
    random_state=999,
)

In [None]:
# Fit UMAP on the stacked fingerprints and keep the resulting coordinates;
# the cell then displays the shape of the embedding.
embedding = reducer.fit_transform(random_fps)
embedding.shape

Show UMAP embedding

In [None]:
# Colour definitions used by the plots.
palette = ["#5790fc", "#f89c20", "#e42536", "#a1212c"]  # works for colorblind

bode_colors = ['#161638', '#923c10', '#000000']
bode_colors_light = ['#e3e3f0', '#f0f0db']
bode_blues = ['#161638', '#4a4a68', '#7d7e96', '#b0b0be', '#e3e3f0']
bode_oranges = ['#923c10', '#ae5526', '#bb754d', '#c99470', '#e2d1b6', '#f0f0db']

# One colour per embedded row. The rows are the SynFerm sample followed by the
# complete FDA set, so the list is built from those two lengths and uses the
# same palette entries as the scatter plot (palette[0] SynFerm, palette[3] FDA).
# Enamine and ChEMBL are not part of the embedding and get no entries.
colors = (
    [palette[0]] * len(random_nrs_synferm)
    + [palette[3]] * len(fps_fda)
)

In [None]:
# Row offsets of each library inside `embedding`. They follow the order in
# which the fingerprint arrays were stacked into `random_fps`: the SynFerm
# sample first, then the FDA set. Enamine and ChEMBL are not part of the
# embedding, so their segments are empty and all three offsets coincide.
idx_enamine_start = len(random_nrs_synferm)
idx_chembl_start = idx_enamine_start  # add len(random_nrs_enamine) once Enamine is embedded
idx_fda_start = idx_chembl_start  # add len(fps_chembl) once ChEMBL is embedded
# Guard against a stale embedding or a changed concatenation, either of which
# would silently attach the wrong label to the points.
assert embedding.shape[0] == idx_fda_start + len(fps_fda), (
    "embedding rows do not match the SynFerm sample followed by the FDA set"
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    embedding[0:idx_enamine_start, 0],
    embedding[0:idx_enamine_start, 1],
    c=palette[0],
    label="SynFerm",
    s=1,
    marker=".",
)
ax.scatter(
    embedding[idx_fda_start:, 0],
    embedding[idx_fda_start:, 1],
    c=palette[3],
    label="FDA",
    s=1,
    marker=".",
)
# The UMAP axes carry no units, so drop the frame and the ticks.
ax.set_aspect('auto', 'datalim')
ax.set_frame_on(False)
ax.set_xticks([])
ax.set_yticks([])
legend = ax.legend(bbox_to_anchor=(1, 1), fontsize=8)
# Enlarge the markers in the legend only. `Legend.legendHandles` was deprecated
# in matplotlib 3.7 in favour of `legend_handles` and removed in later releases,
# so try the new name first and fall back for older versions.
legend_handles = getattr(legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = legend.legendHandles
for handle in legend_handles:
    handle.set_sizes([8])
fig.tight_layout()

# The output folder and file name encode the data set sizes and UMAP settings.
save_path = pathlib.Path(f"../results/2024-04-18/{len(random_nrs_synferm)}synferm_{len(fps_fda)}fda/UMAP_{reducer.n_neighbors}_{reducer.min_dist}_{reducer.n_components}_{reducer.metric}.png")
# parents=True because the dated results folder may not exist yet; without it
# mkdir raises FileNotFoundError on a fresh checkout.
save_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(save_path, dpi=300)
fig.savefig(save_path.with_suffix(".svg"))