In [None]:
import base64
import warnings
from io import BytesIO

import numpy as np
import pandas as pd
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Draw
from sklearn.preprocessing import StandardScaler
from umap import UMAP

In [None]:
# Generate molecule images as base64 encoded strings
def mol_to_base64(mol, size=(300, 300)):
    """Render an RDKit molecule as a PNG and return it as a base64 data URI.

    The returned string starts with ``data:image/png;base64,`` so it can be
    used directly as the ``src`` attribute of an HTML ``<img>`` tag.

    Parameters:
    -----------
    mol : RDKit molecule
        Molecule passed to ``Draw.MolToImage``
    size : tuple of int, optional (default=(300, 300))
        Image size forwarded to ``Draw.MolToImage``

    Returns:
    --------
    str: PNG image encoded as a ``data:image/png;base64,...`` URI
    """
    png_stream = BytesIO()
    Draw.MolToImage(mol, size=size).save(png_stream, format="png")
    encoded_png = base64.b64encode(png_stream.getvalue()).decode()
    return 'data:image/png;base64,' + encoded_png

In [None]:
def _featurize_molecules(molecules, featurization_method):
    """Build a 2D float feature matrix (one row per molecule) for the chosen method."""
    if featurization_method == 'morgan':
        # Morgan fingerprints (extended connectivity fingerprints): radius 2, 2048 bits
        fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
                        for mol in molecules]
        return np.array([list(fp.ToBitString()) for fp in fingerprints], dtype=float)

    if featurization_method == 'descriptors':
        # CalcMolDescriptors lives in rdkit.Chem.Descriptors (not AllChem) and
        # returns a {descriptor_name: value} dict, so only the values are kept.
        features = np.array(
            [list(Descriptors.CalcMolDescriptors(mol).values()) for mol in molecules],
            dtype=float,
        )
        # Descriptors that could not be computed (or overflowed) show up as
        # NaN/inf; such columns cannot be scaled and embedded, so drop them.
        finite_columns = np.isfinite(features).all(axis=0)
        return features[:, finite_columns]

    raise ValueError("Invalid featurization method")


def create_interactive_molecular_map(smiles, featurization_method='morgan', width=800, height=600):
    """
    Create an interactive 2D scatter plot of molecules in chemical space using Bokeh.

    Every parseable SMILES string is featurized, standardized, embedded in 2D
    with UMAP and drawn as a point whose hover tooltip shows the SMILES string
    and a rendering of the structure. The figure is displayed with
    ``bokeh.plotting.show``.

    Parameters:
    -----------
    smiles : iterable of SMILES strings
        Molecular structures to visualize. Entries that are not strings or
        that RDKit cannot parse are skipped with a warning.
    featurization_method : str, optional (default='morgan')
        Method to convert molecules to feature vectors: 'morgan'
        (2048-bit Morgan fingerprints, radius 2) or 'descriptors'
        (RDKit molecular descriptors).
    width : int, optional (default=800)
        Width passed to the Bokeh figure
    height : int, optional (default=600)
        Height passed to the Bokeh figure

    Returns:
    --------
    None: the plot is rendered as a side effect

    Raises:
    -------
    ValueError
        If `featurization_method` is unknown or no entry of `smiles` could be
        parsed into a molecule.
    """
    # Fail on a bad method name before doing any parsing work
    if featurization_method not in ('morgan', 'descriptors'):
        raise ValueError("Invalid featurization method")

    # Parse SMILES, keeping the strings and molecules aligned so that every
    # plotted point carries the right label and image.
    valid_smiles, molecules, n_skipped = [], [], 0
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            n_skipped += 1
            continue
        valid_smiles.append(smi)
        molecules.append(mol)

    if not molecules:
        raise ValueError("No valid molecules to visualize")
    if n_skipped:
        warnings.warn(
            f"Skipped {n_skipped} entries that could not be parsed as SMILES",
            stacklevel=2,
        )

    # Featurize molecules
    features = _featurize_molecules(molecules, featurization_method)

    # Standardize features
    features_scaled = StandardScaler().fit_transform(features)

    # Dimensionality reduction using UMAP
    reducer = UMAP(
        n_components=2,  # 2D visualization
        n_neighbors=15,  # Controls local vs global structure
        min_dist=0.1,    # Controls how tightly points are packed
        random_state=42  # For reproducibility
    )
    embedding = reducer.fit_transform(features_scaled)

    # Prepare data for Bokeh: one row per successfully parsed molecule
    datasource = ColumnDataSource(data=dict(
        x=embedding[:, 0],
        y=embedding[:, 1],
        smiles=valid_smiles,
        images=[mol_to_base64(mol) for mol in molecules],
    ))

    # Create Bokeh figure
    plot_figure = figure(
        title='Interactive Molecular Structures in Chemical Space',
        x_axis_label='UMAP Dimension 1',
        y_axis_label='UMAP Dimension 2',
        width=width,
        height=height,
        tools="pan, wheel_zoom, box_zoom, reset, save"
    )

    # Add hover tool to the plot; `@images` already holds a complete data URI
    plot_figure.add_tools(HoverTool(
        mode='mouse',  # display info for closest point
        tooltips="""
        <div>
            <div>
                <span style='font-size: 16px; color: #224499'>SMILES:</span>
                <span style='font-size: 18px'>@smiles</span>
            </div>
            <div>
                <img src='@images' style='float: left; margin: 5px 5px 5px 5px'/>
            </div>
        </div>
        """
    ))

    # `scatter` draws circle markers by default; `circle(size=...)` is
    # deprecated in recent Bokeh releases.
    plot_figure.scatter(
        'x', 'y',
        size=10,
        color='navy',
        alpha=0.5,
        source=datasource
    )

    show(plot_figure)

# Evaluate Starting Molecules

In [None]:
# Starting molecules.
# NOTE(review): `smiles_df` is not assigned in any visible cell — the loader
# below is commented out and its path is a placeholder — so this cell raises
# NameError on a fresh kernel until the CSV is actually loaded.
# smiles_df = pd.read_csv("./.......csv")
start_smiles = smiles_df["SMILES"].tolist()
print(len(start_smiles))

In [None]:
# Molecules to plot. To try the map on a few well-known drugs instead,
# replace `start_smiles` with a literal list such as:
# smiles_list = [
#     'CC(=O)OC1=CC=CC=C1C(=O)O',   # Aspirin
#     'CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO',  # Salbutamol
#     'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
#     'CC12CCC3C(C1CCC2O)CCC4=C3C=CC(=C4)O',  # Estradiol
#     'C1=CC=C(C=C1)C(=O)CCCN2CCC(CC2)OC3=CC=C(C=C3)F'  # Haloperidol
# ]
smiles_list = start_smiles

# Render Bokeh plots inline in the Jupyter notebook
output_notebook()

In [None]:
# Chemical-space map of the starting molecules (Morgan fingerprints)
create_interactive_molecular_map(smiles=smiles_list, featurization_method="morgan")

# Evaluate Generated Molecules

In [None]:
# Generated molecules.
# NOTE(review): `molgen_results_df` is not assigned in any visible cell — the
# loader below is commented out and its path is a placeholder — so this cell
# raises NameError on a fresh kernel until the CSV is actually loaded.
# molgen_results_df = pd.read_csv("./......csv")
# The column is kept as a pandas Series here, whereas the starting-molecule
# cell converts its column to a plain list.
molgen_smiles = molgen_results_df["SMILES"]

# Sanity check: number of generated SMILES entries
print(len(molgen_smiles))

In [None]:
# Chemical-space map of the generated molecules (default Morgan featurization)
create_interactive_molecular_map(smiles=molgen_smiles, featurization_method="morgan")