# Explore chemical space generated by each representation

In [44]:
from deepmol.compound_featurization import NPClassifierFP
from deepmol.loaders import CSVLoader

# Build the loader first, then materialise the dataset from it.
# Molecule structures are read from the "SMILES" column of the CSV.
csv_loader = CSVLoader(
    smiles_field="SMILES",
    dataset_path="../test_dataset.csv",
)
dataset = csv_loader.create_dataset()


In [45]:
from deepmol.compound_featurization import NPClassifierFP

# Featurize the loaded dataset with a default-constructed NPClassifierFP.
# NOTE: `dataset` is rebound to the return value of `featurize`, so re-running this
# cell passes the already-featurized object back into the featurizer.
dataset = NPClassifierFP().featurize(dataset)

NPClassifierFP: 100%|██████████| 15541/15541 [00:50<00:00, 309.42it/s]


In [47]:
# Sanity check: report the shapes of the molecules, features and labels held by `dataset`.
dataset.get_shape()

2025-06-06 17:03:53,800 — INFO — Mols_shape: (15541,)
2025-06-06 17:03:53,802 — INFO — Features_shape: (15541, 6144)
2025-06-06 17:03:53,802 — INFO — Labels_shape: None


((15541,), (15541, 6144), None)

In [48]:
# Compute a 2-D t-SNE embedding of the feature matrix.
# No figure is drawn in this cell; only `X_embedded` is produced.
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# Feature matrix of the featurized dataset.
X = dataset.X
# random_state is pinned so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X)


In [50]:
import pandas as pd

# Read the same CSV again, this time as a plain DataFrame, so the per-class
# indicator columns are available next to `key` and `SMILES`.
pandas_dataset = pd.read_csv("../test_dataset.csv")
# Display the frame.
pandas_dataset

Unnamed: 0,key,SMILES,Alkaloids,Amino acids and Peptides,Carbohydrates,Fatty acids,Polyketides,Shikimates and Phenylpropanoids,Terpenoids,Alkylresorsinols,...,Vitamin D3 and derivatives,Wax diesters,Wax monoesters,Xeniaphyllane diterpenoids,Xenicane diterpenoids,Yohimbine-like alkaloids,Zearalenones,Zizaane sesquiterpenoids,m-Terphenyls,p-Terphenyls
0,XFBQQSLGCORCGH,CCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@@H](COC(=O)CC...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,UBJDLKGJDZONCO,COc1cc(C2Oc3cc4c(cc3C2COC(C)=O)-c2c(cc(O)cc2OC...,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,WCXMGPNELNHROM,COc1cc2c(cc1O)OC(c1ccccc1)CC2=O,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,XRBNWTBBMTUZLA,COc1cccc(CCOC2OC(CO)C(OC(=O)C=Cc3ccc(O)c(O)c3)...,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WDDXJPVDQFRGNC,CC(C)c1c(O)ccc2c1C(=O)CC1C(C)(C(=O)O)CCCC21C,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15536,COVDAEBDAVGGFK,CC(C)=CCC/C(C)=C/C/C=C(\C)[C@H](O)Cc1c[nH]c([N...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15537,TXTGITRXQUOAJM,COC1Oc2ccccc2-c2c1oc1cc(O)c(C)c(O)c1c2=O,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15538,VUAIYBIBQXIJKV,Cc1c2c(cc3c1CC(C)(C)C3=O)C(=O)OCC2,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15539,HFPLCFMATPQWMD,O=C(O)CC(Cl)Cl,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Helper used to build the 'Assigned_Class' column: picks one class name per row.
def get_assigned_classes(row):
    """Return the name of the first column in ``row`` whose value equals 1.

    Parameters
    ----------
    row : pandas.Series
        One row of the label table, indexed by column name (the object that
        ``DataFrame.apply(..., axis=1)`` passes to its callback).

    Returns
    -------
    object or None
        The label of the first matching column, in column order. Only the
        first match is returned even when several columns are set to 1.
        ``None`` is returned when no column equals 1, instead of raising an
        ``IndexError``.
    """
    # Iterate over the row itself rather than a DataFrame looked up from the
    # notebook's global namespace, so the helper works on any row it is given.
    return next((column for column, value in row.items() if value == 1), None)

# Apply the helper row by row (axis=1) and store the result in a new
# 'Assigned_Class' column. This modifies `pandas_dataset` in place.
pandas_dataset['Assigned_Class'] = pandas_dataset.apply(get_assigned_classes, axis=1)
# Display the newly created column.
pandas_dataset['Assigned_Class']

0                            Fatty acids
1        Shikimates and Phenylpropanoids
2        Shikimates and Phenylpropanoids
3        Shikimates and Phenylpropanoids
4                             Terpenoids
                      ...               
15536                         Terpenoids
15537    Shikimates and Phenylpropanoids
15538                         Terpenoids
15539                        Fatty acids
15540                         Terpenoids
Name: Assigned_Class, Length: 15541, dtype: object

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from deepmol.loaders import CSVLoader
from sklearn.manifold import TSNE
import pandas as pd

def _first_positive_column(frame):
    """Return, per row of ``frame``, the name of the first column equal to 1 (``None`` if there is none)."""
    is_positive = frame.eq(1).to_numpy(dtype=bool)
    has_label = is_positive.any(axis=1)
    # argmax yields the position of the first True in each row; rows without
    # any True would report position 0, so they are masked out with has_label.
    first_hit = is_positive.argmax(axis=1)
    column_names = frame.columns.to_numpy()[first_hit]
    return np.where(has_label, column_names, None)


def _class_colors(classes, opacity=0.5):
    """Map each class to an ``(r, g, b, a)`` tuple with channels in ``[0, 1]``.

    The nine-colour palette is reused cyclically when there are more classes
    than colours, so colours repeat beyond nine classes.
    """
    palette_rgb = [
        (55, 126, 184),   # blue
        (255, 127, 0),    # orange
        (77, 175, 74),    # green
        (247, 129, 191),  # pink
        (166, 86, 40),    # brown
        (152, 78, 163),   # purple
        (153, 153, 153),  # gray
        (228, 26, 28),    # red
        (222, 222, 0),    # yellow
    ]
    return {
        cls: tuple(channel / 255 for channel in palette_rgb[i % len(palette_rgb)]) + (opacity,)
        for i, cls in enumerate(classes)
    }


def generate_tsne_with_featurizer(featurizer, name=None,
                                  dataset_path="../test_dataset.csv",
                                  smiles_field="SMILES"):
    """Featurize the dataset, embed it with t-SNE and save a class-coloured scatter plot.

    Two PNG files are written to the current working directory:
    ``tsne_plot_{name}.png`` (the scatter plot) and ``tsne_legend_{name}.png``
    (a stand-alone legend).

    Parameters
    ----------
    featurizer : object
        Object exposing ``featurize(dataset)``; the returned dataset must
        provide the feature matrix as ``.X``.
    name : str, optional
        Label used in the plot title and in both output file names. Defaults
        to the featurizer's class name.
    dataset_path : str, optional
        CSV file holding the structures and the class indicator columns.
    smiles_field : str, optional
        Name of the CSV column containing the SMILES strings.

    Raises
    ------
    ValueError
        If the number of embedded points differs from the number of CSV rows,
        in which case points and class labels could not be matched by position.
    """
    import warnings

    if not name:
        name = featurizer.__class__.__name__

    dataset = CSVLoader(dataset_path=dataset_path,
                        smiles_field=smiles_field).create_dataset()
    dataset = featurizer.featurize(dataset)

    # Fixed seed so repeated runs use the same random state.
    X_embedded = TSNE(n_components=2, random_state=42).fit_transform(dataset.X)

    # The class indicator columns are read separately with pandas; rows are
    # matched to embedded points purely by position.
    pandas_dataset = pd.read_csv(dataset_path)
    if len(pandas_dataset) != len(X_embedded):
        raise ValueError(
            f"Embedding has {len(X_embedded)} points but {dataset_path} has "
            f"{len(pandas_dataset)} rows; class labels cannot be aligned by position."
        )

    assigned_class = _first_positive_column(pandas_dataset)
    is_labelled = pd.notna(assigned_class)
    n_unlabelled = int((~is_labelled).sum())
    if n_unlabelled:
        warnings.warn(
            f"{n_unlabelled} rows of {dataset_path} have no column equal to 1 "
            "and are omitted from the plot."
        )

    # Order of first appearance, so colours are assigned deterministically.
    unique_classes = pd.unique(assigned_class[is_labelled])
    class_colors = _class_colors(unique_classes, opacity=0.5)

    # Scatter plot of the embedding, one colour per class.
    fig, ax = plt.subplots(figsize=(10, 8))
    for cls in unique_classes:
        mask = assigned_class == cls
        ax.scatter(X_embedded[mask, 0], X_embedded[mask, 1],
                   color=class_colors[cls], label=cls, s=5)

    ax.set_title(f't-SNE plot of {name} features')
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(True)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

    fig.savefig(f"tsne_plot_{name}.png", bbox_inches='tight', dpi=300)
    plt.close(fig)

    # Separate figure that holds only the legend; markers are drawn opaque
    # (RGB without the alpha channel) so they stay readable.
    legend_fig = plt.figure(figsize=(10, 0.5))
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=cls,
                   markerfacecolor=class_colors[cls][:3], markersize=7)
        for cls in unique_classes
    ]
    legend_fig.legend(handles=legend_handles, loc='center', ncol=1, title="Assigned Class")

    legend_fig.savefig(f"tsne_legend_{name}.png", bbox_inches='tight', dpi=300)
    plt.close(legend_fig)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from deepmol.compound_featurization import NPClassifierFP

# Default-constructed NPClassifierFP featurizer.
featurizer = NPClassifierFP()
# No explicit name is passed, so the outputs are labelled with the featurizer's
# class name ("NPClassifierFP").
generate_tsne_with_featurizer(featurizer)

NPClassifierFP: 100%|██████████| 15541/15541 [00:45<00:00, 339.77it/s]


In [10]:
from deepmol.compound_featurization import NeuralNPFP

# Default-constructed NeuralNPFP featurizer; rebinds the notebook-level `featurizer`.
featurizer = NeuralNPFP()
# No explicit name is passed, so the outputs are labelled with the featurizer's
# class name ("NeuralNPFP").
generate_tsne_with_featurizer(featurizer)

NeuralNPFP: 100%|██████████| 15541/15541 [02:26<00:00, 106.34it/s]


In [11]:
from deepmol.compound_featurization import BiosynfoniKeys

# Default-constructed BiosynfoniKeys featurizer; rebinds the notebook-level `featurizer`.
featurizer = BiosynfoniKeys()
# No explicit name is passed, so the outputs are labelled with the featurizer's
# class name ("BiosynfoniKeys").
generate_tsne_with_featurizer(featurizer)

BiosynfoniKeys: 100%|██████████| 15541/15541 [00:55<00:00, 281.23it/s]


In [12]:
import os
from deepmol.compound_featurization import LLM
from transformers import BertConfig, BertModel

from deepmol.standardizer import ChEMBLStandardizer

from deepmol.tokenizers import NPBERTTokenizer

# Single definition of the NPBERT directory, used for both the model and its vocab file.
npbert_dir = "../../NPBERT"
npbert_tokenizer = NPBERTTokenizer(vocab_file=os.path.join(npbert_dir, "vocab.txt"))

# BERT model wrapped in the LLM featurizer; requires the "cuda:0" device to be available.
transformer = LLM(model_path=npbert_dir, model=BertModel, config_class=BertConfig,
                  tokenizer=npbert_tokenizer, device="cuda:0")

# Pass an explicit name: the class-name fallback would title the plot and name the
# output files "LLM", which does not identify the model behind this featurizer.
generate_tsne_with_featurizer(transformer, name="NPBERT")

Some weights of BertModel were not initialized from the model checkpoint at ../../NPBERT/model.pt and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 15541/15541 [03:57<00:00, 65.43it/s]


In [2]:
import os
from deepmol.compound_featurization import LLM

from deepmol.compound_featurization import LLM
from transformers import ModernBertModel, ModernBertConfig

# ModernBERT model wrapped in the LLM featurizer, loaded from "../../ModernBERT"
# (presumably a local checkpoint directory; confirm) and placed on device "cuda:1".
transformer = LLM(model_path="../../ModernBERT", model=ModernBertModel, config_class=ModernBertConfig, device="cuda:1")

# An explicit name is passed because the class-name fallback would label the outputs "LLM".
generate_tsne_with_featurizer(transformer, name="ModernBERT")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2025-06-09 16:01:07.458643: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-09 16:01:07.458741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-09 16:01:07.461140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-09 16:01:07.476030: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild Tens

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
100%|██████████| 15541/15541 [36:36<00:00,  7.08it/s]
