In [6]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator, GetAtomPairGenerator
from rdkit.Chem import MACCSkeys

import umap
import base64
from io import BytesIO

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the compound table (compound name, SMILES, logP, molecular weight).
# NOTE(review): the "Unnamed: 0" column in the loaded frame looks like a saved
# row index -- confirm, and consider index_col=0 if nothing relies on it.
df = pd.read_csv("SMILES_values.csv")

In [8]:
# Display the freshly loaded compound table as the cell output.
df

Unnamed: 0,Compounds,SMILES,logP,Molecular Weight
0,Compound 1,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)[C@@H...,3.7509,292.463
1,Compound 2,[H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC[C...,4.5153,316.485
2,Compound 3,[H][C@@]12CC[C@]([H])([C@H](C)CC[C@H](O)C(C)C)...,6.2796,400.647
3,Compound 4,*C1CCC2C3CCC4CCCCC4(C)C3[C@@H](O)CC12C,4.7278,275.456
4,Compound 5,CC(C)CC[C@H](O)C(C)C1CCC2C3CCC4CCCCC4(C)C3CCC21C,7.4686,388.680
...,...,...,...,...
795,Compound 796,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@@]4(C)[C@@...,3.6180,578.787
796,Compound 797,[H][C@@]12CCC3C[C@H](O)CC[C@]3(C)[C@@]1([H])CC...,3.7509,292.463
797,Compound 798,[H][C@@]12CC[C@]([H])(C(=O)C=O)[C@@]1(C)CC(=O)...,3.0815,342.435
798,Compound 799,[H][C@@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)C(=O...,3.3463,302.414


In [9]:
# Build the fingerprint generators once so they can be reused for every molecule.
FP_SIZE = 1024  # bit length requested from both hashed fingerprint generators
morgan_gen = GetMorganGenerator(radius=2, fpSize=FP_SIZE)
atom_pair_gen = GetAtomPairGenerator(fpSize=FP_SIZE)

# Function to compute molecular fingerprints
def compute_fingerprints(smiles):
    """Compute Morgan, MACCS-keys and atom-pair fingerprints for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    tuple
        ``(morgan, maccs, atom_pair)`` as NumPy arrays, or
        ``(None, None, None)`` when ``smiles`` is not a string (e.g. the float
        NaN pandas produces for a blank CSV cell) or RDKit cannot parse it.
    """
    # Guard before calling RDKit: MolFromSmiles raises on non-string input
    # instead of returning None, which would abort the whole .apply() pass.
    if not isinstance(smiles, str):
        return None, None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: keep the row, but mark every fingerprint missing.
        return None, None, None

    # Morgan Fingerprint
    morgan_fp_array = np.array(morgan_gen.GetFingerprint(mol))

    # MACCS Keys Fingerprint
    maccs_fp_array = np.array(MACCSkeys.GenMACCSKeys(mol))

    # Atom-Pair Fingerprint
    atom_pair_fp_array = np.array(atom_pair_gen.GetFingerprint(mol))

    return morgan_fp_array, maccs_fp_array, atom_pair_fp_array

# Fingerprint every SMILES string, then regroup the per-molecule triples into
# one tuple per fingerprint type so each can be stored as its own column.
per_molecule_fps = [compute_fingerprints(smi) for smi in df["SMILES"]]
morgan_column, maccs_column, atom_pair_column = zip(*per_molecule_fps)
df["Morgan_FP"] = morgan_column
df["MACCS_FP"] = maccs_column
df["AtomPair_FP"] = atom_pair_column

# Optional: persist the fingerprint table instead of only previewing it.
# df.to_csv("steroid_fingerprints.csv", index=False)
df.head()

Unnamed: 0,Compounds,SMILES,logP,Molecular Weight,Morgan_FP,MACCS_FP,AtomPair_FP
0,Compound 1,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)[C@@H...,3.7509,292.463,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Compound 2,[H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC[C...,4.5153,316.485,"[0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Compound 3,[H][C@@]12CC[C@]([H])([C@H](C)CC[C@H](O)C(C)C)...,6.2796,400.647,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, ..."
3,Compound 4,*C1CCC2C3CCC4CCCCC4(C)C3[C@@H](O)CC12C,4.7278,275.456,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Compound 5,CC(C)CC[C@H](O)C(C)C1CCC2C3CCC4CCCCC4(C)C3CCC21C,7.4686,388.68,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."


In [11]:
# Stack the Morgan fingerprints into a 2-D matrix (one row per molecule).
# Rows whose SMILES could not be parsed carry None instead of an array; they
# are skipped here because mixing None with arrays would not form a rectangular
# numeric matrix for UMAP.
has_fingerprint = df["Morgan_FP"].notna()
fingerprints = df.loc[has_fingerprint, "Morgan_FP"].tolist()  # Using Morgan Fingerprints
fingerprints_np = np.array(fingerprints)

# UMAP is stochastic; a fixed random_state makes the embedding reproducible
# across kernel restarts.
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.3, metric="euclidean", random_state=42)
embedding = umap_model.fit_transform(fingerprints_np)

# Add UMAP coordinates to the dataframe; molecules without a fingerprint keep NaN.
df["UMAP_1"] = np.nan
df["UMAP_2"] = np.nan
df.loc[has_fingerprint, "UMAP_1"] = embedding[:, 0]
df.loc[has_fingerprint, "UMAP_2"] = embedding[:, 1]



In [12]:
# Display the table, now including the fingerprint and UMAP coordinate columns.
df

Unnamed: 0,Compounds,SMILES,logP,Molecular Weight,Morgan_FP,MACCS_FP,AtomPair_FP,UMAP_1,UMAP_2
0,Compound 1,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)[C@@H...,3.7509,292.463,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-5.742396,11.522367
1,Compound 2,[H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC[C...,4.5153,316.485,"[0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-5.823915,5.520349
2,Compound 3,[H][C@@]12CC[C@]([H])([C@H](C)CC[C@H](O)C(C)C)...,6.2796,400.647,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, ...",5.748560,-3.768179
3,Compound 4,*C1CCC2C3CCC4CCCCC4(C)C3[C@@H](O)CC12C,4.7278,275.456,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-4.894306,12.032340
4,Compound 5,CC(C)CC[C@H](O)C(C)C1CCC2C3CCC4CCCCC4(C)C3CCC21C,7.4686,388.680,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",-4.974775,2.083085
...,...,...,...,...,...,...,...,...,...
795,Compound 796,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@@]4(C)[C@@...,3.6180,578.787,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, ...",4.534284,3.701876
796,Compound 797,[H][C@@]12CCC3C[C@H](O)CC[C@]3(C)[C@@]1([H])CC...,3.7509,292.463,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-5.786797,11.466669
797,Compound 798,[H][C@@]12CC[C@]([H])(C(=O)C=O)[C@@]1(C)CC(=O)...,3.0815,342.435,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.951260,6.038994
798,Compound 799,[H][C@@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)C(=O...,3.3463,302.414,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-6.874006,9.479789


In [13]:
# Export a copy in which every fingerprint array is stored as its full list
# literal, e.g. "[0, 1, 0, ...]". Writing the NumPy arrays directly stores their
# printed form, which NumPy abbreviates with "..." once an array exceeds 1000
# elements -- the 1024-bit fingerprints would be truncated in the CSV.
fingerprint_columns = [
    column for column in ("Morgan_FP", "MACCS_FP", "AtomPair_FP") if column in df.columns
]
df_export = df.copy()
for column in fingerprint_columns:
    # Non-array cells (e.g. None for unparseable SMILES) are passed through as-is.
    df_export[column] = df_export[column].map(
        lambda fp: str(fp.tolist()) if isinstance(fp, np.ndarray) else fp
    )
df_export.to_csv('umap_smiles.csv')