In [1]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator, GetAtomPairGenerator
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

import umap
import base64
from io import BytesIO

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the source workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io='steroids_11_25_24 (1).xlsx', engine='openpyxl')
df.head(5)


Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Sequence,Annotation,source,ChEBI ID,Rhea ID,SMILES
0,A0A016VA76,A0A016VA76_9BILA,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,Acey_s0014.g2217 Acey-daf-22 Y032_0014g2217,Ancylostoma ceylanicum,531,MTKPKVFVIGVGMTKFCKPGSRDWDYPDMVKEAVTTALDDCKLKYS...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
1,A0A023EUU2,A0A023EUU2_AEDAL,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Aedes albopictus (Asian tiger mosquito) (Stego...,544,MGVPKVYVVGVGMTKFEKPGRRENFDYPQMAKEAVTKALNDARIQY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
2,A0A023EUU7,A0A023EUU7_AEDAL,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Aedes albopictus (Asian tiger mosquito) (Stego...,544,MGVPKVYVVGVGMTKFEKPGRRENFDYPQMAKEAVTKALNDARIQY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
3,A0A023FCJ4,A0A023FCJ4_TRIIF,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Triatoma infestans (Assassin bug),540,RVKVYVVGVGMTKFYKPGKSDKDYPELAKEAIMKALEDARINHDDV...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
4,A0A026WVM4,A0A026WVM4_OOCBI,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,X777_13892,Ooceraea biroi (Clonal raider ant) (Cerapachys...,541,MVYKPKVYVIGVGMTKFEKPGRRDDFDYPQMAKEAVSKALQDSRIY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...


In [3]:
# Step 1: Keep only rows that carry both a SMILES and a ChEBI ID value
df_clean = df.dropna(subset=['SMILES', 'ChEBI ID']).copy()

# Step 2: Turn each ';'-delimited cell into a list of whitespace-trimmed tokens
df_clean['SMILES'] = (
    df_clean['SMILES']
    .str.split(';')
    .map(lambda parts: list(map(str.strip, parts)))
)
df_clean['ChEBI ID'] = (
    df_clean['ChEBI ID']
    .astype(str)  # cast to text first so the .str accessor can split the values
    .str.split(';')
    .map(lambda parts: list(map(str.strip, parts)))
)
# Step 3: Truncate to match shortest length
def align_smiles_chebi(row):
    """Truncate a row's SMILES and ChEBI ID lists to a common length.

    Both lists are cut to the length of the shorter one so that they can be
    exploded together as position-wise pairs.

    Parameters
    ----------
    row : pandas.Series or dict
        Mapping that holds list values under the keys 'SMILES' and 'ChEBI ID'.
        Any other keys are carried over unchanged.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with the two lists truncated. The input is left
        untouched: pandas does not support functions that mutate the object
        passed in by ``DataFrame.apply``, so the edit is made on a copy.
    """
    smiles_list = row['SMILES']
    chebi_list = row['ChEBI ID']
    min_len = min(len(smiles_list), len(chebi_list))

    aligned = row.copy()
    aligned['SMILES'] = smiles_list[:min_len]
    aligned['ChEBI ID'] = chebi_list[:min_len]
    return aligned

# Equalise the two list columns row by row so they can be exploded in lockstep
df_aligned = df_clean.apply(align_smiles_chebi, axis=1)

# Step 4: Explode both list columns together (one row per SMILES / ChEBI ID pair),
# then discard rows whose SMILES came out missing
df_exploded = (
    df_aligned
    .explode(['SMILES', 'ChEBI ID'], ignore_index=True)
    .dropna(subset=['SMILES'])
)


In [4]:
# Preview the exploded frame: each SMILES / ChEBI ID pair now sits on its own row.
df_exploded.head()

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Sequence,Annotation,source,ChEBI ID,Rhea ID,SMILES
0,A0A016VA76,A0A016VA76_9BILA,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,Acey_s0014.g2217 Acey-daf-22 Y032_0014g2217,Ancylostoma ceylanicum,531,MTKPKVFVIGVGMTKFCKPGSRDWDYPDMVKEAVTTALDDCKLKYS...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
1,A0A023EUU2,A0A023EUU2_AEDAL,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Aedes albopictus (Asian tiger mosquito) (Stego...,544,MGVPKVYVVGVGMTKFEKPGRRENFDYPQMAKEAVTKALNDARIQY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
2,A0A023EUU7,A0A023EUU7_AEDAL,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Aedes albopictus (Asian tiger mosquito) (Stego...,544,MGVPKVYVVGVGMTKFEKPGRRENFDYPQMAKEAVTKALNDARIQY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
3,A0A023FCJ4,A0A023FCJ4_TRIIF,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,,Triatoma infestans (Assassin bug),540,RVKVYVVGVGMTKFYKPGKSDKDYPELAKEAIMKALEDARINHDDV...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...
4,A0A026WVM4,A0A026WVM4_OOCBI,Sterol carrier protein 2 (EC 2.3.1.155) (EC 2....,X777_13892,Ooceraea biroi (Clonal raider ant) (Cerapachys...,541,MVYKPKVYVIGVGMTKFEKPGRRDDFDYPQMAKEAVSKALQDSRIY...,5,ChEBI,17759,62960,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...


In [5]:
# Collapse the exploded rows to one row per distinct SMILES string. The protein
# columns are gathered into lists (one element per contributing row), while the
# ChEBI IDs are de-duplicated.
grouped_df = df_exploded.groupby('SMILES').agg({
    # Unique ChEBI IDs in order of first appearance. dict.fromkeys gives a
    # deterministic order; list(set(...)) can reorder string IDs between runs
    # because string hashing is randomised per interpreter session.
    'ChEBI ID': lambda ids: list(dict.fromkeys(ids)),
    'Entry': list,
    'Entry Name': list,
    'Protein names': list,
    'Gene Names': list,
    'Organism': list,
    'Length': list,
    'Sequence': list
}).reset_index()


In [6]:
# Display the per-SMILES table built by the groupby above.
grouped_df

Unnamed: 0,SMILES,ChEBI ID,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Sequence
0,**,[17792],"[P00502, P04904, P08263, P0CG30, P13745, P3011...","[GSTA1_RAT, GSTA3_RAT, GSTA1_HUMAN, GSTT2_HUMA...",[Glutathione S-transferase alpha-1 (EC 2.5.1.1...,"[Gsta1, Gsta3 Gstyc1, GSTA1, GSTT2B GSTT2, Gst...","[Rattus norvegicus (Rat), Rattus norvegicus (R...","[222, 221, 222, 244, 223, 221, 222, 222]",[MSGKPVLHYFNARGRMECIRWLLAAAGVEFDEKFIQSPEDLEKLK...
1,C(SCCNC(CCNC(=O)[C@@H](C(COP([O-])(=O)[O-])(C)...,[132021],"[P0C024, Q99P30]","[NUDT7_HUMAN, NUDT7_MOUSE]",[Peroxisomal coenzyme A diphosphatase NUDT7 (E...,"[NUDT7, Nudt7]","[Homo sapiens (Human), Mus musculus (Mouse)]","[238, 236]",[MSRLGLPEEPVRNSLLDDAKARLRKYDIGGKYSHLPYNKYSVLLP...
2,C12(C=CC(C=C1CCC3C2CCC4(C3CCC4*)C)=O)C,[77166],[P71864],[3O1D_MYCTU],[3-oxosteroid 1-dehydrogenase (EC 1.3.99.4) (3...,[kstD Rv3537],[Mycobacterium tuberculosis (strain ATCC 25618...,[563],[MTVQEFDVVVVGSGAAGMVAALVAAHRGLSTVVVEKAPHYGGSTA...
3,C12(CCCCC1CCC3C2C(CC4(C3CCC4*)C)=O)C,[47787],"[A0A096MXN2, A0A218UG05, A0A2J8V6J2, A0A2K5CJR...","[A0A096MXN2_PAPAN, A0A218UG05_9PASE, A0A2J8V6J...",[11-beta-hydroxysteroid dehydrogenase 1 (EC 1....,"[HSD11B1, HSD11B1_1 RLOC_00001311, HSD11B1 CR2...","[Papio anubis (Olive baboon), Lonchura striata...","[292, 363, 292, 291, 292, 292, 292, 292, 292, ...",[MAFMKKYLLPILGLFMAYYYYSAYEEFRPEMLQGKKVIVTGASKG...
4,C12(CCCCC1CCC3C2[C@H](CC4(C3CCC4*)C)O)C,[35346],"[A0A096MXN2, A0A218UG05, A0A2J8V6J2, A0A2K5CJR...","[A0A096MXN2_PAPAN, A0A218UG05_9PASE, A0A2J8V6J...",[11-beta-hydroxysteroid dehydrogenase 1 (EC 1....,"[HSD11B1, HSD11B1_1 RLOC_00001311, HSD11B1 CR2...","[Papio anubis (Olive baboon), Lonchura striata...","[292, 363, 292, 291, 292, 292, 292, 292, 292, ...",[MAFMKKYLLPILGLFMAYYYYSAYEEFRPEMLQGKKVIVTGASKG...
...,...,...,...,...,...,...,...,...,...
546,[H][C@]12CC[C@]3([H])[C@]([H])(C[C@@H](O)[C@]4...,[71002],[C8WLM1],[CGR2_EGGLE],[Digoxin reductase (EC 1.3.2.-) (Cardenolide r...,[cgr2 Elen_2529],[Eggerthella lenta (strain ATCC 25559 / DSM 22...,[560],[MEYGKCRGIERGMGRRDFLKAATLLGATAAGAGMLAGCAPKSASE...
547,[H][C@]12C[C@@H](O)[C@@]3([H])[C@]4([H])CC[C@]...,[2288],"[P31210, P51857, Q8VCX1, Q9TV64]","[AK1D1_RAT, AK1D1_HUMAN, AK1D1_MOUSE, AK1D1_RA...",[Aldo-keto reductase family 1 member D1 (EC 1....,"[Akr1d1, AKR1D1 SRD5B1, Akr1d1, AKR1D1]","[Rattus norvegicus (Rat), Homo sapiens (Human)...","[326, 326, 325, 326]",[MNLSTANHHIPLNDGNSIPIIGLGTYSDPRPVPGKTFIAVKTAID...
548,[H][C@]12C[C@@H](O)[C@]3([H])[C@]([H])(CC[C@]4...,[2290],"[P31210, P51857, Q8VCX1, Q9TV64]","[AK1D1_RAT, AK1D1_HUMAN, AK1D1_MOUSE, AK1D1_RA...",[Aldo-keto reductase family 1 member D1 (EC 1....,"[Akr1d1, AKR1D1 SRD5B1, Akr1d1, AKR1D1]","[Rattus norvegicus (Rat), Homo sapiens (Human)...","[326, 326, 325, 326]",[MNLSTANHHIPLNDGNSIPIIGLGTYSDPRPVPGKTFIAVKTAID...
549,[H][C@]12C[C@@]3([H])[C@]4([H])CC=C5C[C@H](CC[...,[2938],[Q38786],[AVCO1_AVESA],[Avenacosidase 1 (EC 3.2.1.188) (26-desgluco-a...,[P60A GLU1],[Avena sativa (Oat)],[574],[MALLCSALSNSTHPSFRSHIGANSENLWHLSADPAQKSKRRCNLT...


In [7]:
# Build a single Morgan fingerprint generator (radius 2, 2048 bits) and reuse it for every molecule
morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)

# Parse each SMILES string into an RDKit molecule
grouped_df['mol'] = grouped_df['SMILES'].map(Chem.MolFromSmiles)

# Fingerprint each molecule; rows whose 'mol' entry is falsy (e.g. None) get None instead
grouped_df['fp'] = grouped_df['mol'].map(
    lambda molecule: None if not molecule else morgan_gen.GetFingerprint(molecule)
)

In [8]:
# Discard rows without a fingerprint, then assemble the remaining ones into one array
valid_fps = grouped_df['fp'].dropna()
fp_array = np.array(list(map(np.asarray, valid_fps)))


In [9]:
# Diagnostic cell: print the file path of the umap package the kernel resolved.
# NOTE(review): umap is already imported in the first cell, so this import is redundant,
# and the printed path is machine-specific — consider removing this cell before sharing.
import umap
print(umap.__file__)


/Users/akshayuppal/.pyenv/versions/3.12.8/lib/python3.12/site-packages/umap/__init__.py


In [10]:
from umap import UMAP

In [11]:
# Embed the fingerprint matrix in two dimensions with UMAP using Jaccard distance.
# random_state pins UMAP's stochastic steps so that re-running the notebook
# reproduces the same coordinates (and therefore the same exported CSV).
umap_coords = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='jaccard',
    random_state=42,
).fit_transform(fp_array)

# Insert UMAP back into the dataframe (filtered to match valid_fps).
# fp_array was built from valid_fps in order, so row i of umap_coords
# corresponds to the i-th index label of valid_fps.
grouped_df_valid = grouped_df.loc[valid_fps.index].copy()
grouped_df_valid['UMAP_1'] = umap_coords[:, 0]
grouped_df_valid['UMAP_2'] = umap_coords[:, 1]

  warn(


In [12]:
# List the available columns before choosing which ones to export.
grouped_df_valid.columns

Index(['SMILES', 'ChEBI ID', 'Entry', 'Entry Name', 'Protein names',
       'Gene Names', 'Organism', 'Length', 'Sequence', 'mol', 'fp', 'UMAP_1',
       'UMAP_2'],
      dtype='object')

In [13]:
# Export view: the annotation columns plus the two UMAP coordinates.
# The 'mol' and 'fp' object columns are not selected.
umap_df = grouped_df_valid.loc[:, [
    'SMILES',
    'ChEBI ID',
    'Entry',
    'Entry Name',
    'Protein names',
    'Gene Names',
    'Organism',
    'Length',
    'Sequence',
    'UMAP_1',
    'UMAP_2',
]].copy()

In [14]:
# Write the export table to CSV in the working directory, omitting the index column.
umap_df.to_csv(path_or_buf="small_molecule.csv", index=False)