<a href="https://colab.research.google.com/github/jeremycheminf/colab_notebooks/blob/main/SureChEMBL_FPSim2_similarity_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run a similarity search using SureChEMBL's FPSim2 database
- First, install FPSim2 (together with a pinned RDKit release)


In [1]:
%%time
!pip install fpsim2 rdkit==2024.03.5

Collecting fpsim2
  Downloading FPSim2-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting rdkit==2024.03.5
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting tables>=3.10 (from fpsim2)
  Downloading tables-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting blosc2>=2.3.0 (from tables>=3.10->fpsim2)
  Downloading blosc2-2.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting ndindex>=1.4 (from blosc2>=2.3.0->tables>=3.10->fpsim2)
  Downloading ndindex-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FPSim2-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)


# Download SureChEMBL's FPSim2 file (created with data from November 2024)

In [2]:
%%time
import gdown

file_id = '1R3eGLYBWVkckkHNZkJOQRN5lJ4K39-Yl'
fp_filename = 'surechembl_11_2024.h5'

download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, fp_filename, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1R3eGLYBWVkckkHNZkJOQRN5lJ4K39-Yl
From (redirected): https://drive.google.com/uc?id=1R3eGLYBWVkckkHNZkJOQRN5lJ4K39-Yl&confirm=t&uuid=82529a16-75b5-4a5b-b3c1-5b09ebfb6e89
To: /content/surechembl_11_2024.h5
100%|██████████| 1.34G/1.34G [00:17<00:00, 76.4MB/s]

CPU times: user 3.45 s, sys: 3.37 s, total: 6.81 s
Wall time: 21.4 s





'surechembl_11_2024.h5'

# Load the FPSim2 file into memory

In [3]:
%%time
from FPSim2 import FPSim2Engine

fpe = FPSim2Engine(fp_filename)
print(f"{fpe.fps.shape[0]} molecules")

28587451 molecules
CPU times: user 2.66 s, sys: 1.04 s, total: 3.7 s
Wall time: 9.25 s


# Run the search!

In [15]:
%%time
query = 'CC(=O)Oc1ccccc1C(=O)O'
results = fpe.similarity(query, 0.7)
df = pd.DataFrame(results)
df['mol_id'] = df['mol_id'].astype(str)
df

CPU times: user 67.3 ms, sys: 0 ns, total: 67.3 ms
Wall time: 65.8 ms




Unnamed: 0,mol_id,coeff
0,28300944,1.0
1,29038900,1.0
2,27908457,1.0
3,7580,1.0
4,28268966,1.0
...,...,...
204,3239715,0.7
205,1156927,0.7
206,8050658,0.7
207,16966572,0.7


In [16]:
# prompt: From the df above I want to get info from the API https://www.surechembl.org/api/chemical/id/10408439 where ids are mol_id from the df. The outcome is a json like {"status":"OK","timestamp":"2024-11-12 15:55:27 +0000","error_message":"","data":[{"id":"10408439","chemical_id":"10408439","name":"2-(acetyloxy)benzoic acid 3-methylphenyl acetate","smiles":"CC(=O)OC1=CC(C)=CC=C1.CC(=O)OC1=C(C=CC=C1)C(O)=O","inchi":"InChI=1S/C9H8O4.C9H10O2/c1-6(10)13-8-5-3-2-4-7(8)9(11)12;1-7-4-3-5-9(6-7)11-8(2)10/h2-5H,1H3,(H,11,12);3-6H,1-2H3","inchi_key":"PGTLCPXGJDOQSY-UHFFFAOYSA-N","mol_weight":330.3320007324219,"is_element":"0","global_frequency":2,"mchem_struct_alert":"1","is_radical":"0","is_fragment":"0","is_connected":"0","is_singleton":"0","is_simple":"0","is_lipinski":"1","is_lead_likeness":"1","is_bio_availability":"1","log_p":1.2380900382995605,"donor_count":1,"accept_count":3,"ring_count":2,"rotatable_bond_count":5}]}  I want this data to be new columns

import pandas as pd
import requests

def get_surechembl_data(mol_id, timeout=30):
  """Fetch the SureChEMBL API record for one chemical id.

  Args:
    mol_id: Chemical id that is interpolated into the API URL.
    timeout: Seconds to wait for the server before giving up, so that a
      stalled connection cannot block the caller indefinitely.

  Returns:
    The decoded JSON body when the request succeeds with status code 200,
    otherwise None. Every failure is reported with a printed message instead
    of an exception, so a caller looping over many ids can keep going.
  """
  url = f"https://www.surechembl.org/api/chemical/id/{mol_id}"
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException as exc:
    # Connection errors and timeouts are handled like a bad status code.
    print(f"Error fetching data for ID {mol_id}: {exc}")
    return None

  if response.status_code != 200:
    print(f"Error fetching data for ID {mol_id}: Status code {response.status_code}")
    return None

  try:
    return response.json()
  except ValueError as exc:
    # A 200 response whose body is not valid JSON.
    print(f"Error decoding data for ID {mol_id}: {exc}")
    return None

# Expects a DataFrame named 'df' with a 'mol_id' column.
# Fields copied from each SureChEMBL API record into new DataFrame columns.
api_fields = [
    "chemical_id", "name", "smiles", "inchi", "inchi_key", "mol_weight", "is_element",
    "global_frequency", "mchem_struct_alert", "is_radical", "is_fragment", "is_connected",
    "is_singleton", "is_simple", "is_lipinski", "is_lead_likeness", "is_bio_availability",
    "log_p", "donor_count", "accept_count", "ring_count", "rotatable_bond_count",
]

# One list per new column; values are appended in the same order as the rows of df.
new_columns_data = {col: [] for col in api_fields}

for mol_id in df['mol_id']:
    payload = get_surechembl_data(mol_id)
    # Use .get() so a payload that lacks 'status' or 'data' counts as
    # "no data" instead of raising KeyError and aborting the whole loop.
    if isinstance(payload, dict) and payload.get('status') == 'OK' and payload.get('data'):
        chemical_data = payload['data'][0]
    else:
        # No usable record: every field falls back to None below.
        chemical_data = {}
    for col, values in new_columns_data.items():
        values.append(chemical_data.get(col))

# Add the new columns to the DataFrame
for col, values in new_columns_data.items():
    df[col] = values

In [17]:
# Display the DataFrame, which now also holds the columns added from the SureChEMBL API.
df

Unnamed: 0,mol_id,coeff,chemical_id,name,smiles,inchi,inchi_key,mol_weight,is_element,global_frequency,...,is_singleton,is_simple,is_lipinski,is_lead_likeness,is_bio_availability,log_p,donor_count,accept_count,ring_count,rotatable_bond_count
0,28300944,1.0,28300944,2-(acetyloxy)benzoic acid; propan-2-one,CC(C)=O.CC(=O)OC1=CC=CC=C1C(O)=O,InChI=1S/C9H8O4.C3H6O/c1-6(10)13-8-5-3-2-4-7(8...,JZAWQZDCNCWEHV-UHFFFAOYSA-N,238.238998,0,1,...,0,0,1,1,1,1.238090,1,3,1,3
1,29038900,1.0,29038900,2-(acetyloxy)benzoic acid alumanyl,[AlH2].CC(=O)OC1=C(C=CC=C1)C(O)=O,InChI=1S/C9H8O4.Al.2H/c1-6(10)13-8-5-3-2-4-7(8...,UEIUQRUWVHQQCT-UHFFFAOYSA-N,209.156998,0,1,...,0,0,1,1,1,1.238090,1,3,1,3
2,27908457,1.0,27908457,2-(acetyloxy)benzoic acid platinum,[Pt].CC(=O)OC1=CC=CC=C1C(O)=O,InChI=1S/C9H8O4.Pt/c1-6(10)13-8-5-3-2-4-7(8)9(...,LRDVCUFIJAJFAT-UHFFFAOYSA-N,375.243011,0,1,...,0,0,1,1,1,1.238090,1,3,1,3
3,7580,1.0,7580,bis(2-(acetyloxy)benzoic acid),CC(=O)OC1=CC=CC=C1C(O)=O.CC(=O)OC1=C(C=CC=C1)C...,InChI=1S/2C9H8O4/c2*1-6(10)13-8-5-3-2-4-7(8)9(...,LSVICRMDTZSTDC-UHFFFAOYSA-N,360.315002,0,37,...,0,0,1,1,1,1.238090,1,3,2,6
4,28268966,1.0,28268966,2-(acetyloxy)benzoic acid neodymium,[Nd].CC(=O)OC1=CC=CC=C1C(O)=O,InChI=1S/C9H8O4.Nd/c1-6(10)13-8-5-3-2-4-7(8)9(...,BZKNRFSFQIEBAP-UHFFFAOYSA-N,324.401001,0,1,...,0,0,1,1,1,1.238090,1,3,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,3239715,0.7,3239715,2-(but-1-en-2-yl)phenyl acetate,CCC(=C)C1=C(OC(C)=O)C=CC=C1,InChI=1S/C12H14O2/c1-4-9(2)11-7-5-6-8-12(11)14...,AFCGUQKQHRDZPR-UHFFFAOYSA-N,190.238007,0,1,...,0,0,1,1,1,3.061170,0,1,1,4
205,1156927,0.7,1156927,"3-(acetyloxy)benzene-1,2-dicarboxylic acid",CC(=O)OC1=C(C(O)=O)C(=CC=C1)C(O)=O,InChI=1S/C10H8O6/c1-5(11)16-7-4-2-3-6(9(12)13)...,OUIUFQLVPHLLOY-UHFFFAOYSA-N,224.167007,0,22,...,0,0,1,0,1,0.895673,2,5,1,4
206,8050658,0.7,8050658,2-(2-bromoacetyl)phenyl acetate,CC(=O)OC1=CC=CC=C1C(=O)CBr,InChI=1S/C10H9BrO3/c1-7(12)14-10-5-3-2-4-8(10)...,DGGQZQQASYINLT-UHFFFAOYSA-N,257.080994,0,5,...,0,0,1,1,1,1.860990,0,2,1,4
207,16966572,0.7,16966572,bis(2-[(2-methylprop-2-enoyl)oxy]benzoic acid),CC(=C)C(=O)OC1=CC=CC=C1C(O)=O.CC(=C)C(=O)OC1=C...,InChI=1S/2C11H10O4/c2*1-7(2)11(14)15-9-6-4-3-5...,RKOMFEJKYXWXCF-UHFFFAOYSA-N,412.389008,0,1,...,0,0,1,1,1,2.623270,1,3,2,8
