### Manipulating Molecules and Proteins within OpenBioMed

OpenBioMed develops specific data structures for handling molecules, pockets, proteins and other data modalities in chemistry and life science. We provide easy-to-use APIs for input loading, data manipulation and format transformation.

In [1]:
# Change working directory to the parent of the directory the kernel starts in
import os
import sys

# Resolve the target directory only once per kernel session.
# os.path.abspath('') depends on the current working directory, which this cell
# itself changes below; without this guard, re-running the cell would climb one
# directory higher on every execution and break the imports that follow.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.path.abspath(''))
parent = PROJECT_ROOT
print(parent)
# Do not stack duplicate entries on sys.path when the cell is re-executed
if parent not in sys.path:
    sys.path.append(parent)
os.chdir(parent)

import logging
# Configure logging so that only messages at ERROR level or above are shown
logging.basicConfig(level=logging.ERROR)

/AIRvePFS/dair/luoyz-data/projects/OpenBioMed/OpenBioMed_arch


Here we present several basic usages of the `Molecule` data structure.

In [2]:
# Manipulating molecules
from open_biomed.data.molecule import Molecule

# SMILES string describing the example molecule
smiles = "CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=CC=C5)Cl)C(=N2)N6CCN([C@H](C6)CC#N)C(=O)C(=C)F"
# Where the molecule is written once it carries a conformer
sdf_path = "./tmp/molecule_with_conformation.sdf"

# Build the molecule from its SMILES representation
molecule = Molecule.from_smiles(smiles)

# Attach an RDKit molecule object; afterwards molecule.rdmol gives access to
# RDKit for reading or modifying information
molecule._add_rdmol()
num_atoms = molecule.rdmol.GetNumAtoms()
num_bonds = molecule.rdmol.GetNumBonds()
print(num_atoms, num_bonds)

# Generate a 3D conformer; the coordinates are exposed as an ndarray on
# molecule.conformer and are synchronized in molecule.rdmol
molecule._add_conformer(mode='3D')
coordinates = molecule.conformer
print(coordinates)

# Write the molecule to a .sdf file and show what save_sdf returns
saved = molecule.save_sdf(sdf_path)
print(saved)

43 48
[[ 7.15979818  0.80874434 -1.1777022 ]
 [ 6.03656999  0.98159063 -0.25523318]
 [ 6.5018441   1.18971222  1.11943242]
 [ 6.66523363 -0.20081618  1.6896087 ]
 [ 5.54799761 -1.00059728  1.0349241 ]
 [ 5.15551891 -0.20738398 -0.21709433]
 [ 3.67186942  0.23835869 -0.15975325]
 [ 2.79871947 -0.89883247 -0.09910995]
 [ 1.46873112 -0.51443636 -0.1461924 ]
 [ 1.15329933  0.75341864  0.16184495]
 [-0.15212908  1.07702788  0.11271499]
 [-1.13060623  0.13843592 -0.18651713]
 [-2.56195997  0.60224809 -0.26848812]
 [-2.61589085  1.98880147 -0.91233925]
 [-1.7825011   2.96467228 -0.19616682]
 [-0.53343712  2.50816882  0.45025218]
 [-2.50230208  3.96691016  0.47985061]
 [-3.13052426  3.64024452  1.69501868]
 [-3.93447426  4.55568201  2.36219254]
 [-4.12855294  5.818542    1.8206921 ]
 [-3.51034937  6.19212391  0.61558734]
 [-2.67001728  5.2823797  -0.0931741 ]
 [-2.08308779  5.79607309 -1.29704122]
 [-2.33174422  7.10892277 -1.74528179]
 [-3.17644012  7.95082858 -1.04553662]
 [-3.7554685   7.49

In [3]:
import asyncio
from open_biomed.core.web_request import PubChemRequester, PubChemStructureRequester
from open_biomed.data.molecule import check_identical_molecules, molecule_fingerprint_similarity

# We provide APIs for obtaining molecules from PubChem via molecule name or ID
requester = PubChemRequester(db_url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{accession}/SDF")
# Web request is performed via asynchronous calls
pubchem_molecule = (await asyncio.create_task(requester.run("Adagrasib")))[0][0]
# Check if Adagrasib is the same molecule as the one we construct earlier
print(pubchem_molecule, check_identical_molecules(molecule, pubchem_molecule))

# We also provide APIs for searching struturally similar molecules on PubChem
requester = PubChemStructureRequester()
similar_molecules = (await asyncio.create_task(requester.run(
    molecule=pubchem_molecule,    # query molecule
    threshold=0.8,                # structural similarity threshold
    max_records=5,                # maximum number of molecules
)))
# The first molecule is the requested molecule itself (identical)
for i in range(1, 5):
    # Check the SMILES and saved files of similar molecules
    print(similar_molecules[0][i], similar_molecules[1][i])
    # Let's check the Morgan fingerprint similarity of returned molecule!
    # NOTE: the threshold value used in Pubchem is not Morgan fingerprint similarity, so the output scores could be lower than 0.8
    print(molecule_fingerprint_similarity(pubchem_molecule, similar_molecules[0][i], fingerprint_type="morgan"))

C=C(F)C(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N True
C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c(F)c(-c4nc(N)cc(C)c4C(F)(F)F)c(Cl)cc23)[C@@H](C)C1 ./tmp/pubchem_146624881.pkl
0.31693989071038253
C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134325731.pkl
0.8769230769230769
C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(C)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134326084.pkl
0.7941176470588235
CN1CCC[C@H]1COc1nc(N2CCNCC2)c2cnc(-c3cccc4cccc(Cl)c34)c(F)c2n1 ./tmp/pubchem_155233433.pkl
0.46153846153846156


Here we present some basic usages of the `Protein` data structure.

In [4]:
# Manipulating proteins
from open_biomed.data.protein import Protein, protein_sequence_similarity

# Amino acid sequences of the two proteins to compare
sequence1 = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM"
sequence2 = "MTEYKLVVVGAVGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNIKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCALPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKSKTKCSIL"

# Initialize each protein object from its amino acid sequence
protein1 = Protein.from_fasta(sequence1)
protein2 = Protein.from_fasta(sequence2)

# Protein sequence alignment
outputs = protein_sequence_similarity(protein1, protein2)
# Print the first three entries of the result, one per line; in the recorded run
# these are a similarity score followed by the two aligned sequences
for index in range(3):
    print(outputs[index])


0.8164251207729468
MTEYKLVVVGA-GGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNT-KSFEDIHHYREQIKRVKDSEDVPMVLVGNKCD-LPSRTVDTKQAQDLARSYGIPFIETSAKTRQR-VED-AFYTLVREIRQYRLK-KISKEEKTP----GCVKIKK------C-IIM-
MTEYKLVVVGAVG-VGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINN-IKSFEDIHHYREQIKRVKDSEDVPMVLVGNKC-ALPSRTVDTKQAQDLARSYGIPFIETSAKTRQ-GV-DDAFYTLVREIR----KHK---E-K--MSKDG--K-KKKKSKTKCSI--L


In [5]:
import asyncio
from open_biomed.core.web_request import PDBRequester, MSARequester, FoldSeekRequester

# We provide APIs for obtaining proteins on PDB, AlphafoldDB and UniProt
requester = PDBRequester()
# Web request is performed via asynchronous calls
protein_with_structure = (await asyncio.create_task(requester.run("4EPT")))[0][0]
print(protein_with_structure)
# If the protein object has 3D structure, you can save it as a pdb file
file = protein_with_structure.save_pdb()
# We also provide APIs for loading a protein from a pdb file
loaded_protein = Protein.from_pdb_file(file)
print(loaded_protein)

# We also provide APIs for performing MSA search and FoldSeek with web services
requester = MSARequester()
# NOTE: Performing MSA may take a long time and may occasionally fail
outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]
# Check the .a3m file for MSA results!
print(outputs)

requester = FoldSeekRequester(database=["afdb50"], timeout=60)
# NOTE: Performing FoldSeek may take a long time and may occasionally fail
outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]
# Check the folder for FoldSeek results!
print(outputs)

MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH
MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH
./tmp/msa_results_1741148484988/uniref.a3m
./tmp/foldseek_results_1741148485003
