# PubChem Molecular Formula Search example

In this example, we run a `MolecularFormulaSearch` to retrieve the molecular formula and canonical SMILES of all compounds on PubChem that consist exclusively of the elements C, H, B and Al.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pubchem_api_crawler.molecular_search import MolecularFormulaSearch

In [3]:
import logging

# Name used to recognise the console handler installed by this cell.
_CONSOLE_HANDLER_NAME = 'pubchem_api_crawler.notebook_console'

# Route the crawler's INFO-level records to the notebook output.
logger = logging.getLogger('pubchem_api_crawler')
logger.setLevel(logging.INFO)

# Re-running this cell used to stack one more StreamHandler each time, so every
# record was printed once per run. Remove the handler left by a previous run
# of this cell (and only that one) before attaching a fresh handler.
for _stale in [h for h in logger.handlers if h.name == _CONSOLE_HANDLER_NAME]:
    logger.removeHandler(_stale)

ch = logging.StreamHandler()
ch.set_name(_CONSOLE_HANDLER_NAME)
ch.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(ch)

In [12]:
# Build the search client and run a formula query. The element specs are joined
# into the fastformula request URL (see the logged request in this cell's output),
# other elements are disallowed, and two properties are requested per hit.
mf = MolecularFormulaSearch()
df = mf.search(
    ["C1-", "H1-", "B1-", "Al2-"],
    allow_other_elements=False,
    properties=["MolecularFormula", "CanonicalSMILES"],
)

2024-01-29 13:43:21,540 - pubchem_api_crawler.molecular_search - INFO - Exceuting Molecular Formula Search request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/C1-H1-B1-Al2-/property/MolecularFormula,CanonicalSMILES/JSON?AllowOtherElements=false&MaxRecords=2000000
2024-01-29 13:43:23,995 - pubchem_api_crawler.molecular_search - INFO - Request Count status: Green (0%), Request Time status: Green (0%), Service status: Green (13%)


In [13]:
# Display the table returned by `mf.search` (indexed by CID, per the output below).
df

Unnamed: 0_level_0,MolecularFormula,CanonicalSMILES
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
160469542,C8H26Al2B2,[B](C)C.[B](C)C.C[AlH]C.C[AlH]C
159970515,C20H48Al2B2,B(C)(CCB(C)CCC)CCC.CCC[Al](C)CC[Al](C)CCC


In [38]:
# Run the formula query with "Al1-" as the aluminium spec, again disallowing
# other elements, then display the result. Note that this rebinds `df`.
df = mf.search(
    ["C1-", "H1-", "B1-", "Al1-"],
    allow_other_elements=False,
    properties=["MolecularFormula", "CanonicalSMILES"],
)
df

2024-01-29 14:17:20,717 - pubchem_api_crawler.molecular_search - INFO - Exceuting Molecular Formula Search request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/C1-H1-B1-Al1-/property/MolecularFormula,CanonicalSMILES/JSON?AllowOtherElements=false&MaxRecords=2000000
2024-01-29 14:17:22,055 - pubchem_api_crawler.molecular_search - INFO - Request Count status: Green (0%), Request Time status: Green (0%), Service status: Green (17%)


Unnamed: 0_level_0,MolecularFormula,CanonicalSMILES
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
168084494,CH5AlB2,[BH].[BH].C[Al]
163556649,C16H14AlB,[B]CCC1=C2CCC=CC2=C(C3=CC=CC=C31)[Al]
161576177,C27H30AlB,[H+].[B-](C1=CC=CC=C1)(C2=CC=CC=C2)(C3=CC=CC=C...
160352291,C6H15AlB,[B].CC[Al](CC)CC
159123289,C10H28AlB2,[B](C)C.[B](C)C.CCCC.C[Al]C
158802573,C11H29AlB,B(C)(C)C.CCCC.CC[Al]CC
158250967,C3H9AlB,[B].C[Al](C)C
158044531,C2H6AlB,[B].C[Al]C
157093180,C3H9AlB,B(C)(C)C.[Al]
156888304,C12H14AlB,[B]C1=CC=CC=C1C2CCCCC2[Al]


In [25]:
# Dimensions of the current `df`; presumably (rows, columns) of a pandas DataFrame.
df.shape

(19, 2)

In [17]:
# Call the private `_pug_search` helper directly; the log output of this cell
# shows it checking the status of a query until it reports success.
# NOTE(review): the specs here are "B-" / "Al-" while the `search` calls use
# "B1-" / "Al1-" — confirm the difference is intentional.
res = mf._pug_search(
    ["C1-", "H1-", "B-", "Al-"],
    allow_other_elements=False,
    properties=["MolecularFormula", "CanonicalSMILES"],
)

2024-01-29 13:44:56,097 - pubchem_api_crawler.molecular_search - INFO - Checking status for query 1078300897660591343.
2024-01-29 13:44:57,006 - pubchem_api_crawler.molecular_search - INFO - Query 1078300897660591343 is success.


In [29]:
# Display what `_pug_search` returned (a list of CIDs, per the output below).
res

168084494
163556649
161576177
160352291
159123289
158802573
158250967
158044531
157093180
156888304
129859217


In [35]:
# Fetch the requested properties for the CIDs held in `res` and bind the result.
# The original cell discarded the return value, yet the following cell displays
# `res` as a property table; that state is only reproducible if the result is
# assigned here. Because this rebinds `res` from CIDs to the property table,
# re-run the `_pug_search` cell before running this cell a second time.
res = mf._get_properties_for_cids(res, ["MolecularFormula", "CanonicalSMILES"])

cid=168084494,163556649,161576177,160352291,159123289,158802573,158250967,158044531,157093180,156888304,129859217,129657578,129657197,59992955,22996618,19734271,155575130


In [36]:
# Display the current value of `res` (a CID-indexed property table in the output below).
res

Unnamed: 0_level_0,MolecularFormula,CanonicalSMILES
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
168084494,CH5AlB2,[BH].[BH].C[Al]
163556649,C16H14AlB,[B]CCC1=C2CCC=CC2=C(C3=CC=CC=C31)[Al]
161576177,C27H30AlB,[H+].[B-](C1=CC=CC=C1)(C2=CC=CC=C2)(C3=CC=CC=C...
160352291,C6H15AlB,[B].CC[Al](CC)CC
159123289,C10H28AlB2,[B](C)C.[B](C)C.CCCC.C[Al]C
158802573,C11H29AlB,B(C)(C)C.CCCC.CC[Al]CC
158250967,C3H9AlB,[B].C[Al](C)C
158044531,C2H6AlB,[B].C[Al]C
157093180,C3H9AlB,B(C)(C)C.[Al]
156888304,C12H14AlB,[B]C1=CC=CC=C1C2CCCCC2[Al]
