##### Import libraries


In [1]:
import pubchempy as pcp  # https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html
import requests
import xml.etree.ElementTree as ET


##### Search for the PubChem CID ('Compound ID')


In [2]:
# CAS Registry Number to look up; uncomment one of the alternatives to try another compound.
cas_nr = '64-19-7'    # acetic acid   >>> pKa = 4.76 at 25 °C
# cas_nr = '2950-43-8'    # Hydroxylamine-O-sulfonic acid, no result
# cas_nr = '75-75-2'    # methanesulfonic acid   >>> pKa = -1.86

# Browser-like User-Agent passed along with the direct `requests.get` call made later on.
user_agent = 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
headers = {'user-agent': user_agent}

print('Searching Pubchem...')

# Look the CAS number up by name through the PubChemPy wrapper.
# By default this is an exact-match search and the CIDs come back as a list
# (an empty one when nothing is found).
# Alternative call kept for reference:
# cid = pcp.get_cids(cas_nr, 'name', 'substance', list_return='flat')
cid = pcp.get_cids(cas_nr, 'name')
print(cid)

Searching Pubchem...
[176]



##### If a CID is found, double-check it against the CAS number using the compound's synonyms


In [3]:
# The lookup yields an empty list when the CAS number is unknown, so a
# non-empty list means PubChem has this chemical.
if len(cid) > 0:
    # Keep only the first hit. NOTE: this rebinds `cid` from a list to a single
    # value, so re-running this cell without re-running the search cell fails
    # at `len(cid)`.
    cid = cid[0]
    # print('Compound ID (CID) from PubChem is: {} and type is: {}'.format(cid, type(cid)))

    # To double check that the CAS number is correct, fetch the synonyms.
    # The call returns a list of dicts; take the first entry and read the
    # values stored under its 'Synonym' key.
    synonym_records = pcp.get_synonyms(cid)
    synonyms = synonym_records[0]['Synonym']
#     print('List of synonyms is: {}'.format(synonyms))


##### Raise an exception if it is not an exact match
##### If it is an exact match, send a request to PubChem PUG View for the 'Dissociation Constants' heading


In [4]:
    # The CAS number has to appear verbatim among the synonyms; otherwise the
    # hit is rejected as not being an exact match.
    if cas_nr not in synonyms:
        raise ValueError('\tThis is not an exact match!')

    # Build the PubChem PUG View URL that returns only the
    # 'Dissociation Constants' section for this CID.
    # 'XML' can be replaced with 'JSON' but it is harder to parse later on.
    # For more info about Pubchem output types:
    # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
    pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

    # Fetch the record for this CID; the timeout keeps the cell from hanging
    # indefinitely on an unresponsive server.
    r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)
    # Continue only on a direct OK (200) answer, i.e. one that was not reached
    # through a redirect (empty r.history).
    if r.status_code == 200 and len(r.history) == 0:
        print(r.text)
        # Parse the raw bytes rather than r.text: the XML parser then honours
        # the encoding declared in the document itself instead of the charset
        # requests guessed, which keeps characters such as '°' intact.
        tree = ET.fromstring(r.content)

<?xml version="1.0" encoding="UTF-8"?>
<Record
    xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_view"
    xmlns:xs="http://www.w3.org/2001/XMLSchema-instance"
    xs:schemaLocation="http://pubchem.ncbi.nlm.nih.gov/pug_view https://pubchem.ncbi.nlm.nih.gov/pug_view/pug_view.xsd"
>
  <RecordType>CID</RecordType>
  <RecordNumber>176</RecordNumber>
  <RecordTitle>Acetic acid</RecordTitle>
  <Section>
    <TOCHeading>Chemical and Physical Properties</TOCHeading>
    <Description>Chemical and physical properties such as melting point, molecular weight, etc.</Description>
    <Section>
      <TOCHeading>Experimental Properties</TOCHeading>
      <Description>Properties determined experimentally (See also Safety and Hazard Properties section for more information if available)</Description>
      <Section>
        <TOCHeading>Dissociation Constants</TOCHeading>
        <Description>A specific type of equilibrium constant that measures the propensity of a larger object to separate (dissociate) rev

In [5]:
        # Gather the text of every <String> element in the PUG View namespace.
        # A record can hold more than one value; keep them all instead of
        # letting each match overwrite the previous one, and skip empty
        # elements whose text would otherwise be None.
        string_tag = '{http://pubchem.ncbi.nlm.nih.gov/pug_view}String'
        pka_values = [node.text for node in tree.iter(string_tag) if node.text]

        # pka_result stays a plain string: '' when nothing was found, the
        # single value when there is one, otherwise one value per line.
        pka_result = '\n'.join(pka_values)

        print(pka_result)



pKa = 4.76 at 25 °C


---
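The next part is an example of searching PubChem for substances in case no compound can be found.
**Note that the part below has not been adapted to searching for pKa.** It is left commented out and relies on names (`download_file`, `is_binary_string`, `is_empty_mol_file`, `os`) that are not defined in the cells above.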

In [6]:
# # If not, try to find substances as well
# elif len(cid) == 0:
#     '''pcp.get_substances(cas_nr, 'name') returns a list of Substances if found: 
#     Ref: https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L328'''
#     substances = pcp.get_substances(cas_nr, 'name')   
#     # print(sid); exit(0)

#     if len(substances) == 0:
#         # print('nothing here')
#         raise ValueError('Could not find any compounds or substances with this CAS {} on Pubchem.'.format(cas_nr))
#     else:
#         for substance in substances:
#             # print('Substance ID (SID) from PubChem is: {} and type is: {}'.format(substance, type(substance)))

#             '''Ref: https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L735'''
#             # substance_synonyms = substance.to_dict(properties=['synonyms'])['synonyms']
#             '''
#             substance.to_dict(properties=['synonyms']) return example:
#             {'synonyms': ['12259-21-1', 'Iron oxide (Fe2O3), hydrate', 'Ferric oxide hydrate', 
#                             'Ferrox', 'Hydrated ferric oxide', 'Hydrous ferric oxide', 
#                             'Iron oxide (Fe203), hydrate']}
#             '''

#             substance_synonyms = substance.synonyms   # https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L1095
#             '''
#             substance.synonyms' return example:
#                 ['12259-21-1', 'Iron oxide (Fe2O3), hydrate', 'Ferric oxide hydrate', 
#                 'Ferrox', 'Hydrated ferric oxide', 'Hydrous ferric oxide', 
#                 'Iron oxide (Fe203), hydrate']
#             '''

#             # Check to make sure the substance has the same CAS#
#             if cas_nr in substance_synonyms:
#                 sdf = pcp.get_sdf(identifier=substance.sid, namespace='sid', domain='substance')
#                 # print(sdf)
#                 if sdf:    # pcp.get_sdf return None if not found SDF                               
#                     download_file.write_text(data=sdf)

#                     # Check if the mol file is a binary string (some error during downloading) or empty mol file:
#                     if is_binary_string(open(download_file, 'rb').read(1024)) or is_empty_mol_file(download_file):
#                         os.remove(download_file)    # remove the error mol file
#                     else:
#                         return 0