In [1]:
# https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-notebook/path-notebook.html
# Make the local python files in ../src importable from this notebook.
import os
import sys

src_dir = os.path.abspath('../src')
# Re-running this cell must not stack duplicate entries on sys.path, so any
# earlier entry is dropped before the directory is put back at the front.
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)
# sys.path

#### Import libraries and packages

In [2]:
from pka_lookup_pubchem import pka_lookup_pubchem
import pubchempy as pcp
import re
import json

## Standard usage:

In [3]:
print('Example looking up by CAS:')
# CAS numbers demonstrated, in display order:
#   64-19-7    acetic acid                    >>> pKa = 4.76 at 25 °C
#   2950-43-8  Hydroxylamine-O-sulfonic acid  >>> no result (prints None)
# Another one to try: '75-75-2' (methanesulfonic acid   >>> pKa = -1.86)
for cas_nr in ('64-19-7', '2950-43-8'):
    print(pka_lookup_pubchem(cas_nr))

Example looking up by CAS:
{'input': '64-19-7', 'source': 'Pubchem', 'Pubchem CID': 176, 'pka': '4.76 at 25 °C', 'reference': 'Serjeant, E.P., Dempsey B.; Ionisation Constants of Organic  Acids in Aqueous Solution. International Union of Pure and  Applied Chemistry (IUPAC). IUPAC Chemical Data Series No.  23, 1979. New York, New York: Pergamon Press, Inc., p. 989'}
None


In [4]:
print('Example looking up by SMILES:')
# The namespace is passed explicitly as "smiles" for every lookup in this cell.
# Another SMILES to try: 'OC=1(N(N=C(C=1)C2(=CC=CC=C2))C)'
# The call also accepts the identifier alone: pka_lookup_pubchem(smiles_string)
for smiles_string in ('OC1=CC=CC=C1', 'C1=CC(=CC=C1F)S'):
    print(f'pKa from Pubchem using smiles:\n{pka_lookup_pubchem(smiles_string, "smiles")}')

Example looking up by SMILES:
pKa from Pubchem using smiles:
{'input': 'OC1=CC=CC=C1', 'source': 'Pubchem', 'Pubchem CID': 996, 'pka': '9.99 @ 25 °C', 'reference': 'Lide, D.R. (ed.). CRC Handbook of Chemistry and Physics. 83rd ed. Boca Raton, Fl: CRC Press Inc., 2002-2003., p. 8-49'}
pKa from Pubchem using smiles:
None


In [5]:
print('Example looking up by InChI:')

inchi_examples = (
    # NOT an exact match for the record Pubchem returns for this search
    'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,13H,1H3',
    # exact match for a Pubchem record, but that record has no pKa
    'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,11H,1H3',
    # thiophenol
    'InChI=1S/C6H6S/c7-6-4-2-1-3-5-6/h1-5,7H',
)

# Always pass "inchi" as the namespace for these strings. Author's note: calling
# with the wrong namespace ("smiles") returns a Pubchem CID even though the
# record is not an exact match for the input.
for inchi_string in inchi_examples:
    print(f'pKa from Pubchem using InChI:\n{pka_lookup_pubchem(inchi_string, "inchi")}')

Example looking up by InChI:
pKa from Pubchem using InChI:
None
pKa from Pubchem using InChI:
None
pKa from Pubchem using InChI:
{'input': 'InChI=1S/C6H6S/c7-6-4-2-1-3-5-6/h1-5,7H', 'source': 'Pubchem', 'Pubchem CID': 7969, 'pka': '6.62', 'reference': 'Serjeant, E.P., Dempsey B.; Ionisation Constants of Organic  Acids in Aqueous Solution. International Union of Pure and  Applied Chemistry (IUPAC). IUPAC Chemical Data Series No.  23, 1979. New York, New York: Pergamon Press, Inc., p. 165'}


In [6]:
# InChIKeys demonstrated, in display order:
#   OKKJLVBELUTLKV-UHFFFAOYSA-N  methanol
#   SNAOXHWORPTDTJ-UHFFFAOYNA-N  structure not found in pubchem
for inchikey_string in ('OKKJLVBELUTLKV-UHFFFAOYSA-N', 'SNAOXHWORPTDTJ-UHFFFAOYNA-N'):
    print(f'pKa from Pubchem using InChIKey:\n{pka_lookup_pubchem(inchikey_string, "inchikey")}')

pKa from Pubchem using InChIKey:
{'input': 'OKKJLVBELUTLKV-UHFFFAOYSA-N', 'source': 'Pubchem', 'Pubchem CID': 887, 'pka': '15.3', 'reference': 'Serjeant, E.P., Dempsey B.; Ionisation Constants of Organic  Acids in Aqueous Solution. International Union of Pure and  Applied Chemistry (IUPAC). IUPAC Chemical Data Series No.  23, 1979. New York, New York: Pergamon Press, Inc., p. 989'}
pKa from Pubchem using InChIKey:
None


## More usage cases:

### Using SMILES to look up the CAS number, then searching for the pKa on PubChem
**Several SMILES strings can represent the same molecule**

In [7]:
def get_cas(identifier, namespace=None, domain='compound', searchtype=None, **kwargs):
    """ Look up CAS number using Pubchem

    The identifier is resolved to Pubchem CIDs with ``pcp.get_cids``. For every CID,
    the synonym list is scanned and the first synonym shaped like a CAS number
    (2-7 digits, 2 digits, 1 digit, separated by hyphens) is reported.

    Args:
        identifier: The compound identifier to search for (name, SMILES, InChI, ...).
        namespace: How ``identifier`` should be interpreted. When omitted (or falsy),
            'name', 'smiles', 'inchi' and 'inchikey' are tried in that order and only
            the first CID of the first namespace that yields a hit is used.
        domain: Forwarded to ``pcp.get_cids``.
        searchtype: Forwarded to ``pcp.get_cids``.
        **kwargs: Any further keyword arguments, forwarded to ``pcp.get_cids``.

    Returns:
        A list of ``{'CID': ..., 'CASNumber': ...}`` dicts, one per CID that has a
        CAS-like synonym. CIDs without one are left out, so the list is empty when
        nothing was found.

    Raises:
        Whatever ``pcp.get_cids`` / ``pcp.get_synonyms`` raise. The only exception
        swallowed is ``pcp.BadRequestError`` while auto-detecting the namespace.

    Possible input for 'namespace':
    https://pubchempy.readthedocs.io/en/latest/guide/advanced.html#summary-of-possible-inputs
        <identifier> = list of cid, sid, aid, source, inchikey, listkey; string of name, smiles, xref, inchi, sdf;
        <domain> = substance | compound | assay

        compound domain
        <namespace> = cid | name | smiles | inchi | sdf | inchikey | <structure search> | <xref> | listkey | formula
        <operation> = record | property/[comma-separated list of property tags] | synonyms | sids | cids | aids | assaysummary | classification

        substance domain
        <namespace> = sid | sourceid/<source name> | sourceall/<source name> | name | <xref> | listkey
        <operation> = record | synonyms | sids | cids | aids | assaysummary | classification

        assay domain
        <namespace> = aid | listkey | type/<assay type> | sourceall/<source name>
        <assay type> = all | confirmatory | doseresponse | onhold | panel | rnai | screening | summary
        <operation> = record | aids | sids | cids | description | targets/{ProteinGI, ProteinName, GeneID, GeneSymbol} | doseresponse/sid

    """
    # These used to be accepted but silently ignored; pass them on to the lookup.
    lookup_options = dict(domain=domain, searchtype=searchtype, **kwargs)

    if namespace:
        cids = pcp.get_cids(identifier, namespace=namespace, **lookup_options)
    else:
        cids = []
        for candidate in ('name', 'smiles', 'inchi', 'inchikey'):
            try:
                lookup = pcp.get_cids(identifier, namespace=candidate, **lookup_options)
            except pcp.BadRequestError:
                # The identifier is not valid input for this namespace; that must
                # not prevent the remaining namespaces from being tried.
                continue
            if lookup:
                # Auto-detection keeps only the first CID of the first hit.
                cids = lookup[:1]
                break

    result = []
    for cid in cids:
        synonyms_lookup = pcp.get_synonyms(cid)
        synonyms = synonyms_lookup[0].get('Synonym', []) if synonyms_lookup else []
        for synonym in synonyms:
            cas_match = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
            if cas_match:
                # Only the first CAS-like synonym of each CID is reported.
                result.append({
                    'CID': cid,
                    'CASNumber': cas_match.group(),
                })
                break
    return result


# IMPORTANT: The SMILES in Example 1 is an enol tautomer, but when looked up on Pubchem
# it gives the ketone tautomer. This is a Pubchem problem and not a pubchempy library error.
enol_smiles = 'OC=1(N(N=C(C=1)C2(=CC=CC=C2))C)'
chlorothiophenol_inchi = 'InChI=1S/C6H5ClS/c7-5-2-1-3-6(8)4-5/h1-4,8H'
example_inchikey = 'FTUPSXHUCWFVNH-UHFFFAOYSA-N'

print('Example 1:')
print(get_cas(enol_smiles))    # return: [{'CID': 2763504, 'CASNumber': '34347-81-4'}]
print(get_cas(enol_smiles, 'smiles'))    # return: [{'CID': 2763504, 'CASNumber': '34347-81-4'}]

print('Example 2:')
print(get_cas('C1=CC(=CC=C1F)S'))    # return: [{'CID': 67789, 'CASNumber': '371-42-6'}]

print('Example 3:')
print(get_cas(chlorothiophenol_inchi))    # return [{'CID': 16257, 'CASNumber': '2037-31-2'}]
print(get_cas(chlorothiophenol_inchi, 'inchi'))
print(get_cas(chlorothiophenol_inchi, 'smiles'))    # wrong 'namespace' input but still works

print('Example 4:')
print(get_cas(example_inchikey))    # return [{'CID': 12421053, 'CASNumber': '64754-67-2'}]
# print(get_cas(example_inchikey, 'inchi'))    # wrong 'namespace' input -> Error
# print(get_cas(example_inchikey, 'smiles'))   # wrong 'namespace' input -> Error
print(get_cas(example_inchikey, 'inchikey'))    # correct 'namespace' input, same result as auto-detection

Example 1:
[{'CID': 2763504, 'CASNumber': '34347-81-4'}]
[{'CID': 2763504, 'CASNumber': '34347-81-4'}]
Example 2:
[{'CID': 67789, 'CASNumber': '371-42-6'}]
Example 3:
[{'CID': 16257, 'CASNumber': '2037-31-2'}]
[{'CID': 16257, 'CASNumber': '2037-31-2'}]
[{'CID': 16257, 'CASNumber': '2037-31-2'}]
Example 4:
[{'CID': 12421053, 'CASNumber': '64754-67-2'}]
[{'CID': 12421053, 'CASNumber': '64754-67-2'}]


In [8]:
# smiles_string = 'C1=CC(=CC=C1F)S'
smiles_string = 'OC=1(N(N=C(C=1)C2(=CC=CC=C2))C)'

# Look up CAS:
# get_cas() returns an empty list when nothing is found, so the result is
# checked before indexing instead of relying on `... or None`.
cas_hits = get_cas(smiles_string)
cas_nr = cas_hits[0]['CASNumber'] if cas_hits else None

print(pcp.get_cids(smiles_string, 'smiles'))
# Look up pKa using pka_lookup_pubchem():
# The CAS-based lookup is skipped when no CAS number was found.
pka_from_cas = pka_lookup_pubchem(cas_nr) if cas_nr else None
print(f'pKa from Pubchem using CAS: {pka_from_cas}')
print(f'pKa from Pubchem using smiles: {pka_lookup_pubchem(smiles_string)}')

[2763504]
pKa from Pubchem using CAS: None
pKa from Pubchem using smiles: None


### Using InChI to search for pKa from PubChem

**One compound can have more than one InChI representation (e.g. one per tautomer).**

**Do NOT use InChI to look up the CAS number from PubChem: PubChem does not distinguish tautomers.**

In the example below, the structure with InChI 'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,13H,1H3' is a tautomer of the one found in PubChem: 'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,11H,1H3'

In [9]:
# Alternative inputs to try:
# source_inchi = 'InChI=1S/C6H5FS/c7-5-1-3-6(8)4-2-5/h1-4,8H'    # from: 'DSSTox_DWPKAA_20181127.xlsx'
# source_inchi = 'InChI=1S/C6H5ClS/c7-5-2-1-3-6(8)4-5/h1-4,8H'    # from: 'DSSTox_DWPKAA_20181127.xlsx'
source_inchi = 'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,13H,1H3'    # from 'Opt3_acidic_tr.xlsx'

print(f'source inchi: {source_inchi}')

# Look up InChI using Pubchem
lookup_inchi = pcp.get_properties(['inchi', 'inchikey', 'canonical_smiles', 'isomeric_smiles'], source_inchi, 'inchi')
print(lookup_inchi)
# The lookup result may be an empty list, so it is checked before indexing.
if lookup_inchi:
    print(f"Is source and lookup inchi the same: {source_inchi == lookup_inchi[0].get('InChI')}")
else:
    print('No Pubchem record found for the source inchi')

print(f'pKa from Pubchem using InChI: {pka_lookup_pubchem(source_inchi)}')

source inchi: InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,13H,1H3
[{'CID': 2763504, 'CanonicalSMILES': 'CN1C(=O)C=C(N1)C2=CC=CC=C2', 'IsomericSMILES': 'CN1C(=O)C=C(N1)C2=CC=CC=C2', 'InChI': 'InChI=1S/C10H10N2O/c1-12-10(13)7-9(11-12)8-5-3-2-4-6-8/h2-7,11H,1H3', 'InChIKey': 'INOLYMVSZFIBGA-UHFFFAOYSA-N'}]
Is source and lookup inchi the same: False
pKa from Pubchem using InChI: None


### Using InChIKey to search for pKa from PubChem
**Unlike an InChI, an InChIKey is not guaranteed to be unique: two different structures can share the same key, albeit very rarely**: https://en.wikipedia.org/wiki/International_Chemical_Identifier


In [10]:
# Alternative inputs to try:
# source_inchikey = 'OKIHXNKYYGUVTE-UHFFFAOYSA-N'    # from: 'DSSTox_DWPKAA_20181127.xlsx'
# source_inchikey = 'NRFHROYGWWFLKU-UHFFFAOYSA-N'    # from 'Opt3_acidic_tr.xlsx'  >>> empty result from pubchem
# source_inchikey = 'CQJDYPZUDYXHLM-UHFFFAOYSA-N'    # from: 'DSSTox_DWPKAA_20181127.xlsx'
source_inchikey = 'FTUPSXHUCWFVNH-UHFFFAOYSA-N'    # from 'Opt3_acidic_tr.xlsx'

print(f'source inchikey: {source_inchikey}')

# Look up InChIkey using Pubchem
lookup_inchikey = pcp.get_properties('inchikey', source_inchikey, 'inchikey')
print(lookup_inchikey)
# Some keys give an empty result from Pubchem, so check before indexing.
if lookup_inchikey:
    print(f"Is source and lookup inchikey the same: {source_inchikey == lookup_inchikey[0].get('InChIKey')}")
else:
    print('No Pubchem record found for the source inchikey')


# Look up CAS:
# get_cas() returns an empty list when nothing is found, so the result is
# checked before indexing instead of relying on `... or None`.
cas_hits = get_cas(source_inchikey)
cas_nr = cas_hits[0]['CASNumber'] if cas_hits else None
print(f'CAS: {cas_nr}')

# Look up pKa using pka_lookup_pubchem():
# The CAS-based lookup is skipped when no CAS number was found.
pka_from_cas = pka_lookup_pubchem(cas_nr) if cas_nr else None
print(f'pKa from Pubchem using CAS: {pka_from_cas}')
print(f'pKa from Pubchem using InChIKey: {pka_lookup_pubchem(source_inchikey)}')

source inchikey: FTUPSXHUCWFVNH-UHFFFAOYSA-N
[{'CID': 12421053, 'InChIKey': 'FTUPSXHUCWFVNH-UHFFFAOYSA-N'}]
Is source and lookup inchikey the same: True
CAS: 64754-67-2
pKa from Pubchem using CAS: None
pKa from Pubchem using InChIKey: None
