In [63]:
import pubchempy as pbc
import pandas as pd
import urllib3



In [7]:
# Initialize urllib3

http = urllib3.PoolManager()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [98]:
# Get Inchi from cactus

def InChi_from_cactus(identifier):
    url = (f'https://cactus.nci.nih.gov/chemical/structure/{identifier}/inchi')
    
    try:
        response = http.request('GET', url)
    except:
        return False
    if "Bad" in str(response.data):
        return False
    if "found" in str(rlesponse.data):
        return False
    inchi = str(response.data.decode("UTF-8"))
    if not 'InChI' in inchi:
        return False
    return inchi

In [99]:
# test
inchi = InChi_from_cactus('c1ccccc1')
inchi

'InChI=1/C6H6/c1-2-4-6-5-3-1/h1-6H'

In [100]:
# Get Inchi from drugbankID

def inchi_from_drugbank(identifier):
    # Get InChIs from drugbank

    url = (f'https://www.drugbank.ca/structures/small_molecule_drugs/'
           f'{identifier}.inchi') 
    try:
        response = http.request('GET', url)
    except:
        return False

    if "Bad" in str(response.data):
        return False
    if "found" in str(response.data):
        return False
    inchi = str(response.data.decode("UTF-8"))
    if not 'InChI' in inchi:
        return False
    return inchi

In [101]:
# Test

inchi = inchi_from_drugbank('DB11558')
inchi

'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3'

In [125]:
def inchi_from_pubchem(identifier):
    try:
        comp =  pbc.get_compounds(identifier, namespace='name')
        inchi = comp[0].inchi
    except Exception as e:
        return False
    if not 'InChI' in inchi:
        return False
    return inchi
    

In [126]:
# Test

inchi = inchi_from_pubchem('toluene')
inchi

'InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3'

In [127]:
# Main function. Return a DataFrame with found structures
def add_inchis(frame):
    names = []
    CAS = []
    drugbank = []
    inchis = []
    if 'name' in frame.columns:
        names = frame['name']

    if 'CAS' in frame.columns:
        CAS = frame['CAS']

    if 'DrugbankID' in frame.columns:
        drugbank = frame['DrugbankID']
 
    for i in range(len(frame)):
        inchi = False
        if len(CAS) > 0:
            inchi = inchi_from_cactus(CAS[i])
        if not inchi and len(names) > 0:
            inchi = inchi_from_cactus(names[i])
            if not inchi:
                inchi = inchi_from_pubchem(names[i])
        if not inchi and len(drugbank) > 0:
            inchi = inchi_from_drugbank(drugbank[i])
        inchis.append(inchi)
    frame['InChI'] = inchis
    return frame
    

In [128]:
frame = pd.read_csv("test_data.csv", sep='\t')
frame.shape

(1036, 3)

In [129]:
frame = get_inchis(frame)

Molecule: 0
InChI=1/C5H4N4S/c10-5-3-4(7-1-6-3)8-2-9-5/h1-2H,(H2,6,7,8,9,10)/f/h6,8H
Molecule 0 finished.

Molecule: 1
InChI=1/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)/f/h9H
Molecule 1 finished.

Molecule: 2
Trying name
InChI=1/C9H7N7O2S/c1-15-4-14-7(16(17)18)9(15)19-8-5-6(11-2-10-5)12-3-13-8/h2-4H,1H3,(H,10,11,12,13)/f/h10H
Molecule 2 finished.

Molecule: 3
InChI=1/C16H19ClN2/c1-19(2)12-10-15(16-5-3-4-11-18-16)13-6-8-14(17)9-7-13/h3-9,11,15H,10,12H2,1-2H3
Molecule 3 finished.

Molecule: 4
InChI=1/C12H15ClO3/c1-4-15-11(14)12(2,3)16-10-7-5-9(13)6-8-10/h5-8H,4H2,1-3H3
Molecule 4 finished.

Molecule: 5
InChI=1/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)10-4-1-7-13-14/h1-7H2,(H,10,12)/f/h10H
Molecule 5 finished.

Molecule: 6
InChI=1/C8H11NO2/c9-4-3-6-1-2-7(10)8(11)5-6/h1-2,5,10-11H,3-4,9H2
Molecule 6 finished.

Molecule: 7
Trying name
InChI=1/C18H24O2/c1-18-9-8-14-13-5-3-12(19)10-11(13)2-4-15(14)16(18)6-7-17(18)20/h3,5,10,14-17,19-20H,2,4,6-9H2,1H3/t14-,15-,16+,17+,18+/m1/s1
Molecu

In [132]:
frame[frame['InChI']==False]

Unnamed: 0,name,CAS,DrugbankID,InChI
241,sacrosidase,EC 3.2.1.26,,False
243,nesiritide,124584-08-3,DB04899,False
245,gemtuzumab,220578-59-6,DB00056,False
336,ferumoxytol,1317-61-9,,False
355,pancrelipase,53608-75-6,DB00085,False
383,hyaluronidase,37326-33-3,,False
393,thyrotropin alfa,9002-71-5,,False
445,somatropin recombinant,12629-01-5,,False
471,pegademase bovine,EC 3.5.4.-,,False
482,dalteparin sodium,2608206,DB06779,False
