<h1>Scraping Pubchem</h1>

This Colab notebook demonstrates how to look up compounds by name through PubChem's PUG REST service and extract each compound's molecular weight, SMILES, and InChI from the XML response.

# Imports


In [34]:
from bs4 import BeautifulSoup
from collections import defaultdict
import json
import pandas as pd
import pprint
import re
import requests
import urllib

# Scraping

In [30]:
# Compound names to look up on PubChem.
compounds = ['AST-487', 'Crizotinib', 'A-674563']

# XML tag names used to navigate a PubChem record:
#   label_parent    - tag whose text is the property label (e.g. 'SMILES')
#   infodata        - enclosing tag that groups one label with its value
#   infordata_value - tag holding the value as a string
# (The spelling `infordata_value` is kept as-is because the scraping cell
# refers to it by this name.)
label_parent, infodata, infordata_value = 'PC-Urn_label', 'PC-InfoData', 'PC-InfoData_value_sval'

# Property labels to extract for every compound.
# A tuple rather than a set: iterating a set of strings can yield a different
# order on every interpreter run (string hashing is randomized), which made the
# key order of the results, and therefore the printed JSON, vary between runs.
string_elements = ('Molecular Weight', 'SMILES', 'InChI')

In [32]:
def fetch_compound_properties(compound, properties, timeout=30):
  """Look up one compound by name on PubChem and extract labelled string values.

  Args:
    compound: Compound name; it is percent-encoded before being placed in the
      request URL.
    properties: Iterable of property labels to extract (the text of the
      `label_parent` tag, e.g. 'SMILES').
    timeout: Seconds to wait for the HTTP response. Without a timeout
      `requests` may block indefinitely.

  Returns:
    Dict mapping each requested label to its value as a plain `str`.

  Raises:
    requests.HTTPError: PubChem answered with a 4xx/5xx status.
    ValueError: The record contains no string value for a requested label.
  """
  # safe='' also encodes '/', which would otherwise be read as a path separator.
  safe_compound_name = urllib.parse.quote(compound, safe='')

  # request url to pubchem
  response = requests.get(
      f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{safe_compound_name}/xml',
      timeout=timeout)
  # Fail here with the status code and URL instead of parsing an error document.
  response.raise_for_status()
  record = BeautifulSoup(response.content, 'xml')

  values = {}
  for element in properties:
    # find() returns only the first tag whose text equals the label; any later
    # entries carrying the same label are ignored.
    tag = record.find(name=label_parent, string=element)
    info = tag.find_parent(infodata) if tag is not None else None
    value_tag = info.find(infordata_value) if info is not None else None
    if value_tag is None or value_tag.string is None:
      raise ValueError(
          f'PubChem record for {compound!r} has no string value for {element!r}')
    # Copy into a plain str so the result does not keep the parse tree alive.
    values[element] = str(value_tag.string)
  return values


# Filled one compound at a time so results fetched before a failure stay available.
res = {}
for compound in compounds:
  res[compound] = fetch_compound_properties(compound, string_elements)

## Pretty-print the results as indented JSON

In [35]:
# Serialize the collected results with two-space indentation so the
# compound -> property nesting is easy to read, then write it to the cell output.
res_pretty = json.dumps(res, indent=2)
print(res_pretty)

{
  "AST-487": {
    "SMILES": "CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)NC3=CC=C(C=C3)OC4=NC=NC(=C4)NC)C(F)(F)F",
    "Molecular Weight": "529.6",
    "InChI": "InChI=1S/C26H30F3N7O2/c1-3-35-10-12-36(13-11-35)16-18-4-5-20(14-22(18)26(27,28)29)34-25(37)33-19-6-8-21(9-7-19)38-24-15-23(30-2)31-17-32-24/h4-9,14-15,17H,3,10-13,16H2,1-2H3,(H,30,31,32)(H2,33,34,37)"
  },
  "Crizotinib": {
    "SMILES": "CC(C1=C(C=CC(=C1Cl)F)Cl)OC2=C(N=CC(=C2)C3=CN(N=C3)C4CCNCC4)N",
    "Molecular Weight": "450.3",
    "InChI": "InChI=1S/C21H22Cl2FN5O/c1-12(19-16(22)2-3-17(24)20(19)23)30-18-8-13(9-27-21(18)25)14-10-28-29(11-14)15-4-6-26-7-5-15/h2-3,8-12,15,26H,4-7H2,1H3,(H2,25,27)/t12-/m1/s1"
  },
  "A-674563": {
    "SMILES": "CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC=C4)N",
    "Molecular Weight": "358.4",
    "InChI": "InChI=1S/C22H22N4O/c1-15-21-11-17(7-8-22(21)26-25-15)18-10-20(13-24-12-18)27-14-19(23)9-16-5-3-2-4-6-16/h2-8,10-13,19H,9,14,23H2,1H3,(H,25,26)/t19-/m0/s1"
  }
}
