## How to parse websites and navigate the DOM using BeautifulSoup

In [2]:
import requests
from bs4 import BeautifulSoup

# Fetch the sample page and parse it with the lxml-backed parser.
# The markup is stored in `page_html` rather than `html` so the name does not
# collide with the `from lxml import html` module import used later in this
# notebook (re-running cells out of order would otherwise break).
page_html = requests.get("http://localhost:8080/planets.html").text
soup = BeautifulSoup(page_html, "lxml")

In [3]:
# Preview the first 1000 characters of the parsed document's markup.
str(soup)[:1000]

'<html>\n<head>\n</head>\n<body>\n<div id="planets">\n<h1>Planetary data</h1>\n<div id="content">Here are some interesting facts about the planets in our solar system</div>\n<p></p>\n<table border="1" id="planetsTable">\n<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n                    Name\r\n                </th>\n<th>\r\n                    Mass (10^24kg)\r\n                </th>\n<th>\r\n                    Diameter (km)\r\n                </th>\n<th>\r\n                    How it got its Name\r\n                </th>\n<th>\r\n                    More Info\r\n                </th>\n</tr>\n<tr class="planet" id="planet1" name="Mercury">\n<td>\n<img src="img/mercury-150x150.png"/>\n</td>\n<td>\r\n                    Mercury\r\n                </td>\n<td>\r\n                    0.330\r\n                </td>\n<td>\r\n                    4879\r\n                </td>\n<td>Named Mercurius by the Romans because it appears to move so swiftly.</td>\n<td>\n<a href="https://en.wikipedia.org/w

In [5]:
# Tags can be walked by name as attributes: html -> body -> div -> table.
# Only the first 200 characters of the table's markup are shown.
str(soup.html.body.div.table)[:200]

'<table border="1" id="planetsTable">\n<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n                    Name\r\n                </th>\n<th>\r\n                    Mass (10^24kg)\r\n                </th>\n<th>\r\n     '

In [7]:
# `.tr` gives the first <tr> under the table, which here is the header row.
soup.html.body.div.table.tr

<tr id="planetHeader">
<th>
</th>
<th>
                    Name
                </th>
<th>
                    Mass (10^24kg)
                </th>
<th>
                    Diameter (km)
                </th>
<th>
                    How it got its Name
                </th>
<th>
                    More Info
                </th>
</tr>

In [9]:
# `.children` is an iterator rather than a list, so displaying it only shows
# the iterator object, not the child nodes themselves.
soup.html.body.div.table.children

<list_iterator at 0x1dfc2ef72c8>

In [11]:
# Walk the iterator and truncate each child to 45 characters. The whitespace
# between rows appears as '\n' text nodes alongside the <tr> tags.
[str(c)[:45] for c in soup.html.body.div.table.children]

['\n',
 '<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n     ',
 '\n',
 '<tr class="planet" id="planet1" name="Mercury',
 '\n',
 '<tr class="planet" id="planet2" name="Venus">',
 '\n',
 '<tr class="planet" id="planet3" name="Earth">',
 '\n',
 '<tr class="planet" id="planet4" name="Mars">\n',
 '\n',
 '<tr class="planet" id="planet5" name="Jupiter',
 '\n',
 '<tr class="planet" id="planet6" name="Saturn"',
 '\n',
 '<tr class="planet" id="planet7" name="Uranus"',
 '\n',
 '<tr class="planet" id="planet8" name="Neptune',
 '\n',
 '<tr class="planet" id="planet9" name="Pluto">',
 '\n']

In [13]:
# `.parent` climbs from the first row back up to its enclosing <table>.
str(soup.html.body.div.table.tr.parent)[:200]

'<table border="1" id="planetsTable">\n<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n                    Name\r\n                </th>\n<th>\r\n                    Mass (10^24kg)\r\n                </th>\n<th>\r\n     '

## Beautiful Soup's find methods

In [14]:
import requests
from bs4 import BeautifulSoup
# Fetch the sample page and parse it with the lxml-backed parser.
# The markup is stored in `page_html` rather than `html` so the name does not
# collide with the `from lxml import html` module import used later in this
# notebook.
page_html = requests.get("http://localhost:8080/planets.html").text
soup = BeautifulSoup(page_html, "lxml")

In [16]:
# find() returns the first matching tag; here that is the planets table.
table = soup.find("table")
str(table)[:100]


'<table border="1" id="planetsTable">\n<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n                    Nam'

In [20]:
# Preview the first 50 characters of every <tr> in the table.
# find_all() is the current Beautiful Soup 4 spelling; findAll() is a legacy
# alias retained only for backward compatibility.
[str(tr)[:50] for tr in table.find_all("tr")]

['<tr id="planetHeader">\n<th>\n</th>\n<th>\r\n          ',
 '<tr class="planet" id="planet1" name="Mercury">\n<t',
 '<tr class="planet" id="planet2" name="Venus">\n<td>',
 '<tr class="planet" id="planet3" name="Earth">\n<td>',
 '<tr class="planet" id="planet4" name="Mars">\n<td>\n',
 '<tr class="planet" id="planet5" name="Jupiter">\n<t',
 '<tr class="planet" id="planet6" name="Saturn">\n<td',
 '<tr class="planet" id="planet7" name="Uranus">\n<td',
 '<tr class="planet" id="planet8" name="Neptune">\n<t',
 '<tr class="planet" id="planet9" name="Pluto">\n<td>']

In [22]:
# Look up a single row by its id attribute, using the keyword form of the
# attribute filter.
table.find("tr", id="planet3")

<tr class="planet" id="planet3" name="Earth">
<td>
<img src="img/earth-150x150.png"/>
</td>
<td>
                    Earth
                </td>
<td>
                    5.97
                </td>
<td>
                    12756
                </td>
<td>
                    The name Earth comes from the Indo-European base 'er,'which produced the Germanic noun 'ertho,' and ultimately German 'erde,'
                    Dutch 'aarde,' Scandinavian 'jord,' and English 'earth.' Related forms include Greek 'eraze,' meaning
                    'on the ground,' and Welsh 'erw,' meaning 'a piece of land.'
                </td>
<td>
<a href="https://en.wikipedia.org/wiki/Earth">Wikipedia</a>
</td>
</tr>

In [24]:
# Build a {planet name: mass} mapping from the data rows.
# Only rows with class "planet" are considered, which leaves out the header.
# Within a row, cell 1 holds the name and cell 2 the mass (10^24 kg); the
# values are kept as the stripped strings found in the page.
items = {}
planet_rows = table.find_all("tr", {"class": "planet"})

for row in planet_rows:
    cells = row.find_all("td")
    items[cells[1].text.strip()] = cells[2].text.strip()

items

{'Mercury': '0.330',
 'Venus': '4.87',
 'Earth': '5.97',
 'Mars': '0.642',
 'Jupiter': '1898',
 'Saturn': '568',
 'Uranus': '86.8',
 'Neptune': '102',
 'Pluto': '0.0146'}

## Querying the DOM with XPath and lxml

In [26]:
from lxml import html
import requests
# Download the page markup as text; the next cell parses it with lxml.
page_html = requests.get("http://localhost:8080/planets.html").text

In [28]:
# Parse the markup into an lxml element tree that can be queried with XPath.
tree = html.fromstring(page_html)

In [30]:
# Absolute XPath: every <tr> that is a child of a <table> inside a <div>
# directly under <body>. The elements display as opaque reprs.
list(tree.xpath("/html/body/div/table/tr"))

[<Element tr at 0x1dfc2f3be58>,
 <Element tr at 0x1dfc2f3bea8>,
 <Element tr at 0x1dfc2f3bef8>,
 <Element tr at 0x1dfc2f3bf48>,
 <Element tr at 0x1dfc2f3bf98>,
 <Element tr at 0x1dfc2f43048>,
 <Element tr at 0x1dfc2f43098>,
 <Element tr at 0x1dfc2f430e8>,
 <Element tr at 0x1dfc2f43138>,
 <Element tr at 0x1dfc2f43188>,
 <Element tr at 0x1dfc2f431d8>]

In [33]:
from lxml import etree

# Element reprs are opaque, so serialise each row and keep its first 50 bytes.
# The result includes the footer table's row as well as the planet rows.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div/table/tr")]

[b'<tr id="planetHeader">&#13;\n                <th>&#',
 b'<tr id="planet1" class="planet" name="Mercury">&#1',
 b'<tr id="planet2" class="planet" name="Venus">&#13;',
 b'<tr id="planet3" class="planet" name="Earth">&#13;',
 b'<tr id="planet4" class="planet" name="Mars">&#13;\n',
 b'<tr id="planet5" class="planet" name="Jupiter">&#1',
 b'<tr id="planet6" class="planet" name="Saturn">&#13',
 b'<tr id="planet7" class="planet" name="Uranus">&#13',
 b'<tr id="planet8" class="planet" name="Neptune">&#1',
 b'<tr id="planet9" class="planet" name="Pluto">&#13;',
 b'<tr id="footerRow">&#13;\n                <td>&#13;']

In [34]:
# An attribute predicate keeps only the rows whose class attribute is "planet",
# dropping the header and footer rows.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div/table/tr[@class='planet']")]

[b'<tr id="planet1" class="planet" name="Mercury">&#1',
 b'<tr id="planet2" class="planet" name="Venus">&#13;',
 b'<tr id="planet3" class="planet" name="Earth">&#13;',
 b'<tr id="planet4" class="planet" name="Mars">&#13;\n',
 b'<tr id="planet5" class="planet" name="Jupiter">&#1',
 b'<tr id="planet6" class="planet" name="Saturn">&#13',
 b'<tr id="planet7" class="planet" name="Uranus">&#13',
 b'<tr id="planet8" class="planet" name="Neptune">&#1',
 b'<tr id="planet9" class="planet" name="Pluto">&#13;']

In [35]:
# div[1] restricts the match to the first <div> under <body> (XPath positions
# start at 1), i.e. the one containing the planets table.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div[1]/table/tr")]

[b'<tr id="planetHeader">&#13;\n                <th>&#',
 b'<tr id="planet1" class="planet" name="Mercury">&#1',
 b'<tr id="planet2" class="planet" name="Venus">&#13;',
 b'<tr id="planet3" class="planet" name="Earth">&#13;',
 b'<tr id="planet4" class="planet" name="Mars">&#13;\n',
 b'<tr id="planet5" class="planet" name="Jupiter">&#1',
 b'<tr id="planet6" class="planet" name="Saturn">&#13',
 b'<tr id="planet7" class="planet" name="Uranus">&#13',
 b'<tr id="planet8" class="planet" name="Neptune">&#1',
 b'<tr id="planet9" class="planet" name="Pluto">&#13;']

In [36]:
# div[2] selects the second <div>, which contributes only the footer row.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div[2]/table/tr")]

[b'<tr id="footerRow">&#13;\n                <td>&#13;']

In [37]:
# Select the <div> by its id attribute instead of by its position.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div[@id='planets']/table/tr")]

[b'<tr id="planetHeader">&#13;\n                <th>&#',
 b'<tr id="planet1" class="planet" name="Mercury">&#1',
 b'<tr id="planet2" class="planet" name="Venus">&#13;',
 b'<tr id="planet3" class="planet" name="Earth">&#13;',
 b'<tr id="planet4" class="planet" name="Mars">&#13;\n',
 b'<tr id="planet5" class="planet" name="Jupiter">&#1',
 b'<tr id="planet6" class="planet" name="Saturn">&#13',
 b'<tr id="planet7" class="planet" name="Uranus">&#13',
 b'<tr id="planet8" class="planet" name="Neptune">&#1',
 b'<tr id="planet9" class="planet" name="Pluto">&#13;']

In [38]:
# position() > 1 skips the first <tr> (the header row), leaving the planet rows.
[etree.tostring(tr)[:50] for tr in tree.xpath("/html/body/div[@id='planets']/table/tr[position() > 1]")]

[b'<tr id="planet1" class="planet" name="Mercury">&#1',
 b'<tr id="planet2" class="planet" name="Venus">&#13;',
 b'<tr id="planet3" class="planet" name="Earth">&#13;',
 b'<tr id="planet4" class="planet" name="Mars">&#13;\n',
 b'<tr id="planet5" class="planet" name="Jupiter">&#1',
 b'<tr id="planet6" class="planet" name="Saturn">&#13',
 b'<tr id="planet7" class="planet" name="Uranus">&#13',
 b'<tr id="planet8" class="planet" name="Neptune">&#1',
 b'<tr id="planet9" class="planet" name="Pluto">&#13;']

In [39]:
# parent::* steps from every matched row up to its parent element, whatever
# its tag. Many rows share a parent, but each parent is returned only once.
[etree.tostring(parent)[:50] for parent in tree.xpath("/html/body/div/table/tr/parent::*")]

[b'<table id="planetsTable" border="1">&#13;\n        ',
 b'<table id="footerTable">&#13;\n            <tr id="']

In [40]:
# parent::table is the same step, restricted to parents that are <table> tags.
[etree.tostring(parent)[:50] for parent in tree.xpath("/html/body/div/table/tr/parent::table")]

[b'<table id="planetsTable" border="1">&#13;\n        ',
 b'<table id="footerTable">&#13;\n            <tr id="']

In [41]:
# A predicate on the parent step narrows the result to the table whose id is
# "footerTable".
[etree.tostring(parent)[:50] for parent in tree.xpath("/html/body/div/table/tr/parent::table[@id='footerTable']")]

[b'<table id="footerTable">&#13;\n            <tr id="']

In [42]:
# ".." is shorthand for the parent of the current node ("." is the current
# node itself), so this returns the same two tables as parent::*.
[etree.tostring(parent)[:50] for parent in tree.xpath("/html/body/div/table/tr/..")]

[b'<table id="planetsTable" border="1">&#13;\n        ',
 b'<table id="footerTable">&#13;\n            <tr id="']

In [44]:
# Reading the tail of this XPath, /td[3]/text()[1], from left to right:
#   td[3]   - the third <td> cell of the Earth row,
#   text()  - the text nodes inside that cell,
#   [1]     - the first of those text nodes, which holds the mass.
# xpath() returns a list, hence the [0]; strip() removes the surrounding
# whitespace.

mass = tree.xpath("/html/body/div[1]/table/tr[@name='Earth']/td[3]/text()[1]")[0].strip()
mass

'5.97'

## Querying data with XPath and CSS selectors

In [45]:
# Let's start examining CSS selectors using the same start up code we used in the last recipe
from lxml import html
import requests
# Same set-up as the XPath section: fetch the page and parse it with lxml.
page_html = requests.get("http://localhost:8080/planets.html").text
tree = html.fromstring(page_html)

In [46]:
# 'tr.planet' matches every <tr> with class "planet". Pair each matched row
# with the result of reading its name attribute via XPath (a one-item list).
[(row, row.xpath("@name")) for row in tree.cssselect('tr.planet')]

[(<Element tr at 0x1dfc2f4edb8>, ['Mercury']),
 (<Element tr at 0x1dfc2f57228>, ['Venus']),
 (<Element tr at 0x1dfc2f575e8>, ['Earth']),
 (<Element tr at 0x1dfc2f57408>, ['Mars']),
 (<Element tr at 0x1dfc2f64c78>, ['Jupiter']),
 (<Element tr at 0x1dfc2f64cc8>, ['Saturn']),
 (<Element tr at 0x1dfc2f64d18>, ['Uranus']),
 (<Element tr at 0x1dfc2f64d68>, ['Neptune']),
 (<Element tr at 0x1dfc2f64db8>, ['Pluto'])]

In [48]:
# Data for the Earth can be found in several ways.
# The following gets the row based on id. cssselect() returns a list, so the
# row itself is tr[0]; the XPath then reads the text of its second cell.
tr = tree.cssselect("tr#planet3")
tr[0], tr[0].xpath("./td[2]/text()")[0].strip()


(<Element tr at 0x1dfc2f575e8>, 'Earth')

In [49]:
# The following uses an attribute with a specific value.
# "td[2]/text()" is a relative XPath evaluated from the matched row; it reads
# the text of the row's second cell.
tr = tree.cssselect("tr[name='Pluto']")
tr[0], tr[0].xpath("td[2]/text()")[0].strip()

(<Element tr at 0x1dfc2f64db8>, 'Pluto')

Because CSS selectors are translated into XPath under the covers, using them carries some overhead compared with writing the XPath directly. The difference is almost always negligible, however, so when a CSS selector expresses the query more simply it is perfectly reasonable to use cssselect.

A full description of CSS selectors can be found at: https://www.w3.org/TR/2011/REC-css3-selectors-20110929/

## Using Scrapy selectors

In [50]:
from scrapy.selector import Selector
import requests

In [51]:
# Fetch a live page with requests; the results shown in the following cells
# depend on what the site is serving at the time the cell is run.
response = requests.get("http://stackoverflow.com/questions")

In [53]:
# `response` is a requests.Response, not a Scrapy response object, so pass the
# decoded HTML through Selector's `text=` argument instead of relying on the
# positional `response` parameter accepting a look-alike object.
selector = Selector(text=response.text)
selector

<Selector xpath=None data='<html class="html__responsive">\r\n\r\n  ...'>

In [55]:
# Select the <h3> heading inside every <div> whose class attribute is exactly
# "summary", then display the first five matches.
summaries = selector.xpath('//div[@class="summary"]/h3')
summaries[:5]

[<Selector xpath='//div[@class="summary"]/h3' data='<h3><a href="/questions/60679040/inte...'>,
 <Selector xpath='//div[@class="summary"]/h3' data='<h3><a href="/questions/60679039/expo...'>,
 <Selector xpath='//div[@class="summary"]/h3' data='<h3><a href="/questions/60679035/auto...'>,
 <Selector xpath='//div[@class="summary"]/h3' data='<h3><a href="/questions/60679031/pass...'>,
 <Selector xpath='//div[@class="summary"]/h3' data='<h3><a href="/questions/60679029/cart...'>]

In [56]:
# Text of the question link under each heading. Calling extract() on the
# SelectorList returns the strings for all matches at once; show the first ten.
summaries.xpath('a[@class="question-hyperlink"]/text()').extract()[:10]

['Interface to extend or function to create similar types',
 'Expose Objc private headers to unit tests written in Swift',
 'Autofill blank required fields in Django admin before save and before validation',
 'Passing a templated class to the constructor of another templated class with default values for template arguments',
 "Cartopy: Can't plot vector field with uncertainties (and related questions)",
 'Code expression like {{ $title }} cause blank page in Laravel',
 'Python with same class name in same file?',
 'Recaptcha Handling Throwing Exception Inside Promise with catch',
 'copy selected file from an input on electron app form',
 'Stacking multiple columns in R']

To learn more about Scrapy Selectors see: https://doc.scrapy.org/en/latest/topics/selectors.html.

## Loading data in unicode / UTF-8

In [58]:
# Read the page as raw bytes. Note how the Cyrillic characters are read in as
# multi-byte codes using \ notation, such as \xd0\x89.
# The context manager closes the HTTP response once the body has been read,
# instead of leaving the connection open.

from urllib.request import urlopen
with urlopen("http://localhost:8080/unicode.html") as page:
    content = page.read()
content[840:1280]


b'><strong>Cyrillic</strong> &nbsp; U+0400 \xe2\x80\x93 U+04FF &nbsp; (1024\xe2\x80\x931279)</p>\n    <table class="unicode">\n        <tbody>\n            <tr valign="top">\n                <td width="50">&nbsp;</td>\n                <td class="b" width="50">\xd0\x89</td>\n                <td class="b" width="50">\xd0\xa9</td>\n                <td class="b" width="50">\xd1\x89</td>\n                <td class="b" width="50">\xd3\x83</td>\n            </tr>\n        </tbody>\n    </table>\n\n '

In [59]:
# To rectify this, decode the bytes as UTF-8; the resulting str shows the
# actual characters instead of their byte escapes.

content.decode("utf-8")[837:1270]

'<strong>Cyrillic</strong> &nbsp; U+0400 – U+04FF &nbsp; (1024–1279)</p>\n    <table class="unicode">\n        <tbody>\n            <tr valign="top">\n                <td width="50">&nbsp;</td>\n                <td class="b" width="50">Љ</td>\n                <td class="b" width="50">Щ</td>\n                <td class="b" width="50">щ</td>\n                <td class="b" width="50">Ӄ</td>\n            </tr>\n        </tbody>\n    </table>\n\n   '

In [61]:
# Fetch the same page with requests and look at its `.text`.
# Note that `response` here holds the decoded text (a str), not the Response
# object. In the output each byte of a multi-byte UTF-8 sequence shows up as a
# separate character, so the Cyrillic letters are garbled.
# NOTE(review): presumably requests fell back to a single-byte encoding because
# the server declares no charset - confirm via the Response's `encoding`.
import requests
response = requests.get("http://localhost:8080/unicode.html").text
response[837:1270]

' <p><strong>Cyrillic</strong> &nbsp; U+0400 â\x80\x93 U+04FF &nbsp; (1024â\x80\x931279)</p>\n    <table class="unicode">\n        <tbody>\n            <tr valign="top">\n                <td width="50">&nbsp;</td>\n                <td class="b" width="50">Ð\x89</td>\n                <td class="b" width="50">Ð©</td>\n                <td class="b" width="50">Ñ\x89</td>\n                <td class="b" width="50">Ó\x83</td>\n            </tr>\n        </tbody>\n    <'