In [13]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import lxml
import re
import html5lib
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [21]:
from urllib.error import URLError


try:
    html = urlopen('https://pythonscraping.com/pages/page1.html')
    # test and see if an expected error occurs here
except HTTPError as e:
    print(e)
    # handle the error if it arises, this is for HTTP errors try http:// instead of https://
except URLError as e:
    print(e)
    # handles URLs which cannot be found
else:
    print("else")
    # if there is no exception execute this block
finally:
    print("finally")
    # run this block regardless of if there was an error or not

print("outside the try-catch block")

# modifies the html variable, only use when not using BeautifulSoup
# print(html.read())

# bs = BeautifulSoup(html, 'html.parser') # running this more than once results in errors
bs = BeautifulSoup(html, 'lxml') # faster, more robust parser
# bs = BeautifulSoup(html, "html5lib") # slowest but is best for handling bad HTML

<urlopen error [Errno -2] Name or service not known>
finally
outside the try-catch block


In [9]:
print(bs.h1) # finds first instance of the h1 tag

# all equivalent for this example
print(bs.html.body.h1)
print(bs.body.h1)
print(bs.html.h1)

<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>


In [26]:
try:
    badContent = bs.nonExistingTag.anotherTag
except AttributeError as e:
    print('Tag was not found')

Tag was not found


In [32]:
# we can make things slightly easier and less cumbersome

def getTitle(url):
    # check to make sure url opened properly
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None

    # check to make sure html rendered properly and has the tag we want
    try:
        bs = BeautifulSoup(html, 'lxml')
        title = bs.body.h1
    except AttributeError as e:
        return None

    return title

title = getTitle('http://www.pythonscraping.com/pages/page1.html')
title1 = getTitle('https://www.pythonscraping.com/pages/page1.html')

if title == None:
    print('Title could not be found')
else:
    print(title)

if title1 == None:
    print('Title could not be found')
else:
    print(title1)

Title could not be found
<h1>An Interesting Title</h1>


## Chapter 2

In [35]:
html = urlopen('https://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.read(), 'lxml')

In [36]:
# find_all('tag_name', 'tag_attributes')
nameList = bs.find_all('span', {'class':'green'})

for name in nameList:
    print(name.get_text()) # separates out the content from the tages themselves

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


### find and find_all

* `python .find_all(['h1','h2','h3','h4','h5','h6'])`. You can look up multiple tags.
* `python .find_all('span', {'class':{'green', 'red'}})`. The tag attributes are a Python dictionary and matches any tag with any one of the listed attributes.
* The `recursive` argument is set to True by default, and it goes down to the bottom of the tree to find every tag. You can set this to False to speed up processing.
* The `text` argument matches based on the text content of the tags and not on the properties of those tags. See below.
* `limit` is only used in `find_all()`. `find()` is equivalent to `find_all()` with a limit of 1.
* The `keyword` argument allows one to select tags that contain a particular set of attributes. See below. The keyword argument is technically redundant though. They recommend using the tag attributes specific dictionary arguments instead. Especially since `class` is a protected keyword in Python and you must use `class_` in conjunction with this argument. It is kind of clunky. The keyword argument does operate based on `and` logic rather than `or` logic (which is what the dictionary-based approach does). However, you can use regular expressions to do the same thing.

In [38]:
nameList = bs.find_all(text='the prince')
print(nameList)

title = bs.find_all(id = 'title', class_ = 'text') # find all tags with an id of title and a value of text for the class attribute.
print(title)

['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']
[]


In [40]:
html = urlopen('https://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [None]:
# prints only the children (descendants I believe is recursive)
for child in bs.find('table',{'id':'giftList'}).children:
    print(child)

In [41]:
# print subsequent siblings (not including itself) in the tree structure
# you could also use previous_siblings
for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [42]:
# Sometimes you get gnarly HTML code and need to use the parent property
print(bs.find('img', {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())


$15.00



In [45]:
# we can use Regular Expressions to find all images of gifts
images = bs.find_all('img', {'src':re.compile('..\/img\/gifts/img.*.jpg')})

for image in images:
    print(image['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


Please note that tag objects can have their attributes accessed like so (e.g., to extract the URL from a website): `myTag.attrs`. This important for the discussion below.

BeautifulSoup also allows for the use of lambda functions or functions which can take other functions as arguments. BeautifulSoup allows you to pass certain types of functions as parameters into the
find_all function. The only restriction is that these functions must take a tag object as an argument and return a boolean. Every tag object that BeautifulSoup encounters is evaluated in this function, and tags that evaluate to True are returned, while the rest are discarded.

* e.g., `python bs.find_all(lambda tag: len(tag.attrs) == 2)` or find all tags with two attributes.

In [46]:
bs.find_all(lambda tag: len(tag.attrs) == 2)

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

In [48]:
# lambda functions and regular expressions can sometimes both be used to the same effect, pick your poison
print(bs.find_all(lambda tag: tag.get_text() == 'Or maybe he\'s only resting?'))
print(bs.find_all('', text='Or maybe he\'s only resting?'))

[<span class="excitingNote">Or maybe he's only resting?</span>]
["Or maybe he's only resting?"]


## Chapter 11

In [10]:
firefox_options = Options()
firefox_options.add_argument('--headless')
firefox_driver = webdriver.Firefox(executable_path = '/usr/local/bin/geckodriver', options = firefox_options)

firefox_driver.get('https://pythonscraping.com/pages/javascript/ajaxDemo.html')
time.sleep(3)
print(firefox_driver.find_element("id", "content").text)
firefox_driver.close()

  firefox_driver = webdriver.Firefox(executable_path = '/usr/local/bin/geckodriver', options = firefox_options)


Here is some important text you want to retrieve!
A button to click!


In [12]:
firefox_options = Options()
firefox_options.add_argument('--headless')
firefox_driver = webdriver.Firefox(executable_path = '/usr/local/bin/geckodriver', options = firefox_options)
firefox_driver.get('https://pythonscraping.com/pages/javascript/ajaxDemo.html')
time.sleep(3)

# you could still use BeautifulSoup to search through the HTML if you wanted instead of Selenium
pageSource = firefox_driver.page_source
bs = BeautifulSoup(pageSource, 'lxml')
print(bs.find(id = 'content').get_text())

  firefox_driver = webdriver.Firefox(executable_path = '/usr/local/bin/geckodriver', options = firefox_options)


Here is some important text you want to retrieve! A button to click!


An implicit wait differs from an explicit wait in that it waits for a certain state in the DOM to occur before continuing. An explicit wait defines a hardcoded time as in the previous example which has a wait of three seconds.

In an implicit wait, the triggering DOM state is defined by expected_condition (note that the import is cast to EC here, a common convention used for brevity).

In [None]:
# more efficient solution would repeatedly check for the existence of a particular element on a fully loaded page and return only when that element exists.

firefox_options = Options()
firefox_options.add_argument('--headless')
firefox_driver = webdriver.Chrome(executable_path='/usr/local/bin/geckodriver', options = firefox_options)
firefox_driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')

try:
    element = WebDriverWait(firefox_driver, 10).until(EC.presence_of_element_located((By.ID, 'loadedButton')))
finally:
    print(firefox_driver.find_element_by_id('content').text)

firefox_driver.close()

The book provides some more code examples of how to deal with page redirects as well.