# Advanced Parsing

* Searching for tags by attributes

In [24]:
import re
import requests 
from bs4 import BeautifulSoup

In [25]:
# Download the sample "War and Peace" page and parse it into a BeautifulSoup tree.
response = requests.get('http://www.pythonscraping.com/pages/warandpeace.html', timeout=30)
response.raise_for_status()  # stop on a 4xx/5xx reply instead of parsing an error page
content = response.text
content_parser = BeautifulSoup(content, 'html.parser')

# Keep a local copy of the raw HTML for offline inspection.
# The explicit encoding avoids a UnicodeEncodeError on platforms whose default
# locale encoding cannot represent every character of the page.
with open('c2_content_parser.txt', mode='w', encoding='utf-8') as f:
    f.write(content)

In [26]:
# find_all() collects every tag that matches a tag name and, optionally,
# the attribute values associated with that tag.

# Every <span> tag whose class is "green".
# The result is a bs4 ResultSet object, which is iterable.
name_list = content_parser.find_all('span', class_='green')
print(type(name_list))

<class 'bs4.element.ResultSet'>


In [27]:
# get_text() extracts the text content from inside a tag.
for green_span in name_list:
    span_text = green_span.get_text()
    print(span_text)

# Notes:
# - Calling get_text() should be the last thing that we do.
# - We want to preserve the tag structure of a document for as long as possible.

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [28]:
# find_all() also accepts a list of tag names and returns the tags matching any of them.
heading_tags = ['h1', 'h2', 'h3', 'h4']
x = content_parser.find_all(heading_tags)
for heading in x:
    print(heading)

<h1>War and Peace</h1>
<h2>Chapter 1</h2>


In [29]:
# Giving several class names matches <span> tags that carry any one of them.
y = content_parser.find_all('span', {'class': ['green', 'red']})

for span in y:
    # Text followed by one blank line, so consecutive entries stay visually separate.
    print(span.get_text(), end='\n\n')

Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.

Anna
Pavlovna Scherer

Empress Marya
Fedorovna

Prince Vasili Kuragin

Anna Pavlovna

St. Petersburg

If you have nothing better to do, Count [or Prince], and if the
prospect of spending an evening with a poor invalid is not too
terrible, I shall be very charmed to see you tonight between 7 and 10-
Annette Scherer.

Heavens! what a virulent attack!

the prince

Anna Pavlovna

First of all, dear friend, tell me how you are. Set your friend's
mind at rest,

Can one be well while suffering morally? Can one be calm in times
like t

In [30]:
# Match on the text content of a node instead of on a tag name or attribute.
# `string=` is the current name of this argument; the older `text=` spelling
# is deprecated in Beautiful Soup.
prince_content = content_parser.find_all(string='the prince')
prince_content

['the prince',
 'the prince',
 'the prince',
 'the prince',
 'the prince',
 'the prince',
 'the prince']

In [31]:
# find_all() returns every tag (not just the first) that satisfies all of
# the attribute filters at once:
#   id    : 'title'
#   class : 'text'
title = content_parser.find_all(attrs={'id': 'title', 'class': 'text'})
print(title)

[]


In [32]:
# find() gives back a single result; None is printed when no tag has id="title".
title = content_parser.find(attrs={'id': 'title'})
print(title)

None


In [33]:
# Both filters must hold for a tag to be returned: class "green" and id "title".
title = content_parser.find_all(class_='green', id='title')
print(title)

[]


In [34]:
# With no tag name given, any tag whose class is "green" or "red" matches.
title = content_parser.find_all(class_=['green', 'red'])

for tag in title:
    # Text followed by one blank line.
    print(tag.get_text(), end='\n\n')

Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.

Anna
Pavlovna Scherer

Empress Marya
Fedorovna

Prince Vasili Kuragin

Anna Pavlovna

St. Petersburg

If you have nothing better to do, Count [or Prince], and if the
prospect of spending an evening with a poor invalid is not too
terrible, I shall be very charmed to see you tonight between 7 and 10-
Annette Scherer.

Heavens! what a virulent attack!

the prince

Anna Pavlovna

First of all, dear friend, tell me how you are. Set your friend's
mind at rest,

Can one be well while suffering morally? Can one be calm in times
like t

# Navigating Trees

* In BS, child tags are always exactly one level below the parent tag, whereas descendant tags can be at any level of the tree below the parent tag
* All child tags are descendants, but not all descendants are children



In [35]:
# Download the gift-catalogue page and parse it into a BeautifulSoup tree.
gift_website = 'https://www.pythonscraping.com/pages/page3.html'
response = requests.get(gift_website, timeout=30)
response.raise_for_status()  # stop on a 4xx/5xx reply instead of parsing an error page
gift_website_content = response.text

content = BeautifulSoup(gift_website_content, 'html.parser')

# Keep a local copy of the raw HTML for offline inspection.
# The explicit encoding avoids a UnicodeEncodeError on platforms whose default
# locale encoding cannot represent every character of the page.
with open('c2_content_gift_website.txt', mode='w', encoding='utf-8') as f:
    f.write(gift_website_content)

In [36]:
# Use .children when only the direct children are wanted, not every descendant.
gift_table = content.find('table', {'id': 'giftList'})
chd = gift_table.children  # an iterator over the table's immediate child nodes
for child in chd:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [37]:
# .descendants visits every node nested at any depth below the table.
gift_table = content.find('table', {'id': 'giftList'})
for node in gift_table.descendants:
    # Node followed by one blank line.
    print(node, end='\n\n')




<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

<th>
Item Title
</th>


Item Title


<th>
Description
</th>


Description


<th>
Cost
</th>


Cost


<th>
Image
</th>


Image





<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>

<td>
Vegetable Basket
</td>


Vegetable Basket


<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>


This vegetable basket is the perfect gift for your health conscious (or overweight) friends!


<span class="excitingNote">Now with super-colorful bell peppers!</span>

Now with super-colorful bell peppers!




<td>
$15.00
</td>


$15.00


<td>
<img src=".

In [38]:
# An object cannot be a sibling of itself, so the starting row is not in the output.
# next_siblings yields only the sibling nodes that come after the starting row.

header_row = content.find('table', {'id': 'giftList'}).tr
for sibling in header_row.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [39]:
# .tr selects the first <tr> inside the table; the output shows it is the header row.
content.find('table', {'id': 'giftList'}).tr

<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

In [40]:
# Parent tag: go from an image up to its enclosing cell, then sideways to
# the cell just before it.

image = content.find('img', {'src': '../img/gifts/img1.jpg'})
image_cell = image.parent
previous_cell = image_cell.previous_sibling

print(image_cell)
print()
print(previous_cell)
print()
print(previous_cell.get_text())

<td>
<img src="../img/gifts/img1.jpg"/>
</td>

<td>
$15.00
</td>


$15.00



In [41]:
# The parent of a tag is itself a bs4 Tag object (see the output below).
type(content.find('img', {'src': '../img/gifts/img1.jpg'}).parent)

bs4.element.Tag

In [42]:
# Look up the <img> tag whose src attribute equals the given path.
content.find('img', {'src': '../img/gifts/img1.jpg'})

<img src="../img/gifts/img1.jpg"/>

In [43]:
# .descendants is a generator: displaying it shows the generator object, not the nodes.
content.find('table', {'id': 'giftList'}).descendants

<generator object Tag.descendants at 0x0000021E655AC350>

# Using Regular Expression

In [44]:
# Fetch the gift-catalogue page again for the regular-expression examples.
gift_website = 'https://www.pythonscraping.com/pages/page3.html'
response = requests.get(gift_website, timeout=30)
response.raise_for_status()  # stop on a 4xx/5xx reply instead of parsing an error page
gift_website_content = response.text

content = BeautifulSoup(gift_website_content, 'html.parser')

In [46]:
# Every <img> tag on the page; the output shows this includes the site logo.
images = content.find_all('img')
images

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [163]:
# Keep only the product images (img<number>.jpg) and leave out the logo.
# Beautiful Soup checks a compiled pattern against each src value with search(),
# so `\d+` accepts image numbers of any length and `$` rejects trailing text.
product_image_src = re.compile(r'\.\./img/gifts/img\d+\.jpg$')
images = content.find_all('img', {'src': product_image_src})
images

[<img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [61]:
# A tag's attributes are read with dictionary-style indexing.
for image in images:
    source = image['src']
    print(source)

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


# Accessing Attributes

In [73]:
# .attrs holds a tag's attributes; the first <tr> found has none, hence the empty dict.
content.find('tr').attrs

{}

In [70]:
# All <span> tags on the page; in the output each one has class "excitingNote".
content.find_all('span')

[<span class="excitingNote">Now with super-colorful bell peppers!</span>,
 <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>,
 <span class="excitingNote">Also hand-painted by trained monkeys!</span>,
 <span class="excitingNote">Or maybe he's only resting?</span>,
 <span class="excitingNote">Keep your friends guessing!</span>]

# Lambda Expression

In [101]:
def has_two_attributes(tag):
    """Return True when the tag carries exactly two attributes."""
    return len(tag.attrs) == 2


# find_all() accepts a function as its filter; the function receives a tag
# and returns a Boolean that decides whether the tag is kept.
x = content.find_all(has_two_attributes)

for match in x:
    # Tag followed by one blank line.
    print(match, end='\n\n')

<img src="../img/gifts/logo.jpg" style="float:left;"/>

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>

<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>

<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>

<tr class="gift" id="g

# Playing around

In [307]:
# Flatten the whole gift table into a single plain-text string.
gift_table = content.find('table', {'id': 'giftList'})
x = gift_table.get_text()
x

'\n\nItem Title\n\nDescription\n\nCost\n\nImage\n\n\nVegetable Basket\n\nThis vegetable basket is the perfect gift for your health conscious (or overweight) friends!\nNow with super-colorful bell peppers!\n\n$15.00\n\n\n\n\nRussian Nesting Dolls\n\nHand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!\n\n$10,000.52\n\n\n\n\nFish Painting\n\nIf something seems fishy about this painting, it\'s because it\'s a fish! Also hand-painted by trained monkeys!\n\n$10,005.00\n\n\n\n\nDead Parrot\n\nThis is an ex-parrot! Or maybe he\'s only resting?\n\n$0.50\n\n\n\n\nMystery Box\n\nIf you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n\n$1.50\n\n\n\n'

In [319]:
# A price is "$", one or more digits, any number of ",ddd" thousands groups,
# and an optional decimal part. The groups are non-capturing so that findall()
# returns the whole matched price rather than a fragment of it.
# The pattern also covers whole-dollar amounts such as "$5" and amounts with
# several thousands groups such as "$1,000,000.00".
prices_pattern = re.compile(r'\$\d+(?:,\d{3})*(?:\.\d+)?')
prices_pattern.findall(x)

['$15.00', '$10,000.52', '$10,005.00', '$0.50', '$1.50']