In [4]:
from bs4 import BeautifulSoup

##### Let's do some simple examples

In [5]:
# Example 1
# A minimal HTML snippet. Note that the second <p> is an opening tag, not a
# closing </p>; the parser accepts it anyway and emits an extra empty paragraph.
htmltxt = '<p>Hello World<p>'
parsed_txt = BeautifulSoup(markup=htmltxt, features='lxml')

##### There are 3 parsers to choose from; let's look at the advantages and disadvantages of each one.
1. html.parser - BeautifulSoup(markup, "html.parser")
   - Advantages: Batteries included, Decent speed, Lenient (as of Python 2.7.3 and 3.2.2)
   - Disadvantages: Not very lenient (before Python 2.7.3 or 3.2.2)
2. lxml - BeautifulSoup(markup, "lxml")
   - Advantages: Very fast, Lenient
   - Disadvantages: External C dependency
3. html5lib - BeautifulSoup(markup, "html5lib")
   - Advantages: Extremely lenient, Parses pages the same way a web browser does, Creates valid HTML5
   - Disadvantages: Very slow, External Python dependency

In [8]:
# Confirm what kind of object the BeautifulSoup constructor handed back
parsed_type = type(parsed_txt)
print('The type of parsed_txt is: ', parsed_type)

The type of parsed_txt is:  <class 'bs4.BeautifulSoup'>


In [11]:
# Show the whole parsed document first, then only its tag-stripped text
print(parsed_txt, parsed_txt.text, sep='\n')

<html><body><p>Hello World</p><p></p></body></html>
Hello World


In [14]:
# Example 2
# A headline followed by a paragraph that contains a hyperlink
mytxt = """
<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>"""

soup = BeautifulSoup(markup=mytxt, features='lxml')
# get_text() returns the document's text with every tag stripped out
print(soup.get_text())

Hello World
This is a link


### Finding a tag with find()

In [15]:
# Generally, we don't want to just spit out all of the tag-stripped text of an HTML document.
# Usually, we want to extract text from just a few specific elements.
# Example 3: the markup from Example 2 again (here with a trailing newline).
mytxt = """
<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>
"""

In [16]:
# mytxt contains 3 tags:
# 1. A headline, <h1>
# 2. A paragraph, <p>
# 3. Within the paragraph, a hyperlink, <a>

In [17]:
# Parse the Example 3 markup so individual tags can be looked up
soup = BeautifulSoup(markup=mytxt, features='lxml')

In [22]:
# Look up the first occurrence of each tag once and reuse the results
found_tags = [soup.find(tag_name) for tag_name in ('h1', 'p', 'a')]

# First the tags themselves, markup included ...
for tag in found_tags:
    print(tag)

# ... then only the text each tag contains
for tag in found_tags:
    print(tag.text)

# find() returns a Tag object; show the type of the <h1> result
print(type(found_tags[0]))

<h1>Hello World</h1>
<p>This is a <a href="http://example.com">link</a></p>
<a href="http://example.com">link</a>
Hello World
This is a link
link
<class 'bs4.element.Tag'>


### Extracting attributes from a tag with attrs

In [29]:
# In the previous example, how can we extract the link target?
# Every tag exposes its HTML attributes through its `attrs` attribute.
link = soup.find('a')
link_attrs = link.attrs
print(type(link))
# Inspect the type of the attribute container, then its contents
print(type(link_attrs))
print(link_attrs)
# Pull out just the URL stored under the 'href' key
print(link_attrs['href'])

<class 'bs4.element.Tag'>
<class 'dict'>
{'href': 'http://example.com'}
http://example.com


### Finding multiple elements with find_all

In [31]:
# Example 4
# Two paragraphs, each one wrapping its own hyperlink
moretxt = """
<p>Visit the <a href='http://www.nytimes.com'>New York Times</a></p>
<p>Visit the <a href='http://www.wsj.com'>Wall Street Journal</a></p>
"""

soup = BeautifulSoup(markup=moretxt, features='lxml')
# Display the parsed document as the cell output
soup

<html><body><p>Visit the <a href="http://www.nytimes.com">New York Times</a></p>
<p>Visit the <a href="http://www.wsj.com">Wall Street Journal</a></p>
</body></html>

In [45]:
# find_all() collects every matching tag instead of only the first one
tags = soup.find_all('p')
print(type(tags))
# A ResultSet acts very much like a list: it can be printed, measured and indexed
print(tags)
print(len(tags))
first_tag = tags[0]
print(first_tag.text)
# The attributes shown here belong to the <p> itself, not to the <a> nested inside it
print(first_tag.attrs)

<class 'bs4.element.ResultSet'>
[<p>Visit the <a href="http://www.nytimes.com">New York Times</a></p>, <p>Visit the <a href="http://www.wsj.com">Wall Street Journal</a></p>]
2
Visit the New York Times
{}


### Finding nested elements

In [52]:
# Example 5
# Links appear inside both <h1> and <div> elements; only the <h1> ones are wanted
evenmoretxt = """
<h1><a href="http://www.a.com">Awesome</a></h1>
<h1><a href="http://www.b.com">Really Awesome</a></h1>

<div><a href="http://na.com">Ignore me</a></div>
<div><a href="http://127.0.0.1">Ignore me again</a></div>
"""

soup = BeautifulSoup(markup=evenmoretxt, features='lxml')

In [53]:
# Collect every <h1> element; the <div> elements are left out
heads = soup.find_all(name='h1')
heads

[<h1><a href="http://www.a.com">Awesome</a></h1>,
 <h1><a href="http://www.b.com">Really Awesome</a></h1>]

In [57]:
# For every headline, grab the first <a> nested inside it
links = [headline.find('a') for headline in heads]
links

[<a href="http://www.a.com">Awesome</a>,
 <a href="http://www.b.com">Really Awesome</a>]