In [1]:
# import BeautifulSoup 
from bs4 import BeautifulSoup

In [2]:
# Example HTML document, written as three fragments and glued into one string
html_fragments = (
    '<html><heading style="font-size:20px"><i>This is the title<br><br></i></heading>',
    '<body><b>This is the body</b><p id="para1">This is para1<a href="www.google.com">Google</a></p>',
    '<p id="para2">This is para 2</p></body></html>',
)
html = ''.join(html_fragments)

In [4]:
# Instantiate a soup object: BeautifulSoup identifies the structure of the
# HTML and builds a parse tree that can be navigated to extract pieces.
# The parser is named explicitly ("lxml") instead of the generic "html"
# feature, which selects whichever HTML parser happens to be installed and
# can therefore produce a different tree on another machine. The tree printed
# below (with <heading> placed inside <body>) matches lxml's handling.
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())

<html>
 <body>
  <heading style="font-size:20px">
   <i>
    This is the title
    <br/>
    <br/>
   </i>
  </heading>
  <b>
   This is the body
  </b>
  <p id="para1">
   This is para1
   <a href="www.google.com">
    Google
   </a>
  </p>
  <p id="para2">
   This is para 2
  </p>
 </body>
</html>


In [5]:
# The parse tree mirrors the HTML hierarchy (e.g. parent, sibling, ...).
# Navigate it using tag names or attributes; here: the name of the <html> tag.
soup.html.name

'html'

In [6]:
# Name of the <body> tag
soup.body.name

'body'

In [7]:
# All text nested under <body>, run together with no separator
soup.body.text

u'This is the titleThis is the bodyThis is para1GoogleThis is para 2'

In [8]:
# Direct children of <html>: a one-element list holding <body>
soup.html.contents

[<body><heading style="font-size:20px"><i>This is the title<br/><br/></i></heading><b>This is the body</b><p id="para1">This is para1<a href="www.google.com">Google</a></p><p id="para2">This is para 2</p></body>]

In [9]:
# Direct children of <body>, as a list of tags
soup.body.contents

[<heading style="font-size:20px"><i>This is the title<br/><br/></i></heading>,
 <b>This is the body</b>,
 <p id="para1">This is para1<a href="www.google.com">Google</a></p>,
 <p id="para2">This is para 2</p>]

In [10]:
# Step up the tree: the parent of <body> is <html>
soup.body.parent.name

'html'

In [11]:
# Node that follows the first <b> at the same level of the tree.
# `next_sibling` is the bs4 spelling; `nextSibling` is the deprecated BS3 alias.
soup.b.next_sibling

<p id="para1">This is para1<a href="www.google.com">Google</a></p>

In [12]:
# Node that precedes the first <p> at the same level of the tree.
# `previous_sibling` is the bs4 spelling; `previousSibling` is the deprecated BS3 alias.
soup.p.previous_sibling

<b>This is the body</b>

In [13]:
# find_all() and find() search the tree for specific tags, or for tags with certain attributes.
# (`findAll` is the deprecated BeautifulSoup 3 spelling of find_all.)
bold = soup.find_all('b')  # every <b> tag, collected into a list-like result
print(bold)

[<b>This is the body</b>]


In [14]:
# Text of the first (and here only) <b> match
print(bold[0].text)

This is the body


In [15]:
# Collect the text of every <p> tag and join the pieces into one
# space-separated string (find_all replaces the deprecated findAll alias)
paragraph_texts = [p.text for p in soup.find_all('p')]
paras = ' '.join(paragraph_texts)
print(paras)

This is para1Google This is para 2


In [16]:
# find_all by attribute, e.g. id: take the first match and read its text
soup.find_all(id="para2")[0].text

u'This is para 2'

In [18]:
# e.g. find any text with font size 20: match tags whose style attribute is
# exactly "font-size:20px" (any tag type, not only <p>) and join their text
font20 = ' '.join([tag.text for tag in soup.find_all(style="font-size:20px")])
print(font20)

This is the title


In [19]:
# Tag names can be given as a list (or, in the next cell, a dictionary):
# every tag whose name appears in the list is returned
soup.find_all(['b', 'p'])

[<b>This is the body</b>,
 <p id="para1">This is para1<a href="www.google.com">Google</a></p>,
 <p id="para2">This is para 2</p>]

In [20]:
# Same search with the tag names supplied as dictionary keys.
# NOTE(review): the dict form looks like a BeautifulSoup 3 idiom; a list of
# names is the documented bs4 way to match several tags - confirm this form
# is still wanted.
soup.find_all({'b': True, 'p': True})

[<b>This is the body</b>,
 <p id="para1">This is para1<a href="www.google.com">Google</a></p>,
 <p id="para2">This is para 2</p>]

In [21]:
# Find a link to scrape: find() gives back a single <a> tag (see the output
# below), not a list
links = soup.find('a')
print(links)

<a href="www.google.com">Google</a>


In [22]:
# Cap the number of links returned (at most 10 here);
# find_all replaces the deprecated findAll alias
soup.find_all('a', limit=10)

[<a href="www.google.com">Google</a>]

In [23]:
# Pull the target URL (href attribute) and the visible text out of the link
link_url = links['href']
link_label = links.text
print(link_url + " is the url and " + link_label + " is the text")

www.google.com is the url and Google is the text


In [24]:
# Search from a chosen starting point instead of the document root:
# locate the text node "Google", then look forward from there for the next <p>.
# `string=` and `find_next` are the current bs4 names for the deprecated
# `text=` argument and `findNext` method.
soup.find(string="Google").find_next('p').text

u'This is para 2'

In [27]:
# Inspect what kind of object a string search returns
# (`string=` is the current name of the deprecated `text=` argument)
type(soup.find(string="Google"))

bs4.element.NavigableString