简单使用

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# html进行美化
print(soup.prettify())
print(soup.title)  # 获取标签title
# <title>The Dormouse's story</title>

print(soup.title.name)   # 获取标签名称
# 'title'

print(soup.title.string)   # 获取标签title内的内容
# 'The Dormouse's story'

print(soup.title.parent)  # 获取父级标签

print(soup.title.parent.name)  # 获取父级标签名称
# 'head'

print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

print(soup.p['class'])  # 获取p的class属性值
# 'title'

print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

print(soup.find(id="link3"))  # 获取id为link3的标签
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
for link in soup.find_all('a'):
    print(link.get('href'))
    
print(soup.get_text())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

<title>The Dormouse's story</title>
title
The Dormouse's story
<head><title>The Dormouse's story</title></head>
head
<p class="title"><b>The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a 

遍历文档树

In [4]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
    <body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [5]:
from bs4 import BeautifulSoup
# lxml和html.parser解析的有时候会根据html是否完整而有解析不同的问题，需要注意
soup = BeautifulSoup(html_doc, 'html.parser')
head_tag = soup.head
print(head_tag)
print(head_tag.contents)
title_tag = head_tag.contents[0]
print(title_tag)
print(title_tag.contents)
text = title_tag.contents[0]
# print(text.contents)
print(soup.head.string)
print(soup.title.string)
print(soup.html.string)
print(soup.html.text)

for string in soup.strings:
    print(repr(string))
    
for string in soup.stripped_strings:
    print(repr(string))
    
title_tag = soup.title
print(title_tag)
print(title_tag.parent)
html_tag = soup.html

# print(html_tag)
print(type(html_tag.parent))

<head><title>The Dormouse's story</title></head>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
["The Dormouse's story"]
The Dormouse's story
The Dormouse's story
None
The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'
"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'
<title>The Dormouse's story</title>
<head><title>The Dormouse's story</title></head>
<class 'bs4.BeautifulSoup'>


搜索文档树

In [9]:
print(soup.find_all('title'))
print(soup.find_all('p', 'title'))
print(soup.find_all('a'))
print(soup.find_all(id='link2'))
import re
print(soup.find_all(string=re.compile('sisters')))
for tag in soup.find_all(re.compile('^b')):
    print(tag.name)
soup.find_all(['a','b'])
print(soup.find_all(id='link2'))
print(soup.find_all(href=re.compile('elsie')))
print(soup.find_all(string=re.compile('^The')))
print(soup.find_all(class_=re.compile('st')))
print(soup.find_all(id=True))
print(soup.find_all(href=re.compile('elsie'), id='link1'))
print(soup.find_all("a", class_="sister"))
print(soup.find_all(attrs={'data-foo': 'value'}))
print(soup.find_all('b', class_='story', id='x'))
print(soup.find_all(attrs={'class':'story', 'id': 'x'}))
print(soup.find_all(string='Elsie'))
print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(string=re.compile('Dormouse')))
print(soup.find_all('a', limit=2))
print(soup.find_all('a')[0:2])
print(soup.find_all('title', limit=1))
print(soup.find('title'))
print(soup.find('nosuchtag'))
print(soup.head.title)
print(soup.find('head').find('title'))

[<title>The Dormouse's story</title>]
[<p class="title"><b>The Dormouse's story</b></p>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
['Once upon a time there were three little sisters; and their names were\n']
body
b
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
["The Dormouse's story", "The Dormouse's story"]
[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom o

css选择器

In [21]:
print(soup.select('title'))
print(soup.select('b'))
print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select("p #link2"))
print(soup.select('p > #link2'))
print(soup.select('.story#test'))
print(soup.select('.story.test'))
print(soup.select(".story.test#book"))

[]


In [22]:
print(soup.select("a[href='http://example.com/tillie']"))

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [23]:
for title in soup.select('a'):
    print(title.get_text())

Elsie
Lacie
Tillie
