In [1]:
from bs4 import BeautifulSoup

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
# Parse the sample document with the "html.parser" backend, then print the
# tree re-indented so that each tag and each string sits on its own line.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


Вот несколько простых способов навигации по этой структуре данных:

In [4]:
soup.title

<title>The Dormouse's story</title>

In [8]:
soup.title.name

'title'

In [9]:
soup.title.string

"The Dormouse's story"

In [10]:
soup.title.parent.name

'head'

In [13]:
soup.head.parent.name

'html'

Одна из распространенных задач — извлечь все URL-адреса, найденные на странице в тегах <a>:

In [10]:
# find_all('a') returns every <a> tag in the document; .get('href') reads
# each tag's link target.
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


Другая распространенная задача — извлечь весь текст со страницы:

In [11]:
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [16]:
# Parse a one-tag snippet and grab the resulting Tag object.
# The parser is named explicitly: without it bs4 picks whichever parser is
# installed (emitting GuessedAtParserWarning), and lxml/html5lib would wrap
# the fragment in <html><body>, so results would differ between machines.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)

bs4.element.Tag

In [17]:
tag.name

'b'

In [18]:
tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [21]:
tag['class']

['boldest']

In [23]:
# Attributes are added with dict-style item assignment on the tag.
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<blockquote another-attribute="1" class="boldest" id="verybold">Extremely bold</blockquote>

In [24]:
tag.attrs

{'class': ['boldest'], 'id': 'verybold', 'another-attribute': 1}

In [25]:
# `del` removes the attributes again, using the same dict-style keys.
del tag['id']
del tag['another-attribute']
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [26]:
tag.attrs

{'class': ['boldest']}

In [27]:
tag.string

'Extremely bold'

In [28]:
type(tag.string)

bs4.element.NavigableString

In [29]:
# replace_with() swaps the tag's current string for new text inside the tree.
tag.string.replace_with("No longer bold")
tag

<blockquote class="boldest">No longer bold</blockquote>

In [38]:
# A tag whose only child is an HTML comment: .string then returns a Comment.
# The parser is named explicitly so bs4 does not guess one (and warn), which
# keeps this cell consistent with the other BeautifulSoup(...) calls here.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
type(comment)

bs4.element.Comment

In [40]:
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [41]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>"""

In [54]:
# Re-parse the full sample document: `soup` was rebound to one-tag snippets
# in earlier cells, so it has to be rebuilt before navigating the tree.
soup = BeautifulSoup(html_doc, 'html.parser')
soup
# print(soup.prettify())  # alternative: indented view of the same tree

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body></html>

In [45]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [46]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [53]:
for link in soup.find_all('a'):
    # .get('href') is used instead of link['href']: indexing raises KeyError
    # for a tag without that attribute, whereas .get() returns None.
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [56]:
page_body = soup.body
page_body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>

In [68]:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [57]:
page_body.contents

['\n',
 <p class="title"><b>The Dormouse's story</b></p>,
 '\n',
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 '\n',
 <p class="story">...</p>]

In [62]:
page_body.contents[5]

<p class="story">...</p>

In [66]:
len(soup.contents)

1

In [65]:
len(page_body.contents)

6

In [64]:
soup.contents[0].name

'html'

In [69]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [71]:
# .children iterates over direct children only; <title> has a single one,
# its text string.
for child in title_tag.children:
    print(child)

The Dormouse's story


In [72]:
# .descendants walks the whole subtree: the <title> tag first, then the
# string nested inside it.
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [73]:
for child in page_body.descendants:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.


<p class="story">...</p>
...


In [74]:
len(list(soup.children))

1

In [75]:
list(soup.children)

[<html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p></body></html>]

In [76]:
len(list(soup.descendants))

25

In [77]:
title_tag.string

"The Dormouse's story"

In [78]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [79]:
# <head> has exactly one child (<title>), so .string resolves to that child's text.
head_tag.string

"The Dormouse's story"

In [83]:
# <html> contains more than one child, so .string has no single target and is None.
print(soup.html.string)

None


In [84]:
# .strings yields every text node, whitespace-only ones included; repr()
# makes the newline characters visible in the output.
for string in soup.strings:
    print(repr(string))

"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'


In [85]:
# .stripped_strings skips whitespace-only strings and trims leading/trailing
# whitespace from the rest (inner newlines are kept).
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [89]:
def multiply(x):
    """Return ``x`` doubled."""
    doubled = x * 2
    return doubled


# map() applies multiply to each element lazily; unpacking into a list
# materialises the results.
elements = [1, 2, 3, 4]
elements_by2 = [*map(multiply, elements)]
elements_by2

[2, 4, 6, 8]

In [91]:
# Same doubling as with a named function, but the function is written inline
# as a lambda.
elements = [1, 2, 3, 4]
elements_by2 = [*map(lambda value: value * 2, elements)]
elements_by2

[2, 4, 6, 8]

In [97]:
# Title-case every city name by mapping a lambda over the list.
cities = ['madrid', 'munich', 'valencia']
cities_cap = [*map(lambda city: city.title(), cities)]
cities_cap

['Madrid', 'Munich', 'Valencia']

In [100]:
# map() is lazy: without list() it returns a map iterator, not a list.
# (A stray `my` token that raised NameError on a fresh run was removed.)
elements = [1, 2, 3, 4]
elements_by2 = map(lambda x: x * 2, elements)
print(type(elements_by2))

<class 'map'>


In [103]:
# A lambda may take several parameters; bound to a name, it is called like
# any other function.
x = lambda a, b: a * b
x(5, 6)

30

In [104]:
def y(a, b):
    """Return the product of ``a`` and ``b`` (the ``def`` form of a two-argument lambda)."""
    product = a * b
    return product


y(5, 6)

30

In [107]:
def myfunc(n):
    """Build and return a one-argument function that multiplies its input by ``n``."""
    # The lambda closes over n, so each call to myfunc yields its own multiplier.
    return lambda a: a * n


# Specialise the factory to a doubler, then apply it.
mydoubler = myfunc(2)
mydoubler(11)

22