# BeautifulSoap Cheat Sheet

## Convert between html

In [18]:
from bs4 import BeautifulSoup

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
a_tag.i.unwrap()

<i></i>

In [19]:
soup.decode(formatter=None)

'<a href="http://example.com/">I linked to example.com</a>'

## To delete html tags
- clear()
- extract()
- unwrap()

In [18]:
from bs4 import BeautifulSoup

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
a_tag

<a href="http://example.com/">I linked to <i>example.com</i></a>

In [4]:
i_tag = soup.i
i_tag

<i>example.com</i>

In [5]:
i_tag.clear()
i_tag

<i></i>

In [6]:
a_tag

<a href="http://example.com/">I linked to <i></i></a>

In [7]:
i_tag.extract()
i_tag

<i></i>

In [8]:
a_tag

<a href="http://example.com/">I linked to </a>

In [9]:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
a_tag

<a href="http://example.com/">I linked to <i>example.com</i></a>

In [10]:
i_tag = soup.i
i_tag

<i>example.com</i>

In [11]:
i_tag.unwrap()
i_tag

<i></i>

In [12]:
a_tag

<a href="http://example.com/">I linked to example.com</a>

## Check html tags
- get the name: Tag.name
- get all content about the element: Tag.contents
- check if the elements are html tag: isinstance(elem, Tag)

In [15]:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><span></span></div>')
print(soup.find('div').name)

div


In [17]:
# sample code

if isinstance(elem, Tag) and elem.name == "img":
    pass

SyntaxError: invalid syntax (<ipython-input-17-1efd615b9b70>, line 3)

In [None]:
p_tag = bs.find("p")
elem = p_tag.contents[0]

About bs.Tag, see [here](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#name)
```Python
tag.name = "blockquote"
tag
# <blockquote class="boldest">Extremely bold</blockquote>

tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b
tag['id']
# 'boldest'
```
                         

## Search on BS object
- find specific tag
- find all tag in same level (disable recursive)
- find tag with attribute, or with specific value in targeted attr
- fetch attr value

In [None]:
# get one element and the first element
# https://stackoverflow.com/questions/47818858/python-getting-the-first-child-of-the-parent-element-with-beautifulsoup4

.find('img')

In [None]:
# How to check if tag <a> or/and <img> is the children of div on Beautiful Soup
# https://stackoverflow.com/questions/48699335/how-to-check-if-tag-a-or-and-img-is-the-children-of-div-on-beautiful-soup

tag.find_all(recursive=False)

In [22]:
"""
Q: find tag with attr?

A: Sure, you can
https://stackoverflow.com/a/31417182/1911726
"""

html = '<p><img alt="Garifuna dish of hudutu" src="https://www.seriouseats.com/2020/06/20200610-garifuna-cuisine-hudutu-dish-wes-milton-guity.jpg"/></p><p><p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p><p>Many dishes comprise the world of Garifuna cuisine.</p><p><img alt="" src="https://www.seriouseats.com/2020/06/peeling-plantain-for-hudutu-edit.jpg"/></p><p><p class="caption">[Photograph: Wes Güity and Milton Güity]</p><p>The Garifuna origin story is a complex one that involves attempts to enslave.</p>'
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
p_tag = soup.find("p")
p_tag

<p><img alt="Garifuna dish of hudutu" src="https://www.seriouseats.com/2020/06/20200610-garifuna-cuisine-hudutu-dish-wes-milton-guity.jpg"/></p>

In [23]:
p_tag = soup.find_all("p", attrs={"class":"caption"})
p_tag

[<p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p>,
 <p class="caption">[Photograph: Wes Güity and Milton Güity]</p>]

In [25]:
p_tag = soup.find("p", attrs={"class":"caption"})
p_tag

<p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p>

In [26]:
p_tag.has_attr("class")

True

In [29]:
p_tag.get("class")

['caption']

In [30]:
"""
More complicated case
"""

html = '<p><img alt="Garifuna dish of hudutu" src="https://www.seriouseats.com/2020/06/20200610-garifuna-cuisine-hudutu-dish-wes-milton-guity.jpg"/></p><p><p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p><p>Many dishes comprise the world of Garifuna cuisine.</p><p><img alt="" src="https://www.seriouseats.com/2020/06/peeling-plantain-for-hudutu-edit.jpg"/></p><p><p class="caption">[Photograph: Wes Güity and Milton Güity]</p><p>The Garifuna origin story is a complex one that involves attempts to enslave.</p>'
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
p_tag = soup.find("p")
p_tag

<p><img alt="Garifuna dish of hudutu" src="https://www.seriouseats.com/2020/06/20200610-garifuna-cuisine-hudutu-dish-wes-milton-guity.jpg"/></p>

In [31]:
p_next_sibling = p_tag.find_next_sibling("p")
p_next_sibling

<p><p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p><p>Many dishes comprise the world of Garifuna cuisine.</p><p><img alt="" src="https://www.seriouseats.com/2020/06/peeling-plantain-for-hudutu-edit.jpg"/></p><p><p class="caption">[Photograph: Wes Güity and Milton Güity]</p><p>The Garifuna origin story is a complex one that involves attempts to enslave.</p></p></p>

In [34]:
p_next_next = p_next_sibling.find("p", attrs={"class":"caption"})
p_next_next

<p class="caption">Hudutu baruru with falmo, a coconut-based seafood stew. [Photograph: Wes Güity and Milton Güity]</p>

In [35]:
p_next_next.find("p", attrs={"class":"caption"})

## Fetch content of html tags

In [None]:
# Another common task is extracting all the text from a page:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""


print(soup.get_text())
# The Dormouse's story
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
# ...

## Level-manipuation
- sibling operation: find_next_sibling
    - From official guide, .next_element and .previous_element; .next_elements and .previous_elements

In [16]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')
p_first = soup.find("p")
p_first

<p class="title"><b>The Dormouse's story</b></p>

In [17]:
type(p_first)

bs4.element.Tag

In [3]:
p_next_siblings = p_first.find_next_siblings("p")
p_next_siblings

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>]

In [4]:
p_next_sibling = p_first.find_next_sibling("p")
p_next_sibling

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [5]:
p_next_sibling.name

'p'

In [6]:
p_next_sibling.text

'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

In [9]:
p_next_next_ = p_next_sibling.find_next_sibling("p")
# p_next_next

In [13]:
p_next_sibling.find_next_sibling("p") == None

True

# Regex

## About major functions
- re.search
- re.match
- re.finditer

re.search(pattern, string, flags=0)

Ref: https://docs.python.org/3/library/re.html

(Dot.) In the default mode, this matches any character except a newline. If the DOTALL flag has been specified, this matches any character including a newline.

## Character Class

```Python
import re

>>> re.search(r"[\[]", "acd")
>>> re.search(r"[\[]", "[dafs]")
<_sre.SRE_Match object; span=(0, 1), match='['>
>>> re.search(r"[abc]", "[dafs]")
<_sre.SRE_Match object; span=(2, 3), match='a'>
>>> re.search(r"[abc]", "[xxxx]")
>>> re.search(r"[abc]", "xxxx")
>>> 
>>> 
>>> re.search(r"[abc]", "xxax")
<_sre.SRE_Match object; span=(2, 3), match='a'>
>>> 
>>> re.search(r"[\a]", "xxax")
>>> 
>>> 
>>> re.search(r"[^a]", "xxax")
<_sre.SRE_Match object; span=(0, 1), match='x'>
>>> 
>>> 
>>> 
>>> re.search(r"[^a]", "xxbx")
<_sre.SRE_Match object; span=(0, 1), match='x'>
>>> re.search(r"[^abx]", "xxbx")
>>> 
>>> 
>>> re.search(r"[^adx]", "xxbx")
<_sre.SRE_Match object; span=(2, 3), match='b'>
>>> 
>>> 
>>> re.search(r"[^adx]", "xxbx")


About special characters
>>> re.search(r"[\[]", "[")
<_sre.SRE_Match object; span=(0, 1), match='['>
```

## Capture

In [9]:
import re

match_grp = re.search(r"\[(.*?)\]", "The fastest, freshest ice cream you'll ever make. [Photographs: Vicky Wasik]")
match_grp

<re.Match object; span=(50, 76), match='[Photographs: Vicky Wasik]'>

In [11]:
match_grp.groups()[0]

'Photographs: Vicky Wasik'

## Non-greedy search, or find the first matched one
- adding ? after (+,*)

In [1]:
import re

re.search('<p><img.*?/></p>')

TypeError: search() missing 1 required positional argument: 'string'

## escape character cases

## Comparison between functions
- match()
- group()
- search()
- finditer