# Intro to Web Scraping with BeautifulSoup

In [2]:
# Import BeautifulSoup 
from bs4 import BeautifulSoup

In [4]:
# Sample HTML document used as the scraping target for every example below.
# It has two <div> sections: "section-1" holds a heading, an image and a
# paragraph; "section-2" holds a list of four linked items.
html_doc = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Intro to Web Scraping with BeautifulSoup</title>
</head>
<body>
    <div id="section-1">
        <h3 data-hello="hi">Hello</h3> 
        <img src="https://sources.unplash.com/200x200/?nature, water" alt="">
        <p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
    </div>
    <div id="section-2">
        <ul class="items">
            <a href="#"><li class="item">item 1</li></a>
            <a href="#"><li class="item">item 2</li></a>
            <a href="#"><li class="item">item 3</li></a>
            <a href="#"><li class="item">item 4</li></a>   
        </ul>
    </div>
</body>
</html>
"""

In [5]:
# Build the soup object: parse the sample markup with Python's built-in
# HTML parser, naming both arguments explicitly
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [7]:
# Calling a tag object directly returns a list of the tags nested inside it
# (per the BeautifulSoup docs, calling a tag is shorthand for find_all())
soup.body()

[<div id="section-1">
 <h3 data-hello="hi">Hello</h3>
 <img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
 <p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
 </div>,
 <h3 data-hello="hi">Hello</h3>,
 <img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>,
 <p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>,
 <div id="section-2">
 <ul class="items">
 <a href="#"><li class="item">item 1</li></a>
 <a href="#"><li class="item">item 2</li></a>
 <a href="#"><li class="item">item 3</li></a>
 <a href="#"><li class="item">item 4</li></a>
 </ul>
 </div>,
 <ul class="items">
 <a href="#"><li class="item">item 1</li></a>
 <a href="#"><li class="item">item 2</li></a>
 <a href="#"><li class="item">item 3</li></a>
 <a href="#"><li class="item">item 4</li></a>


In [8]:
# Access the <body> tag as an attribute of the soup and print its markup
print(soup.body)

<body>
<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>
<div id="section-2">
<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>
</div>
</body>


[<a href="#Intro-to-Web-Scraping-with-BeautifulSoup">Back to Top</a>]

In [9]:
# Access the <head> tag of the HTML document and print its markup
print(soup.head)

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Intro to Web Scraping with BeautifulSoup</title>
</head>


In [10]:
# Access the <title> tag of the HTML document (the whole tag, not just its text)
print(soup.title)

<title>Intro to Web Scraping with BeautifulSoup</title>


In [15]:
# find() returns only the first matching element (here, the first <div>)
e1 = soup.find('div')
print(e1)

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>


In [21]:
# find_all() returns a list of every matching element (here, both <div>s)
e2 = soup.find_all('div')
# findAll() is an alternative spelling of the same method:
# e2 = soup.findAll('div')
print(e2)

[<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>, <div id="section-2">
<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>
</div>]


In [22]:
# Index into the list returned by find_all(): the first <div> (section-1)
e2[0]

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>

In [23]:
# Index into the list returned by find_all(): the second <div> (section-2)
e2[1]

<div id="section-2">
<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>
</div>

In [25]:
# find() by id: pass the id value as a keyword argument
e3 = soup.find(id="section-1")
print(e3)

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>


In [27]:
# find() by class: `class` is a reserved word in Python, so it cannot be used
# as a keyword-argument name; writing soup.find(class = "items") is a
# SyntaxError. The invalid call is compiled from a string here so the error
# can be caught and displayed without aborting "Restart & Run All".
# The working form uses a trailing underscore: class_.
invalid_call = 'soup.find(class = "items")'
try:
    compile(invalid_call, "<class-keyword-demo>", "eval")
except SyntaxError as err:
    print("SyntaxError:", err.msg)

SyntaxError: invalid syntax (<ipython-input-27-7d5c2fdd0420>, line 2)

In [29]:
# find() by class: use class_ (with a trailing underscore) because `class`
# is a reserved word in Python
e4 = soup.find(class_= "items")
print(e4)

<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>


In [31]:
# find() by a data-* attribute: pass a Python dict mapping attribute names to
# values via attrs (a hyphenated name like data-hello cannot be written as a
# keyword argument)
e5 = soup.find(attrs={'data-hello': 'hi'})
print(e5)

<h3 data-hello="hi">Hello</h3>


[<a href="#Intro-to-Web-Scraping-with-BeautifulSoup">Back to Top</a>]

In [32]:
# select() takes a CSS selector (here an id selector) and returns a list,
# even when only one element matches
e6 = soup.select('#section-1')
print(e6)

[<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>]


In [33]:
# Index the select() result with [0] to get the element itself rather than a list
e6 = soup.select('#section-1')[0]
print(e6)

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>


In [34]:
# select() with a CSS class selector; the result is again a list
e7 = soup.select(".items")
print(e7)

[<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>]


In [35]:
# Take index 0 of the select() result to get the <ul> element itself, not a list
e7 = soup.select(".items")[0]
print(e7)

<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>


In [38]:
# get_text() returns the text inside an element, without the surrounding tags
e8 = soup.find(class_="item").get_text()
print(e8)

item 1


In [39]:
# Loop over every element with class "item" and print its text
for item in soup.select(".item"): 
    print(item.get_text())

item 1
item 2
item 3
item 4


[<a href="#Intro-to-Web-Scraping-with-BeautifulSoup">Back to Top</a>]

## Navigation

In [41]:
# .contents lists a tag's direct children, including the '\n' text nodes
# that sit between the child tags
elem = soup.body.contents
print(elem)

['\n', <div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>, '\n', <div id="section-2">
<ul class="items">
<a href="#"><li class="item">item 1</li></a>
<a href="#"><li class="item">item 2</li></a>
<a href="#"><li class="item">item 3</li></a>
<a href="#"><li class="item">item 4</li></a>
</ul>
</div>, '\n']


In [43]:
# The first child is only a line break, so printing it shows nothing visible
elem = soup.body.contents[0]
print(elem)





In [44]:
# The child at index 1 is the first real tag: the section-1 <div>
elem = soup.body.contents[1]
print(elem)

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>


In [46]:
# .contents can be chained: index 1 inside the section-1 <div> is the <h3>
# (index 0 is again a line break)
elem = soup.body.contents[1].contents[1]
print(elem)

<h3 data-hello="hi">Hello</h3>


In [48]:
# next_sibling returns the very next node after the <h3>; here that is a
# line break, so nothing visible is printed
elem = soup.body.contents[1].contents[1].next_sibling
print(elem)





In [49]:
# Calling next_sibling twice steps over the line break and reaches the next
# tag, the <img>
elem = soup.body.contents[1].contents[1].next_sibling.next_sibling
print(elem)

<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>


In [50]:
# find_next_sibling() returns the next sibling *tag* directly (the <img>),
# without having to step over the line break in between
elem = soup.body.contents[1].contents[1].find_next_sibling()
print(elem)

<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>


In [51]:
# find_previous_sibling() returns the closest preceding sibling tag
# (for section-2 that is the section-1 <div>)
elem = soup.find(id="section-2").find_previous_sibling()
print(elem)

<div id="section-1">
<h3 data-hello="hi">Hello</h3>
<img alt="" src="https://sources.unplash.com/200x200/?nature, water"/>
<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>
</div>


In [None]:
# find_parent() method: returns the tag that directly encloses the element.
# Note: class_ takes the bare class name ('item'), not a CSS selector
# ('.item'). With the leading dot, find() matches nothing and returns None,
# so the chained .find_parent() call would raise AttributeError.
elem = soup.find(class_='item').find_parent()
print(elem)

In [61]:
# find_next_sibling('p') returns the next sibling that is a <p>, skipping the
# <img> that sits between the <h3> and the paragraph
elem = soup.find('h3').find_next_sibling('p') 
print(elem)

<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nobis quae deleniti accusamus doloribus accusantium, sapiente culpa iusto sunt neque quia?</p>


[<a href="#Intro-to-Web-Scraping-with-BeautifulSoup">Back to Top</a>]

## Resources 
- https://www.crummy.com/software/BeautifulSoup/bs4/doc/
- https://youtu.be/4UcqECQe5Kc?t=187