# Loading the necessary libraries

In [6]:
import requests
from bs4 import BeautifulSoup as bs

## Loading the first page

In [18]:
# Loading webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a beautiful soup object
soup = bs(r.content)

# Printing html
# print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Beautiful Soup to Scrape

#### find and find_all

In [15]:
first_header = soup.find("h2")
print("The first header is " + str(first_header))
headers = soup.find_all("h2")
print("The headers are " + str(headers))

The first header is <h2>A Header</h2>
The headers are [<h2>A Header</h2>, <h2>Another header</h2>]


In [35]:
# Passing a list of elements to look for - first occurence
first_header = soup.find(["h1", "h2"])
print("I found " + str(first_header) + " first.")
first_header = soup.find(["h2", "h1"])
print("I found " + str(first_header) + " first.\n")

# All occurences
headers = soup.find_all(["h1", "h2"])
print("The headers are " + str(headers) + ".")

I found <h1>HTML Webpage</h1> first.
I found <h1>HTML Webpage</h1> first.

The headers are [<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>].


In [44]:
# Pass in attributes to the find and find all function
paragraph = soup.find_all("p")
print("This is an example of an element without a paragraph " + str(paragraph))

paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
print("\nI got the specific attribute which is " + str(paragraph))

This is an example of an element without a paragraph [<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]

I got the specific attribute which is [<p id="paragraph-id"><b>Some bold text</b></p>]


In [49]:
# Nesting find and find_all calls
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
print(header.prettify())

<h1>
 HTML Webpage
</h1>



In [69]:
# Searching fr specific strings in find or find_all calls
# Using the regex library
import re

paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs)

headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


### select(CSS selector)
Beautiful Soup can mimic a CSS selector

In [70]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [71]:
# getting paragraph inside of divs
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [72]:
# getting all paragraphs that is directly after h2
paragraphs = soup.select("h2 ~ p")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [73]:
# getting element in a certain format
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [76]:
# Running nested calls
paragraphs = soup.select("body > p") # directed descendant of the body
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


### Getting different properties if the HTML

In [79]:
header = soup.find("h2")
print(header) # text with tags
print(header.string) # text only

<h2>A Header</h2>
A Header


In [81]:
div = soup.find("div")
print(div.prettify())
print(div.string)

# This will return a 'none' result because the script doesn't know which child element
# in the parent element to get the string from.

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None


In [83]:
# In order to extract all the child element texts from the tags
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



### Getting a specific property from an element

In [85]:
# For links
link = soup.find("a")
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [86]:
# for elements
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]["id"]

'paragraph-id'

## Code Navigation

In [88]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [87]:
# Path Syntax
soup.body.div.h1

<h1>HTML Webpage</h1>

#### Knowing the terms: Parent, Sibling, and Child

In [90]:
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]