Link: https://www.youtube.com/watch?v=GjKQ6V_ViQE&ab_channel=KeithGalli

## Load in the necessary libraries

In [1]:
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4
import re


## Load our first page

In [2]:
# Load the webpage content; the timeout keeps the cell from hanging forever
# if the server never answers
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)

# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly keeps the
# result independent of which optional parsers happen to be installed and
# avoids BeautifulSoup's "no parser was explicitly specified" warning.
soup = bs(r.content, "html.parser")

# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using BeautifulSoup to Scrape

### find and find_all

In [3]:
# find() returns only the first element that matches the given tag name
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [4]:
# find_all() returns a list containing every element that matches the tag name
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [5]:
# Pass in a list of tag names to look for
# find() returns only the first element on the page that matches any name in the list
first_header = soup.find(["h1", "h2"])
print(first_header)

<h1>HTML Webpage</h1>


In [6]:
# Pass in a list of tag names to look for
# find_all() returns every element on the page that matches any name in the list
headers = soup.find_all(["h1", "h2"])
print(headers)

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [7]:
# With only a tag name and no attribute filter, find_all() returns every <p> element
paragraph = soup.find_all("p")
print(paragraph)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [8]:
# You can pass in attributes to the find/find_all function
# Here only <p> elements whose id is "paragraph-id" are returned
paragraph = soup.find_all("p", attrs={"id" : 'paragraph-id'})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [9]:
# You can nest find and find_all calls: each call below is made on the
# element returned by the previous call rather than on the whole soup
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [10]:
# Print the page again for reference before searching for specific strings
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [11]:
# Search for specific strings in our find/find_all calls
# A plain string has to equal the element's text exactly, so "Some" on its
# own matches nothing and this prints an empty list
paragraphs = soup.find_all("p", string="Some")
print(paragraphs)

# The way around this is a regular expression: a compiled pattern matches
# any <p> whose text contains "Some"
paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs)

[]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [12]:
# Case-sensitive search: only matches the lowercase spelling "header"
headers = soup.find_all("h2", string=re.compile("header"))
print(headers)

# To look for any capitalization of "header", make the pattern case-insensitive.
# re.IGNORECASE also covers spellings such as "HEADER", which a pattern like
# "(H|h)eader" would miss because it only allows for a capitalized first letter.
headers = soup.find_all("h2", string=re.compile("header", re.IGNORECASE))
print(headers)

[<h2>Another header</h2>]
[<h2>A Header</h2>, <h2>Another header</h2>]


### select (CSS selector)