# Web Scraping 101

Uses BeautifulSoup (`bs4`) with the `html5lib` parser to parse HTML, and `requests` to download a live page.

### Sample HTML 

%%html
<!DOCTYPE html>
<html>
<head>
<title>Luxury Car Prices</title>
</head>
<body>
<h3><b id='boldest'>Mercedes S-Class</b></h3>
<p> Price: $ 140,000 </p>
<h3> BMW 7-Series </h3>
<p> Price: $110,000 </p>
<h3> Cadillac CTS </h3>
<p> Price: $80,000</p>
</body>
</html>

In [1]:
# Sample page kept as a single string for the parser.
# Adjacent literals are concatenated at compile time, so the value is the
# same one-line markup with no whitespace between tags.
html_str = (
    "<!DOCTYPE html>"
    "<html>"
    "<head><title>Luxury Car Prices</title></head>"
    "<body>"
    "<h3><b id='boldest'>Mercedes S-Class</b></h3>"
    "<p> Price: $ 140,000 </p>"
    "<h3> BMW 7-Series </h3>"
    "<p> Price: $110,000 </p>"
    "<h3> Cadillac CTS </h3>"
    "<p> Price: $80,000</p>"
    "</body>"
    "</html>"
)

### Beautiful Soup

In [2]:
# Prerequisites
from bs4 import BeautifulSoup
import requests

In [3]:
# Build the parse tree from the sample markup, naming the parser explicitly
soup = BeautifulSoup(html_str, features='html5lib')

In [5]:
# Printing the soup shows the parsed document serialized back to markup
print(soup)

<!DOCTYPE html>
<html><head><title>Luxury Car Prices</title></head><body><h3><b id="boldest">Mercedes S-Class</b></h3><p> Price: $ 140,000 </p><h3> BMW 7-Series </h3><p> Price: $110,000 </p><h3> Cadillac CTS </h3><p> Price: $80,000</p></body></html>


In [4]:
# Serialize the tree with one tag or string per line, indented by depth, then show it
pretty_markup = soup.prettify()
print(pretty_markup)

<!DOCTYPE html>
<html>
 <head>
  <title>
   Luxury Car Prices
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Mercedes S-Class
   </b>
  </h3>
  <p>
   Price: $ 140,000
  </p>
  <h3>
   BMW 7-Series
  </h3>
  <p>
   Price: $110,000
  </p>
  <h3>
   Cadillac CTS
  </h3>
  <p>
   Price: $80,000
  </p>
 </body>
</html>


### Tags

In [7]:
# Attribute-style access on the soup looks up a tag by name; here the <title> element
tag_object = soup.title
print("tag object: ", tag_object)
# The result is a bs4 Tag object, not a plain string
print("tag object type: ", type(tag_object))

tag object:  <title>Luxury Car Prices</title>
tag object type:  <class 'bs4.element.Tag'>


In [8]:
# If more than one tag has the same name, attribute access returns only the first one
tag_object = soup.h3
print("tag object: ", tag_object)
print("tag object type: ", type(tag_object))

tag object:  <h3><b id="boldest">Mercedes S-Class</b></h3>
tag object type:  <class 'bs4.element.Tag'>


### Children, Parents, Siblings

In [9]:
# Access child: attribute access also works on a Tag, searching inside it.
# Start from the first <h3> and step down to the <b> it contains.
tag_object = soup.h3
tag_child = tag_object.b 
print("Child: ", tag_child)

Child:  <b id="boldest">Mercedes S-Class</b>


In [10]:
# Access parent: .parent steps one level up, from the <b> back to its enclosing <h3>
tag_parent = tag_child.parent
print("Parent: ", tag_parent)

Parent:  <h3><b id="boldest">Mercedes S-Class</b></h3>


In [11]:
# Access sibling: .next_sibling is the next node at the same level of the tree.
# html_str has no whitespace between tags, so the node after the first <h3> is
# the <p> element; with whitespace in the markup it would be a text node instead.
print("Object: ", tag_object)
print("Sibling: ", tag_object.next_sibling)

Object:  <h3><b id="boldest">Mercedes S-Class</b></h3>
Sibling:  <p> Price: $ 140,000 </p>


### HTML Attributes

In [13]:
# A tag's attributes can be accessed like dictionary keys: tag[name] gives the value
tag_child['id']

'boldest'

In [14]:
# See the whole attribute dictionary of the tag (attribute name -> value)
tag_child.attrs

{'id': 'boldest'}

In [15]:
# Attribute getter: like dict.get, returns None instead of raising when the attribute is absent
tag_child.get('id')

'boldest'

### Navigable String

In [17]:
# .string gives the text held inside the <b> tag
tag_string = tag_child.string
print("tag_string: ", tag_string)
# The text is wrapped in bs4's NavigableString type rather than a plain str
print("tag_string type: ", type(tag_string))

tag_string:  Mercedes S-Class
tag_string type:  <class 'bs4.element.NavigableString'>


### find_all()

In [20]:
# find_all collects every matching tag (not just the first) into a list-like result
cars = soup.find_all('h3')
cars

[<h3><b id="boldest">Mercedes S-Class</b></h3>,
 <h3> BMW 7-Series </h3>,
 <h3> Cadillac CTS </h3>]

In [21]:
# First car: the result supports ordinary list indexing
cars[0]

<h3><b id="boldest">Mercedes S-Class</b></h3>

In [22]:
# Find by a specific id: a keyword argument filters on that attribute's value.
# The result is still a list, even when only one tag matches.
soup.find_all(id='boldest')

[<b id="boldest">Mercedes S-Class</b>]

## Download and scrape a web page

In [24]:
# Download the page to scrape.
url = "http://www.cnn.com"

# requests has no default timeout, so without one a stalled connection would
# hang this cell forever. raise_for_status() turns HTTP 4xx/5xx responses into
# an exception instead of silently handing an error page to the parser.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text

In [25]:
# Create a BeautifulSoup object for the downloaded page.
# Bind only soup2: a chained `soup2 = soup = ...` would also overwrite `soup`
# (the sample-HTML tree), so re-running any earlier cell afterwards would
# silently operate on the downloaded page instead.
soup2 = BeautifulSoup(text, 'html5lib')

In [26]:
# Print the target of every anchor; href=True skips <a> tags that have no href,
# so the attribute is always present inside the loop
for link in soup2.find_all('a', href=True):
    print(link['href'])

https://www.cnn.com
https://www.cnn.com/us
https://www.cnn.com/world
https://www.cnn.com/politics
https://www.cnn.com/business
https://www.cnn.com/opinions
https://www.cnn.com/health
https://www.cnn.com/entertainment
https://www.cnn.com/style
https://www.cnn.com/travel
https://www.cnn.com/sports
https://www.cnn.com/cnn-underscored
https://www.cnn.com/science
https://www.cnn.com/climate
https://www.cnn.com/weather
https://www.cnn.com/world/europe/ukraine
https://www.cnn.com/world/middleeast/israel
https://www.cnn.com/us
https://www.cnn.com/world
https://www.cnn.com/politics
https://www.cnn.com/business
https://www.cnn.com/opinions
https://www.cnn.com/health
https://www.cnn.com/entertainment
https://www.cnn.com/style
https://www.cnn.com/travel
https://www.cnn.com/sports
https://www.cnn.com/cnn-underscored
https://www.cnn.com/science
https://www.cnn.com/climate
https://www.cnn.com/weather
https://www.cnn.com/world/europe/ukraine
https://www.cnn.com/world/middleeast/israel
https://www.cnn.

In [27]:
# For every <img> element, show the full tag and then its src on the next line
# (.get yields None for an image without a src attribute)
for link in soup2.find_all('img'):
    print(link, link.get('src'), sep='\n')

<img alt="Butch Wilmore and Suni Williams enter the ISS on June 6, 2024." class="image__dam-img image__dam-img--loading" height="1688" onerror="imageLoadError(this)" onload="this.classList.remove('image__dam-img--loading')" src="https://media.cnn.com/api/v1/images/stellar/prod/screen-shot-2024-06-06-at-3-49-37-pm.jpg?c=16x9&amp;q=h_438,w_780,c_fill" width="3000"/>
https://media.cnn.com/api/v1/images/stellar/prod/screen-shot-2024-06-06-at-3-49-37-pm.jpg?c=16x9&q=h_438,w_780,c_fill
<img alt="still_20688397_3374068.294_still.jpg" class="image__dam-img image__dam-img--loading" height="1080" onerror="imageLoadError(this)" onload="this.classList.remove('image__dam-img--loading')" src="https://media.cnn.com/api/v1/images/stellar/prod/still-20688397-3374068-294-still.jpg?c=16x9&amp;q=h_144,w_256,c_fill" width="1920"/>
https://media.cnn.com/api/v1/images/stellar/prod/still-20688397-3374068-294-still.jpg?c=16x9&q=h_144,w_256,c_fill
<img alt="KARS, TURKIYE - MAY 18: A view of moon, appearing in t