### Load the libraries

In [2]:
import requests
from bs4 import BeautifulSoup as bs

### Load our first page

In [5]:
# loading the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# convert to a beautiful soup object
soup = bs(r.content)

# print the html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Start using Beautiful Soup to scrape

#### find vs find_all

In [7]:
# soup.find() returns the first element that matches the element specified
first_header = soup.find('h2')
first_header

<h2>A Header</h2>

In [8]:
# soup.find_all() returns a list of all elements that matches the element specified
second_header = soup.find_all('h2')
second_header

[<h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# pass in a list of elements to look for - find
headers = soup.find(['h1', 'h2']) # returns the first matching element found from the list. order doesnt matter
headers

<h1>HTML Webpage</h1>

In [11]:
# pass in a list of elements to look for - find_all
headers = soup.find_all(['h1', 'h2']) # returns all the matching elements found from the list. order doesnt matter
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [14]:
# passing attributes
headers = soup.find_all("p", attrs = {"id": "paragraph-id"}) # find all paragraph elements with id paragraph-id
headers

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [16]:
# nested fetch
body = soup.find("body")
pInsideBody = body.find('p')
print(pInsideBody)

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>


In [19]:
# search for an element that contains a specific test
element = soup.find_all('p', string = 'Some italicized text') # the string attribute has to provided with the entire text content of the element
element

[<p><i>Some italicized text</i></p>]

In [21]:
# regular expressions can be used to search for any keyword instead of entire text content
import re

paragraphs = soup.find_all('p', string = re.compile('Some'))
paragraphs

headers = soup.find_all('h2', string = re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

#### select (CSS selector)

In [23]:
content = soup.select('p') # fetch a paragraph element
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
content = soup.select('div p') # fetch a paragraph element thats inside a div element
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [25]:
content = soup.select('h2 ~ p') # fetch a paragraph element thats next to an h2 element
content

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [29]:
content = soup.select('#paragraph-id b') # fetch a bold element thats inside a paragraph element with id 'paragraph-id'
content

[<b>Some bold text</b>]

In [33]:
content = soup.select('body > p') # Selects all <p> elements where the parent is a body element. Immediate children
print(content)

# we can nest through the results
for ele in content:
    print(ele.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [34]:
# grab by element with specific property
soup.select('[align = middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

#### Extract content from HTML

In [41]:
# scrape text from a single element
header = soup.find('h2')
print(header.string)

# scrape text from multiple child elements
header = soup.find('div')
print(header.get_text())

A Header

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [44]:
# get a specific property of an element
link = soup.find('a')
print(link)

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>


In [48]:
# get a specific property from an element
link = soup.find('a')
print(link['href'])

para = soup.select('p')
print(para[2]['id'])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


#### Code Navigation

In [50]:
# we can use the dot operator to navigate the elements
soup.body.div.h1.string

# there is a concept of parent, child, siblings and there are methods to find all of these
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Excercises

#### load the webpage

In [51]:
# loading the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

# convert to a beautiful soup object
soup = bs(r.content)

# print the html
print(soup.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

#### task 1: fetch all social links on the page (3 different ways possible)

In [59]:
# method 1
socials = soup.select('.socials .social a')
socials = [social['href'] for social in socials]
socials

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [71]:
socials = soup.find('ul', attrs = {"class": "socials"})
socials = socials.find_all('a')
socials = [social['href'] for social in socials]
socials

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

#### task 2: Scrape a table

In [106]:
import pandas as pd
table = soup.find('table', attrs = {"class": "hockey-stats"})
table_header = table.find('thead').find_all('th')
table_headings = [column.string for column in table_header]
print(table_headings)


table_body = table.find('tbody')
table_rows = table_body.find_all('tr')
data = []
for row in table_rows:
    tds = row.find_all('td')
    rowData = [td.get_text().strip() for td in tds]
    data.append(rowData)

df = pd.DataFrame(data, columns = table_headings)
df


['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


#### task 3: Scrape fun facts that has the word 'is'

In [113]:
# painful way
fun_facts = soup.find('ul', {"class": "fun-facts"})
fun_facts = fun_facts.find_all('li')
fun_facts = list(map(lambda x: x.get_text().strip() if 'is' in x.get_text().strip() else '', fun_facts))
print(fun_facts)

['', 'Middle name is Ronald', '', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]


In [124]:
# easy way
fun_facts = soup.select('.fun-facts li')
fun_facts = [fact.find(string = re.compile('is')) for fact in fun_facts]
fun_facts = [fact.find_parent().get_text() for fact in fun_facts if fact]
print(fun_facts)
# fun_facts = fun_facts.find_all('li', string = re.compile('is'))
# fun_facts

['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]


#### task 4: download the images

In [130]:
import urllib.request
images = soup.find_all('img')
base_path = 'https://keithgalli.github.io/web-scraping/'
count = 0
for image in images:
    link = base_path + re.sub(r'^.*?i', 'i', image['src'])    
    urllib.request.urlretrieve(link, f"{count}.jpg")
    count += 1

#### task 5: scrape the secret (based on elements and their ids)

In [140]:
base_path = 'https://keithgalli.github.io/web-scraping/'
files = soup.select('.block a')
secret_message = []
for file in files:
    link = base_path + file['href']
    r = requests.get(link)
    sub_soup = bs(r.content)
    secret_message.append(sub_soup.select('#secret-word')[0].string)
print(' '.join(secret_message))

Make sure to smash that like button and subscribe !!!


### Room for more