# Load Libraries

In [128]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

# Load Page Content

In [46]:
# Load the content
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')

# Convert into a BS object
soup = bs(r.content)
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Find & FindAll

In [25]:
# Find individual tags

first_header = soup.find('h2')
print(first_header)

headers = soup.find_all('h2')
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [30]:
# Find a list of tags

first_headers = soup.find(['h1', 'h2'])
print(first_headers)

headers = soup.find_all(['h1', 'h2'])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [32]:
# Pass attributes to BS

paragraph = soup.find_all('p', attrs = {'id': 'paragraph-id'})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [36]:
# Nest through find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
print(header)

<h1>HTML Webpage</h1>


In [44]:
# Search for spesific strings

paragraphs = soup.find_all('p', string = re.compile('Some'))
print(paragraphs)

headers = soup.find_all('h2', string = re.compile('(h|H)eader'))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


# CSS Selector

In [49]:
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [51]:
content = soup.select('div p')
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]


In [53]:
paragraphs = soup.select('h2 ~ p')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [54]:
paragraphs = soup.select('p#paragraph-id b')
print(paragraphs)

[<b>Some bold text</b>]


In [56]:
paragraphs = soup.select('body > p ')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [58]:
# Get by css property
alignment = soup.select('[align=middle]')
print(alignment)

[<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>]


# Get HTML properties

In [68]:
header = soup.find('h2')
print(header.string)

# Gets a little tricky with nested elements
div = soup.find('div')
print(div.get_text())

A Header

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [78]:
link = soup.find('a')
print(link['href'])

paragraph = soup.select('p#paragraph-id')
print(paragraph[0]['id'])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


# Code Navigation

In [86]:
soup.body.div.p.a

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [96]:
# Terms: Parent, Siblings, Childs

div = soup.body.find('div').find_next_siblings()
print(div)

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


# Practices

### Practice #1
Grab all social links on the webpage

In [101]:
# Load page
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')

soup = bs(r.content)

In [116]:
# Using select

socials = [tag['href'] for tag in soup.select('ul.socials li a')]
print(socials)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [124]:
# Using find
socials = [tag['href'] for tag in soup.find('ul', attrs = {'class': 'socials'}).find_all('a')]
print(socials)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


### Practice #2
Scrape the MIT table

In [153]:
table = soup.select('table.hockey-stats')[0]
columns = table.find('thead').find_all('th')

column_names = [column.string for column in columns]

rows = table.find('tbody').find_all('tr')
data = []
for row in rows:
    td = row.find_all('td')
    data.append([str(data.get_text()).strip() for data in td])

dataset = pd.DataFrame(data, columns = column_names)
dataset

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Practice #3
Grab all fun facts that contain 'is'

In [171]:
facts = soup.select('ul.fun-facts li')
facts = [fact.find_all(string = re.compile('is')) for fact in facts]
facts = [fact for fact in facts if fact != []]
print(facts)

[['Middle name is Ronald'], ['Dunkin Donuts coffee is better than Starbucks'], ['A favorite book series of mine is '], ['Current video game of choice is '], ["The band that I've seen the most times live is the "]]


### Practice #3
Download an Image

In [190]:
img = soup.select('.row .column img')[0]
img_url = 'https://keithgalli.github.io/web-scraping' + '/' + img['src']

img_data = requests.get(img_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)