## Load in the necessary libraries

In [2]:
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4

## Load our first page

In [5]:
# Load the webpage content
r = requests.get("https://lgtromm.github.io/ws_example/")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out HTML
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    "Link to more interesting example: "
    <a href="https://github.com/lgtromm?tab=repositories">
     lgtromm github
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful Soup to Scrape 

### find and find_all 

In [9]:
first_header = soup.find("h2")

headers = soup.find_all("h2") #create list of h2 element
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [11]:
# Pass in a list of elements to look for
first_header = soup.find(["h1", "h2"])

headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [14]:
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id">
 <b>Some bold text</b>
 </p>]

In [18]:
# You can nest find/find_all calls
body = soup.find("body")
div = body.find('div')
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [33]:
# You can search specific strings in our find/find_all calls 
import re

headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (CSS selector)

In [35]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   "Link to more interesting example: "
   <a href="https://github.com/lgtromm?tab=repositories">
    lgtromm github
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [36]:
content = soup.select("div p")
content

[<p>
                 "Link to more interesting example: "
                 <a href="https://github.com/lgtromm?tab=repositories">lgtromm github
                 </a>
 </p>]

In [37]:
paragraphs = soup.select("h2 ~ p") # paragraphs directly after h2
paragraphs

[<p>
 <i>Some italicized text</i>
 </p>,
 <p id="paragraph-id">
 <b>Some bold text</b>
 </p>]

In [38]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [40]:
paragraphs = soup.select("body > p")
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p>
<i>Some italicized text</i>
</p>, <p id="paragraph-id">
<b>Some bold text</b>
</p>]
[<i>Some italicized text</i>]
[]


In [42]:
# Grab by element with specific property
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>
                 "Link to more interesting example: "
                 <a href="https://github.com/lgtromm?tab=repositories">lgtromm github
                 </a>
 </p>
 </div>]

## Get different properties of the HTML 

In [49]:
# use .string
header = soup.find("h2")
header.string

# If multiple child elements use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  "Link to more interesting example: "
  <a href="https://github.com/lgtromm?tab=repositories">
   lgtromm github
  </a>
 </p>
</div>


HTML Webpage

                "Link to more interesting example: "
                lgtromm github
                




In [53]:
# Get a specific property from an element
link = soup.find("a")
link['href']

paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

## Code navigation 

In [56]:
# Path Syntax
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   "Link to more interesting example: "
   <a href="https://github.com/lgtromm?tab=repositories">
    lgtromm github
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [58]:
# Know the terms: Parent, Sibling, Child

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p>
 <i>Some italicized text</i>
 </p>,
 <h2>Another header</h2>,
 <p id="paragraph-id">
 <b>Some bold text</b>
 </p>]

## Exercise 

In [74]:
## Load the webpage
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

# Convert to a beautiful soup object
webpage = bs(r.content)

# Print out html
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Grab all of the social links from the webpage

Do this in at least 3 different ways(find/find_all & select)

In [95]:
links = webpage.select("ul.socials a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [101]:
ulist = webpage.find("ul", attrs={"class": "socials"})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [103]:
links = webpage.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scrape the Table

In [None]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
pd.DataFrame(l, columns=["A", "B", ...])

In [116]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0] #don't want this to be in a list
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = table.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)
    
df = pd.DataFrame(l, columns=column_names)
df.loc[df['Team'] != "Did not play"].sum()

S                              2014-152015-162016-172018-19
Team      MIT (Mass. Inst. of Tech.)MIT (Mass. Inst. of ...
League                        ACHA IIACHA IIACHA IIACHA III
GP                                                   179128
G                                                      3155
A                                                     91510
TP                                                  1221015
PIM                                                   20288
+/-                                                       0
                                                       ||||
POST                                                       
GP                                                         
G                                                          
A                                                          
TP                                                         
PIM                                                        
+/-                                     

In [120]:
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all fun facts that use word "is"

In [129]:
facts = webpage.select("ul.fun-facts li")
facts_with_is = [fact.find_all(string=re.compile("is")) for fact in facts]
facts_with_is = [fact for fact in facts_with_is if fact]
facts_with_is

[['Middle name is Ronald'],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]