# Loading the necessary libraries

In [15]:
import re

import requests
from bs4 import BeautifulSoup as bs

### Load our first page (https://keithgalli.github.io/web-scraping/example.html)

In [22]:
# Fetch the example page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error instead
# of letting an error page be parsed as if it were the real document.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert the response body to a BeautifulSoup object. Naming the parser
# explicitly avoids the "no parser was explicitly specified" warning and keeps
# the parse independent of which optional parsers happen to be installed.
soup = bs(r.content, "html.parser")

# prettify() renders one tag per line, indented by nesting depth, which makes
# the parent / child / sibling relationships easy to read.
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Start using Beautiful Soup to scrape the data

#### We will learn `find` and `find_all` using Beautiful Soup

In [25]:
# Display the parsed document using its plain (non-prettified) repr
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [38]:
# find() returns only the first matching tag in the document
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [39]:
# find_all() collects every matching tag, so the result is always a list-like collection
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [42]:
# Passing a list of tag names matches any of them; find() still returns just one tag
first_header = soup.find(["h1", "h2"])
first_header


<h1>HTML Webpage</h1>

In [43]:
# Swapping the order of the list does not change the result: find() returns whichever
# listed tag occurs first in the document, and <h1> comes before any <h2>
first_header = soup.find(["h2", "h1"])
first_header

<h1>HTML Webpage</h1>

In [44]:
# find_all() with the same list returns every <h1> and <h2> in the document
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [45]:
## find/find_all can also filter on attributes; start with every <p> (paragraph) tag
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [49]:
# attrs maps attribute names to required values: keep only <p> tags with id="paragraph-id"
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [51]:
### find/find_all calls can be nested: a tag returned by find() can itself be searched
body = soup.find("body")
print(body)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>


In [58]:
# Narrow the search to the <div> (a generic HTML container) inside the body
div = body.find("div")
print(div)

# Then search within that <div> for its <h1> header
header = div.find("h1")
header

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>


<h1>HTML Webpage</h1>

In [59]:
### We can search for specific text in our find/find_all calls
# Print the indented document again as a reference for the text searches
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [67]:
# string= accepts a compiled regex: keep only the <p> tags whose text contains "Some"
paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [65]:
# The alternation (h|H) lets one pattern match both "Header" and "header"
headers = soup.find_all("h2", string=re.compile("(h|H)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
## select command: like find_all it returns every match, but it takes a CSS selector

In [71]:
# CSS descendant selector: every <p> nested inside a <div>
soup.select('div  p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]