<a href="https://colab.research.google.com/github/kaushanr/python3-docs/blob/main/Section_32.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping with BeautifulSoup

In [None]:
# Web Scraping

  # involves programmatically grabbing data from a web page
  # involves 3 steps - download, extract data, process data

  # Why Scrape?

    # there's data on a site that you want to store or analyse
    # you can't get by other means - like using an API
    # you want to programmatically grab the data (instead of lots of manual copying/pasting)

  # Ethics?

    # some websites don't want people scraping them
    # best practice: consult the robots.txt file
    # if making many requests, time them out
    # if you're too aggressive, you're IP can be blocked
    

In [None]:
# HTML - hyper text markup language

def my_load(file_name,url = 'https://raw.githubusercontent.com/kaushanr/python3-docs/main/docs/'):
  '''my_load(file_name,url = DefaultParameter)'''
  import requests
  url = url + file_name
  r = requests.get(url)
  with open(file_name, 'w') as f:
      f.write(r.text)
  return print(f'Loaded : {file_name}, to workspace.')

my_load('sample.html')

def my_html(html_file):
  from IPython.display import display,HTML,IFrame
  with open(html_file) as f:
    return display(HTML(f.read()))

my_html('sample.html')
with open('sample.html') as f:
  print(f'********************< {f.name} Body >***********************')
  print(f.read())


# CSS - cascading style sheets

  # CSS can be added to HTML documents in 3 ways:
  # Inline - by using the style attribute inside HTML elements
  # Internal - by using a <style> element in the <head> section
  # External - by using a <link> element to link to an external CSS file

# styling with reference to internal head tags
my_load('web_1.html')
my_html('web_1.html')
with open('web_1.html') as f:
  print(f'********************< {f.name} Body >***********************')
  print(f.read())

# styling with reference to tag IDs - IDs are used once on a page - cannot be reused
my_load('web_2.html')
my_html('web_2.html')
with open('web_2.html') as f:
  print(f'********************< {f.name} Body >***********************')
  print(f.read())

# styling with reference to classes - can be used to impact a multiple selection of items unlike IDs
my_load('web_3.html')
my_html('web_3.html')
with open('web_3.html') as f:
  print(f'********************< {f.name} Body >***********************')
  print(f.read())

Loaded : sample.html, to workspace.


********************< sample.html Body >***********************
<!DOCTYPE html>
<html>
  <head>
    <title>A Bread Recipe</title>
  </head>
  <body>
    <h1>Sourdough Loaf</h1>
    <em>written by Colt Steele</em>
    <h3>Ingredients</h3>
    <ul>
      <li>Flour</li>
      <li>Water</li>
      <li>Yeast</li> 
    </ul>
  </body>
</html>

Loaded : web_1.html, to workspace.


********************< web_1.html Body >***********************

<!DOCTYPE html>
<html>
  <head>
    <title>A Bread Recipe</title>
  </head>
  <body>
    <h1 style="color:purple; background:yellow;">Sourdough Loaf</h1>
    <em>written by Colt Steele</em>
    <h3>Ingredients</h3>
    <ul>
      <li style="color:red; font-size:20px;">Flour</li>
      <li style="color:red; font-size:20px;">Water</li>
      <li style="color:red; font-size:20px;">Yeast</li> 
    </ul>
  </body>
</html>

Loaded : web_2.html, to workspace.


********************< web_2.html Body >***********************
<!DOCTYPE html>
<html>
  <head>
    <title>A Bread Recipe</title>
    <style>
      #first{
        color:blue;
      }
    </style>
  </head>
  <body>
    <h1 style="color:purple; background:yellow;">Sourdough Loaf</h1>
    <em id="first">written by Colt Steele</em>
    <h3>Ingredients</h3>
    <ul>
      <li style="color:red; font-size:20px;">Flour</li>
      <li style="color:red; font-size:20px;">Water</li>
      <li style="color:red; font-size:20px;">Yeast</li> 
    </ul>
  </body>
</html>

Loaded : web_3.html, to workspace.


********************< web_3.html Body >***********************
<!DOCTYPE html>
<html>
  <head>
    <title>A Bread Recipe</title>
    <style>
      #first{
        color:blue;
      }
      .green{
        color:teal;
      }
    </style>
  </head>
  <body>
    <h1 style="color:purple; background:yellow;">Sourdough Loaf</h1>
    <em id="first">written by Colt Steele</em>
    <h3>Ingredients</h3>
    <ul>
      <li style="color:red; font-size:20px;">Flour</li>
      <li style="color:red; font-size:20px;">Water</li>
      <li style="color:red; font-size:20px;">Yeast</li> 
      <li class="green">Bowl</li> 
      <li class="green">Utensils</li> 
    </ul>
  </body>
</html>



In [None]:
# BeautifulSoup

  # used for extracting data from HTML
  # lets us navigate through HTML with Python
  # does not download HTML - we need to download HTML using the 'request' module

#!pip install bs4

# Parsing and navigating HTML

  # parse - analyse (a string or text) into logical syntactic components

  # Syntax - BeatifulSoup(html_string,"html.parser") - parse HTML
  # once parsed, there are several ways to navigate
    # by tag name
    # find - returns one matching tag
    # find_all - returns a list of matching tags 

from bs4 import BeautifulSoup

my_load('bs_basics.html')

with open('bs_basics.html') as f:
  html = f"'''{f.read()}'''"

soup = BeautifulSoup(html,'html.parser')
print(soup) # returns the parsed version of the HTML
print(type(soup))

print()

print(soup.body)

print('\n','finding a tag within <body>','\n')

print(soup.body.div) # only a single <div> returned

print('\n','finding with .find','\n')

print(soup.find('div')) # returns a single instance of <div>

print('\n','finding with .find_all','\n')

print(soup.find_all('div')) # returns all instances of <div>

print('\n','finding with IDs','\n')

print(soup.find(id='first'))

print('\n','finding with Classes','\n')

print(soup.find_all(class_='special')) # special syntax used to refer to HTML classes in Python - 'class_'

print('\n','finding with data attributes','\n')

print(soup.find_all(attrs={'data-example':'yes'}))

Loaded : bs_basics.html, to workspace.
'''<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8" class="special"/>
<title>First HTML Page</title>
</head>
<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li>This list item is not special.</li>
<li class="special super-special">This list item is also special.</li>
</ol>
<div data-example="yes">bye</div>
</body>
</html>
'''
<class 'bs4.BeautifulSoup'>

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li>This list item is not special.</li>
<li class="special super-special">This list item is also special.</li>
</ol>
<div data-example="yes">bye</div>
</body>

 finding a tag within <body> 

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>

 finding with .find 

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>

 fi

In [None]:
# Using CSS Style Selectors with BeautifulSoup

  # Navigating with CSS selectors

    # 'select' - returns a list of elements matching a CSS selector
    # select by id - #id_name
    # select by class - .class_name
    # select children - div > p
    # select decendants - div p

print('\n','finding with CSS id','\n')

print(soup.select('#first')) # returns a list with the object contained within

print('\n','finding with CSS tag name','\n')

print(soup.select('div')) # returns all matching objects within a list

print('\n','finding with CSS data attribute','\n')

print(soup.select('[data-example]'))


 finding with CSS id 

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>]

 finding with CSS tag name 

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]

 finding with CSS data attribute 

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]


In [None]:
# Accessing data in elements

  # .get_text - access the inner text in an element
  # .name - tag name
  # .attrs- dictionary of attributes
  # can access attribute values using brackets

el = soup.select('.special')[1]
print(el)
print(el.get_text()) # returns the text item within the tag

for text in soup.select('.special'):
  print(text.get_text())

a = soup.select('meta')[0] 
print(a)
print(a.get_text()) # returns nothing instead of raising error, if no text found in tag
print('after <meta> tag text call...')

print()

f = soup.select('.special')
for item in f:
  print(item.name) # prints all tag names that belong to class - .special
  print(item.attrs) # accesses all the attributes within each returned tag item

print()

print(soup.find('div')) # returns a bs4 object
print(soup.find('div').attrs) # returns a dictionary of attribute keys and values
attr = soup.find('div')['id'] # finding the value of the attr given the key
print(attr)

<li class="special">This list item is special.</li>
This list item is special.

This list item is special.
This list item is also special.
<meta charset="utf-8" class="special"/>

after <meta> tag text call...

meta
{'charset': 'UTF-8', 'class': ['special']}
li
{'class': ['special']}
li
{'class': ['special', 'super-special']}

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
{'id': 'first'}
first


In [None]:
# Navigating with BeautifulSoup

  # navigation performed in two method - via tags, via searching

  # HTML hierachly involves parent, child and sibling
  # parent - <body>
  # child - <div> within <body>
  # sibling - <ol> and <div> are siblings with <body> parent

# Via Tags

print(html)

print()

print('Accessing nested tags within <body>****************************************************','\n')
data = soup.body.contents # accessing nested tags within <body>
print(data) # returns a list of the data

print()

print('Accessing child and sibling contents within a nested parent tag************************','\n')
inner_data = data[1].contents # data[1] since data[0] returns '\n' element, which is first in list
print(inner_data) # returns the contents within the <div> tag

print()

print('Moving between siblings****************************************************************','\n')
sibling = data[1].next_sibling.next_sibling # .previous_sibling to move backwards
print(sibling)

print()

print('Moving up the hierachy from child to parent********************************************','\n')
child = soup.find(class_='super-special')
print(child,'\n')
print(child.parent,'\n')
print(child.parent.parent,'\n')

# Via Searching

print()

print('Moving up the hierachy from child to parent using search*******************************','\n')
data = soup.find(id='first')
print(data,'\n')
print(data.find_next_sibling(),'\n') # .find_next_sibling() - returns the next instance of an existing valid HTML element
print(data.find_next_sibling().find_next_sibling(),'\n') 

print()

print('Moving up back between siblings using search*******************************************','\n')
data = soup.select('[data-example]')[1]
print(data,'\n')
print(data.find_previous_sibling(),'\n')

print()

print('Passing an argument to .find_next/previous_sibling(arg)********************************','\n')
data = soup.select('.super-special')[0]
print(data,'\n')
print(data.find_previous_sibling(),'\n') # returns the previous in-line element
print(data.find_previous_sibling(class_='special'),'\n') # returns the previous item that belongs to the passed class argument

print()

print('Finding the parent of a child**********************************************************','\n')
data = soup.find('h3')
print(data,'\n')
print(data.parent,'\n')  # returns the contents of parent element
print(data.find_parent(),'\n') 
print(data.find_parent('body'),'\n') # skips all parents until it finds the requested parent

'''<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" class="special">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li>This list item is not special.</li>
    <li class="special super-special">This list item is also special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
'''

Accessing nested tags within <body>**************************************************** 

['\n', <div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, '\n', <ol>
<li class="special">This list item is special.</li>
<li>This list item is not special.</li>
<li class="special super-special">This list item is also special.</li>
</ol>, '\n', <div data-example="yes">bye</div>, '\n']

Accessing child and sibling contents within a nested parent tag************************ 

['\n', <h3 data-example="yes">hi</h3>, '\n', <

In [147]:
# Web Scraping Example with BeautifulSoup

  # use https://www.rithmschool.com/blog for scraping project

import requests
from bs4 import BeautifulSoup
import csv

response = requests.get('https://www.rithmschool.com/blog')
print(response) # response id - 200 - okay from server
print(response.ok,'\n')

data = response.text
#print(data)
soup = BeautifulSoup(data,'html.parser')
#print(soup)
articles = soup.find_all('article') # saves all <article> element contents in a list

with open('blog_data.csv','w') as file:
  headers = ['Title','URL','Date']
  csv_writer = csv.DictWriter(file,fieldnames=headers)
  csv_writer.writeheader()

  for article in articles:
    #print(article.find('a'))
    title = article.find('a').get_text()
    url = article.find('a')['href']
    datetime = article.find('time').attrs['datetime'] # returns a dictionary
    #print(datetime)
    date = datetime.split(' ')[0]
    #print(date)
    print(title,url,date)
    
    csv_writer.writerow(
        {
            'Title':title,
            'URL':url,
            'Date':date
        }
    )
  
print('\n')

print('blog_data.csv contents...','\n')

with open('blog_data.csv') as file:
  csv_reader = csv.reader(file)
  for row in csv_reader:
    print(row)

<Response [200]>
True 

Rithm Grads Win First Place in StepZen Hackathon /blog/grads-win-stepzen-hackathon 2022-09-06
Choosing the Right Bootcamp /blog/choosing-a-bootcamp 2022-06-23
Interview: Rithm Grads at Google /blog/grads-at-google 2022-05-27
Three Rithm Students Accepted to the 2022 Pinterest Engineering Internship /blog/pinterest-internship 2022-05-13
The Top 5 Questions People Ask About Rithm School /blog/top-5-admissions-questions 2022-03-21
Announcing Rithm's 2022 Scholarship Fund /blog/2022-scholarship 2021-12-08
October 2021 Women's Scholarship Fund /blog/october-2021-womens-scholarship-fund 2021-08-18
R19: 100% Outcomes Within 4 Months Of Graduating /blog/r19-100-outcomes-within-4-months-of-graduating 2021-07-13
Student Interview: R20 On Remote Company Projects  /blog/student-interview-clever-counsel-company-projects 2021-05-25
Good Ideas for Better Variable Names /blog/good-ideas-for-better-variable-names 2021-05-04
TypeScript: How To Get Started /blog/typescript-how-to-

In [171]:
# Web Crawling and Scraping across multiple pages

import requests
from bs4 import BeautifulSoup
import csv
from time import sleep

response = requests.get(f'https://www.rithmschool.com/blog')
data = response.text
soup = BeautifulSoup(data,'html.parser')

current = soup.find('span',class_='current')
next = soup.find('span',class_='next').contents[1]['href']

print()
print(f'Current Page : Home-Page [{current}]','\n')


with open('blog_data_crawler.csv','w') as file:
  headers = ['Title','URL','Date']
  csv_writer = csv.DictWriter(file,fieldnames=headers)
  csv_writer.writeheader()


while next:
  
  with open('blog_data_crawler.csv','a') as file:
    headers = ['Title','URL','Date']
    csv_writer = csv.DictWriter(file,fieldnames=headers)
    articles = soup.find_all('article') # saves all <article> element contents in a list

    for article in articles:
      title = article.find('a').get_text()
      url = article.find('a')['href']
      datetime = article.find('time').attrs['datetime'] # returns a dictionary
      date = datetime.split(' ')[0]
      print(title,url,date)
      
      csv_writer.writerow(
          {
              'Title':title,
              'URL':url,
              'Date':date
          }
      )

  try:
    next = soup.find('span',class_='next').contents[1]['href']
    current = next
  except AttributeError as err:
    print()
    print(f'End of pages: {err}','\n')
    break

  print()
  print(f'Current Page : {current[11:]} [{current}]','\n')

  sleep(2) # delays recurring requests for 2 seconds

  response = requests.get(f'https://www.rithmschool.com{next}')
  data = response.text
  soup = BeautifulSoup(data,'html.parser')


Current Page : Home-Page [<span class="page current">
  1
</span>] 

Rithm Grads Win First Place in StepZen Hackathon /blog/grads-win-stepzen-hackathon 2022-09-06
Choosing the Right Bootcamp /blog/choosing-a-bootcamp 2022-06-23
Interview: Rithm Grads at Google /blog/grads-at-google 2022-05-27
Three Rithm Students Accepted to the 2022 Pinterest Engineering Internship /blog/pinterest-internship 2022-05-13
The Top 5 Questions People Ask About Rithm School /blog/top-5-admissions-questions 2022-03-21
Announcing Rithm's 2022 Scholarship Fund /blog/2022-scholarship 2021-12-08
October 2021 Women's Scholarship Fund /blog/october-2021-womens-scholarship-fund 2021-08-18
R19: 100% Outcomes Within 4 Months Of Graduating /blog/r19-100-outcomes-within-4-months-of-graduating 2021-07-13
Student Interview: R20 On Remote Company Projects  /blog/student-interview-clever-counsel-company-projects 2021-05-25
Good Ideas for Better Variable Names /blog/good-ideas-for-better-variable-names 2021-05-04
TypeScrip