In [1]:
!pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
     ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
     ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
     ---------- ---------------------------- 30.7/112.2 kB 1.3 MB/s eta 0:00:01
     ---------- ---------------------------- 30.7/112.2 kB 1.3 MB/s eta 0:00:01
     ------------- ----------------------- 41.0/112.2 kB 487.6 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/112.2 kB 487.6 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/112.2 kB 487.6 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/112.2 kB 487.6 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/112.2 kB 252.2 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/112.2 kB 252.2 kB/s eta 0:00:01
     ----------------------- ------------- 71.7/112.2 kB 206.9 kB/s eta 0:00:01
     ----------------------- ------------- 71.7/112.2 kB 

In [2]:
# installing beautiful soup for html web scraping
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1264 sha256=24f05b9eae4628b7dcfd6be7022b4a81aa68767b8d2c4db076d27314961fe29d
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\d4\c8\5b\b5be9c20e5e4503d04a6eac8a3cd5c2393505c29f02bea0960
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [3]:
from bs4 import BeautifulSoup # this module helps in web scraping.
import requests  # this module helps us to download a web page

In [4]:
# Store a small HTML document as a Python string.
# It contains three <h3> player names, each followed by a <p> with that player's salary.

html="<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3><b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"

In [5]:
# To parse a document, pass it into the BeautifulSoup constructor. The BeautifulSoup object
# represents the document as a nested data structure. The second argument names the parser to use.

soup = BeautifulSoup(html, 'html5lib')

First, the document is converted to Unicode (a character standard that, like ASCII, maps characters to numbers) and HTML entities are converted to Unicode characters. Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. The BeautifulSoup object can create other types of objects. In this lab, we will cover BeautifulSoup and Tag objects, which for the purposes of this lab can be treated as identical. Finally, we will look at NavigableString objects.

In [7]:
# We can use the method prettify() to display the HTML in its nested structure:
# each tag is put on its own line, indented according to its depth in the tree.

print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>



# Tags

In [8]:
# Let's say we want the title of the page and the name of the top paid player. We can use a Tag object.
# A Tag object corresponds to an HTML tag in the original document; soup.title is the <title> tag.

tag_object=soup.title
print("tag object:",tag_object)

tag object: <title>Page Title</title>


In [9]:
# Show the Python type of the object returned by soup.title

print("tag object type:",type(tag_object))

tag object type: <class 'bs4.element.Tag'>


In [10]:
# If there is more than one tag with the same name, attribute access returns the first one.
# Here the first <h3> holds the highest-paid player:

tag_object = soup.h3
tag_object

<h3><b id="boldest">Lebron James</b></h3>

In [11]:
# We can access a child of the tag, i.e. navigate down the branch, by using the child's
# tag name as an attribute: .b is the <b> tag inside the <h3>

tag_child =tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [12]:
# You can access the parent with the .parent attribute: the parent of the <b> tag is the <h3> tag

parent_tag=tag_child.parent
parent_tag

<h3><b id="boldest">Lebron James</b></h3>

In [14]:
# The parent of tag_object (the <h3> tag) is the <body> element

tag_object.parent

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [15]:
# The next sibling of tag_object is the <p> element that follows the <h3>
# (the HTML string has no whitespace between </h3> and <p>, so no text node sits in between)

sibling_1=tag_object.next_sibling
sibling_1

<p> Salary: $ 92,000,000 </p>

In [17]:
# sibling_2 is the next <h3> header element; it is a sibling of both sibling_1 and tag_object

sibling_2 = sibling_1.next_sibling
sibling_2

<h3> Stephen Curry</h3>

# HTML Attributes

In [18]:
# A tag can have attributes: the <b id="boldest"> tag has an attribute id whose value is boldest.
# You can access a tag's attributes by treating the tag like a dictionary:

tag_child['id']

'boldest'

In [19]:
# The whole attribute dictionary is available directly as .attrs

tag_child.attrs

{'id': 'boldest'}

In [20]:
# We can also obtain the value of a tag's attribute with the get() method, as with a Python dictionary

tag_child.get('id')

'boldest'

# Navigable String

In [21]:
# A string corresponds to a bit of text or content within a tag.
# Beautiful Soup uses the NavigableString class to contain this text.
# In our HTML we can obtain the name of the first player by reading the .string of the Tag object tag_child:

tag_string=tag_child.string
tag_string

'Lebron James'

In [22]:
# A NavigableString is similar to a Python string.
# The main difference is that it also supports some Beautiful Soup features.
# We can convert it to a plain Python str object with str():

unicode_string = str(tag_string)
unicode_string

'Lebron James'

# Filter

Filters allow you to find complex patterns; the simplest filter is a string. In this section we will pass a string to different filter methods, and Beautiful Soup will perform a match against that exact string.

In [23]:
# HTML for a small table of rocket launches: one header row followed by three data rows.
# NOTE(review): both "Florida" links are followed by <a> where </a> was presumably meant;
# this is why the parsed output shows an extra, empty <a> tag in those cells.
table="<table><tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr><tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr></table>"

In [25]:
# Parse the table fragment with html5lib. As the output shows, the parser wraps the fragment
# in <html>/<head>/<body>, inserts a <tbody>, and closes the unclosed <a> tags.
table_bs = BeautifulSoup(table, 'html5lib')
table_bs

<html><head></head><body><table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr></tbody></table></body></html>

# Find All
The find_all() method looks through a tag's descendants and retrieves all descendants that match your filters. The method signature is `find_all(name, attrs, recursive, string, limit, **kwargs)`.

### Name
When we set the name parameter to a tag name, the method will extract all the tags with that name, together with their children.

In [26]:
# The find_all() method finds all the elements in a soup object that match the given criteria.
# In this case, the criterion is the tag name 'tr', so the result is the list of table rows.

table_rows=table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>]

In [29]:
# first_row is the first element of table_rows, i.e. the header row of the table

first_row = table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [31]:
# .td gives the first <td> child of the row, here the cell with id="flight"
first_row.td

<td id="flight">Flight No</td>

In [32]:
# Walk the list of <tr> tags: each element corresponds to one row of the table.

for row_number, table_row in enumerate(table_rows):
    print("row", row_number, "is", table_row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>
row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>


In [33]:
# For every row (<tr>), list its cells (<td>) together with their column index.

for row_number, table_row in enumerate(table_rows):
    print("row", row_number)
    for col_number, td in enumerate(table_row.find_all('td')):
        print('colunm', col_number, "cell", td)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
colunm 2 cell <td>80 kg</td>


In [34]:
# Passing a list as `name` matches every tag whose name is in the list;
# the <tr> and <td> tags come back interleaved, in document order.
list_input = table_bs.find_all(name=["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

### Attributes

In [35]:
# Keyword arguments filter on attributes: find every tag whose id attribute is "flight"
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [37]:
# We can find all tags whose href attribute is the Florida Wikipedia URL
# (these are the <a> links to Florida, not images)

list_input=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [39]:
# If we set the href argument to True, the code finds all tags that have an href attribute,
# regardless of what its value is:
table_bs.find_all(href = True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [41]:
# The string argument searches for text instead of tags: the result is the list of
# strings in the document that are exactly "Florida"
table_bs.find_all(string="Florida")

['Florida', 'Florida']

### Find
The find_all() method scans the entire document looking for all results. If you are looking for only one element, you can use the find() method instead, which returns the first matching element in the document.

In [42]:
# HTML fragment containing two tables, distinguished by their class attribute: 'rocket' and 'pizza'.
# NOTE(review): the string ends without closing the second <table> and <p>; the parser adds the
# missing </table>, as the output of the find() call for the pizza table shows.
two_tables="<h3>Rocket Launch </h3><p><table class='rocket'><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table></p><p><h3>Pizza Party  </h3><table class='pizza'><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td >144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr>"

In [43]:
# Create a BeautifulSoup object two_tables_bs, this time with the 'html.parser' parser

two_tables_bs= BeautifulSoup(two_tables, 'html.parser')

In [44]:
# find() returns only the first match: here the first <table> in the document (the rocket table)

two_tables_bs.find("table")

<table class="rocket"><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table>

In [47]:
# Find the table whose class is 'pizza'.
# The keyword is spelled class_ because `class` is a reserved word in Python.

two_tables_bs.find("table",class_='pizza')

<table class="pizza"><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td>144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr></table>

# Downloading And Scraping The Contents Of A Web Page

In [48]:
# URL of the web page whose contents we will download

url = "http://www.ibm.com"

In [49]:
# Download the page with an HTTP GET and store the response body, as text, in `data`.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on a 4xx/5xx response instead of parsing an error page.

response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [50]:
# Create a soup object from the downloaded HTML text held in the variable 'data'

soup = BeautifulSoup(data,"html5lib")  

In [52]:
# Scrape all links.
# In HTML a link (anchor) is represented by the <a> tag; href=True keeps only
# the <a> tags that carry an href attribute.

anchors = soup.find_all('a', href=True)
for anchor in anchors:
    print(anchor.get('href'))


https://newsroom.ibm.com/2023-08-10-IBM-Completes-Acquisition-of-Apptio-Inc
https://www.ibm.com/community/ibm-techxchange-conference
https://www.ibm.com/products/watsonx-ai
https://www.ibm.com/products/watsonx-data
https://www.ibm.com/products/spss-statistics/pricing
https://www.ibm.com/sports/usopen
https://www.ibm.com/cloud?lnk=flatitem
https://www.ibm.com/products
https://www.ibm.com/consulting
https://www.ibm.com/about
https://www.ibm.com/


### Scrape all image tags

In [53]:
# In HTML an image is represented by the <img> tag.
# For each image, print the whole tag first and then the value of its 'src' attribute.

for img in soup.find_all('img'):
    print(img)
    print(img.get('src'))

<img alt="Tennis player returns serve in Wimbledon stadium" class="bx--image__img" src="https://1.dam.s81c.com/p/0c627169442d5243/ibm_watsonx_data_closeup_still_4k.jpg.global.sr_1x1.jpg"/>
https://1.dam.s81c.com/p/0c627169442d5243/ibm_watsonx_data_closeup_still_4k.jpg.global.sr_1x1.jpg
<img alt="Concentric illustration showing watsonx.ai capabilities" aria-describedby="bx--image-3" class="bx--image__img" src="https://1.dam.s81c.com/p/0c3ce2dfcccd1f24/watsonx-data-square.jpg"/>
https://1.dam.s81c.com/p/0c3ce2dfcccd1f24/watsonx-data-square.jpg
<img alt="Illustration of workers digital planning with blue dots and post-its" aria-describedby="bx--image-4" class="bx--image__img" src="https://1.dam.s81c.com/p/0c3ce2dfcccd1f25/watsonx-ai-square.jpg"/>
https://1.dam.s81c.com/p/0c3ce2dfcccd1f25/watsonx-ai-square.jpg
<img alt="Illustration of people at large digital dashboard" aria-describedby="bx--image-5" class="bx--image__img" src="https://1.dam.s81c.com/p/0b5258b292cc8c3c/ibm-SPSS-home-card.p

### Scrape data from HTML tables

In [54]:
# The URL below points to a page containing an HTML table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [56]:
# NOTE: Before proceeding to scrape a web site, you need to examine the contents and the way data is organized on the website.
# Open the above url in your browser and check how many rows and columns there are in the color table.

# Get the contents of the webpage in text format and store them in a variable called data1.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on a 4xx/5xx response instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
data1 = page.text

In [57]:
# Parse the color-codes page that was downloaded into `data1`.
# (`data` holds the HTML fetched earlier from www.ibm.com, which is not the page with the color table.)
soup = BeautifulSoup(data1, "html5lib")

In [58]:
# Find an HTML table in the web page.
# In HTML a table is represented by the tag <table>; find() returns the first one.

table = soup.find('table') 

In [61]:
# Print "name--->hex code" for every row of the color table.
NAME_COL, CODE_COL = 2, 3  # zero-based positions of the color-name and hex-code cells
for row in table.find_all('tr'):  # in HTML a table row is represented by the tag <tr>
    cols = row.find_all('td')  # in HTML a table cell is represented by the tag <td>
    if len(cols) <= CODE_COL:
        # A row without enough <td> cells (for example a <th>-only header row)
        # cannot be indexed below, so skip it instead of raising IndexError.
        continue
    # get_text() returns the cell's text even when the cell contains nested tags,
    # a case in which .string would be None.
    color_name = cols[NAME_COL].get_text()
    color_code = cols[CODE_COL].get_text()
    print("{}--->{}".format(color_name, color_code))
    
    

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF
