

# Import Packages

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re

# 1. Souping the Page

In [2]:
#url = "https://www.tutorialspoint.com/index.htm"

url = "https://www.tutorialspoint.com/index.htm"
req = requests.get(url)

soup = BeautifulSoup(req.content, "html.parser")

print(soup.title)

<title>Quality Tutorials, Video Courses, and eBooks - TutorialsPoint</title>


In [7]:
# Extract all urls from the page
for link in soup.find_all('a'):
   print(link.get('href'))

/index.htm
/tutorialslibrary.htm
/codingground.htm
/job_search.php
/whiteboard.htm
/online_dev_tools.htm
/articles/index.php
https://www.tutorialspoint.com/articles/write-and-earn.php
https://www.tutorialspoint.com/market/index.asp
https://www.tutorialspoint.com/latest/certifications
/market/login.jsp
https://www.facebook.com/tutorialspointindia
https://www.instagram.com/tutorialspoint_/
https://twitter.com/tutorialspoint
https://www.youtube.com/channel/UCVLbzhxVTiTLiVKeGV7WEBg
https://www.linkedin.com/company/tutorialspoint/
https://www.tutorialspoint.com
/top-categories.htm
/artificial_intelligence/index.htm
/machine_learning/index.htm
/machine_learning_with_python/index.htm
/data_science/index.htm
/statistics/index.htm
/natural_language_processing/index.htm
/artificial_neural_network/index.htm
/tensorflow/index.htm
/pytorch/index.htm
/matplotlib/index.htm
/numpy/index.htm
/python_pandas/index.htm
/scipy/index.htm
/big_data_analytics/index.htm
/machine_learning_tutorials.htm
/python/

### 1.1 Soupinp from the local file

In [8]:
with open("index.html") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

FileNotFoundError: [Errno 2] No such file or directory: 'index.html'

### 1.2 Souping from a string 

In [9]:
html = '''
<html>
   <head>
      <title>Hello World</title>
   </head>
   <body>
      <h1 style="text-align:center;">Hello World</h1>
   </body>
</html>
'''
soup = BeautifulSoup(html, 'html.parser')

print(soup)


<html>
<head>
<title>Hello World</title>
</head>
<body>
<h1 style="text-align:center;">Hello World</h1>
</body>
</html>



## beautifulsoup basically converts a complex html page into  four major kinds of objects defined in bs4 package.

In [None]:
# Tag
# NavigableString
# BeautifulSoup
# Comments

## 1.3.1 Tag Object

In [13]:
# Name (tag.name) 
#Every tag contains a name and can be accessed through '.name' as suffix. tag.name will return the type of tag it is
soup = BeautifulSoup('<b class="boldest">TutorialsPoint</b>', 'lxml')
print(soup)
tag = soup.html
tag1 = soup.body
print(type(tag))
print(type(tag1))

<html><body><b class="boldest">TutorialsPoint</b></body></html>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>


In [3]:
# if we change the tag name, same will be reflected in the HTML markup generated by the BeautifulSoup.
soup = BeautifulSoup('<b class="boldest">TutorialsPoint</b>', 'lxml')
tag = soup.html
tag.name = "strong"
print (tag)

<strong><body><b class="boldest">TutorialsPoint</b></body></strong>


In [6]:
# Attributes (tag.attrs)
# A tag object can have any number of attributes.
soup = BeautifulSoup('<input type="text" name="name" value="Raju">', 'lxml')
tag = soup.input
print(type(tag.attrs))
print (tag.attrs)

<class 'dict'>
{'type': 'text', 'name': 'name', 'value': 'Raju'}


In [7]:
# We can do all kind of modifications to our tag's attributes (add/remove/modify), using dictionary operators or methods.
soup = BeautifulSoup('<input type="text" name="name" value="Raju">', 'lxml')
tag = soup.input

print (tag.attrs)
tag['value']='Ravi'
print (soup)
tag['id']='nm'
del tag['value']
print (soup)

{'type': 'text', 'name': 'name', 'value': 'Raju'}
<html><body><input name="name" type="text" value="Ravi"/></body></html>
<html><body><input id="nm" name="name" type="text"/></body></html>


In [10]:
# Multi-valued attributes
# Some of the HTML5 attributes can have multiple values. 
css_soup = BeautifulSoup('<p class="body"></p>', 'lxml')
print ("css_soup.p['class']:", css_soup.p['class'])

css_soup = BeautifulSoup('<p class="body bold"></p>', 'lxml')
print ("css_soup.p['class']:", css_soup.p['class'])

print()
#However, if any attribute contains more than one value 
#but it is not multi-valued attributes by any-version of HTML standard, beautiful soup will leave the attribute alone 
id_soup = BeautifulSoup('<p id="body bold"></p>', 'lxml')
print ("id_soup.p['id']:", id_soup.p['id'])
print ("type(id_soup.p['id']):", type(id_soup.p['id']))

css_soup.p['class']: ['body']
css_soup.p['class']: ['body', 'bold']

id_soup.p['id']: body bold
type(id_soup.p['id']): <class 'str'>


## 1.3.2 NavigableString Object

In [11]:
# The NavigableString object represents the contents of a tag.
# It is an object of bs4.element.NavigableString class. 
# To access the contents, use ".string" with tag.
soup = BeautifulSoup("<h2 id='message'>Hello, Tutorialspoint!</h2>", 'html.parser')

print (soup.string)

print (type(soup.string))

Hello, Tutorialspoint!
<class 'bs4.element.NavigableString'>


In [13]:
#A NavigableString can be converted to a Unicode string with str() function.
soup = BeautifulSoup("<h2 id='message'>Hello, Tutorialspoint!</h2>",'html.parser')

tag = soup.h2
string = str(tag.string)
print (string)
print(type(string))

Hello, Tutorialspoint!
<class 'str'>


In [14]:
# the NavigableString is immutable , but can be replaced with another string using replace_with() method.
soup = BeautifulSoup("<h2 id='message'>Hello, Tutorialspoint!</h2>",'html.parser')

tag = soup.h2
tag.string.replace_with("OnLine Tutorials Library")
print (tag.string)

OnLine Tutorials Library


## 1.4 BeautifulSoup Object

In [4]:
# The BeautifulSoup object represents the entire parsed object. 
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

print (soup)
print (soup.name)
print ('type:',type(soup))

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0,user-scalable=yes, maximum-scale=1" name="viewport"/>
<title>Quality Tutorials, Video Courses, and eBooks - TutorialsPoint</title>
<meta content="Learn the latest technologies and programming languages including CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart Grid Technology, Prompt Engineering, Generative AI, Python, DSA, C, C++, Java, PHP, Machine Learning, Data science etc." name="Description">
<meta content="CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart, Grid, Technology, SQL, MySQL, FastAPI, Snowflake, Pyramid, Falcon, XlsxWriter, WebDriverIO, Pygame, HTML, Python, CSS, SQL, JavaScript, How to, PHP, Java, C++, jQuery, Bootstrap, C#, XML, MySQL, NodeJS, React, Angular, R, AI, Git, Data Science, Code Game, Tutorials, Programming,

In [5]:
# Two parsed documents can be combined 
# if you pass a BeautifulSoup object as an argument to a certain function such as replace_with()

obj1 = BeautifulSoup("<book><title>Python</title></book>", features="xml")
obj2 = BeautifulSoup("<b>Beautiful Soup parser</b>", "lxml")

obj2.find('b').replace_with(obj1)
print (obj2)

<html><body><book><title>Python</title></book></body></html>


## 1.5 Comment Object

In [7]:
markup = "<b><!--This is a comment text in HTML--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
print (comment, type(comment))

# The Comment object is a special type of NavigableString object. 
# The prettify() method displays the comment text with special formatting
print (soup.b.prettify())

This is a comment text in HTML <class 'bs4.element.Comment'>
<b>
 <!--This is a comment text in HTML-->
</b>



# 1.6 Reminder: Inspect the Structure before further processing the Soup 

In [None]:
# In order to scrape a web page with BeautifulSoup and Python, 
# your first step for any web scraping project should be to explore the website that you want to scrape. 
# So, first visit the website to understand the site structure before you start extracting the information that's relevant for you

# 1.7 Scrap Web Content

In [None]:
# Scrap through requests
url = "https://www.tutorialspoint.com/index.htm"
req = requests.get(url)

# Scrap through urllib
import urllib.request
response =  urllib.request.urlopen('http://python.org/') 
html = response.read()

# 2. Navigating by Tags

### soup.head

In [9]:
# The soup.head function returns the contents put inside the <head> .. </head> element of a HTML page
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

print(soup.head)

<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0,user-scalable=yes, maximum-scale=1" name="viewport"/>
<title>Quality Tutorials, Video Courses, and eBooks - TutorialsPoint</title>
<meta content="Learn the latest technologies and programming languages including CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart Grid Technology, Prompt Engineering, Generative AI, Python, DSA, C, C++, Java, PHP, Machine Learning, Data science etc." name="Description">
<meta content="CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart, Grid, Technology, SQL, MySQL, FastAPI, Snowflake, Pyramid, Falcon, XlsxWriter, WebDriverIO, Pygame, HTML, Python, CSS, SQL, JavaScript, How to, PHP, Java, C++, jQuery, Bootstrap, C#, XML, MySQL, NodeJS, React, Angular, R, AI, Git, Data Science, Code Game, Tutorials, Programming, Web Development, Training, Learni

### soup.body

In [11]:
# return the contents of body part of HTML page
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

print(soup.body)

<body>
<div class="accent-header">
<nav class="accent-nav library-accent-nav">
<div class="accent-nav__content">
<ul class="accent-nav__list">
<li class="accent-nav__item">
<a class="accent-nav__link" href="/index.htm" title="Home"><i class="fa-sharp fa-light fa-home"></i> Home</a>
</li>
<li class="accent-nav__item mblockMenu">
<a class="accent-nav__link" href="/tutorialslibrary.htm"><svg fill="none" height="14" viewbox="0 0 15 14" width="15" xmlns="http://www.w3.org/2000/svg"><path d="M13.7675 0.781179C13.4295 0.49851 13.0333 0.293734 12.6072 0.18139C12.1811 0.0690464 11.7355 0.0518903 11.3021 0.13114L9.00377 0.548464C8.47315 0.645905 7.9907 0.918899 7.63394 1.32358C7.27624 0.9182 6.79241 0.645144 6.2605 0.548464L3.96582 0.13114C3.53235 0.0518209 3.08675 0.0687522 2.66056 0.180735C2.23436 0.292719 1.83799 0.497016 1.4995 0.779166C1.16101 1.06132 0.888682 1.41442 0.701786 1.81349C0.51489 2.21255 0.417998 2.64782 0.417969 3.08848L0.417969 9.57865C0.418003 10.2827 0.665102 10.9644 1.1161

### extract specific tags directly with the tag name soup.tag

In [12]:
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

print(soup.p)
print(soup.h1)

<p class="mt">Newly Added and Updated Tutorials</p>
<h1 class="hero__title">Simple &amp; Easy <span class="text-accent-700">Learning</span></h1>


## 2.1 Tag.contents

In [13]:
# A Tag object may have one or more PageElements. 
# The Tag object's contents property returns a list of all elements included in it.
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

tag = soup.head
print (tag.contents)

['\n', <meta charset="utf-8"/>, '\n', <meta content="IE=edge" http-equiv="X-UA-Compatible"/>, '\n', <meta content="width=device-width,initial-scale=1.0,user-scalable=yes, maximum-scale=1" name="viewport"/>, '\n', <title>Quality Tutorials, Video Courses, and eBooks - TutorialsPoint</title>, '\n', <meta content="Learn the latest technologies and programming languages including CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart Grid Technology, Prompt Engineering, Generative AI, Python, DSA, C, C++, Java, PHP, Machine Learning, Data science etc." name="Description">
<meta content="CodeWhisperer, Google Assistant, Dall-E, Business Intelligence, Claude AI, SwiftUI, Smart, Grid, Technology, SQL, MySQL, FastAPI, Snowflake, Pyramid, Falcon, XlsxWriter, WebDriverIO, Pygame, HTML, Python, CSS, SQL, JavaScript, How to, PHP, Java, C++, jQuery, Bootstrap, C#, XML, MySQL, NodeJS, React, Angular, R, AI, Git, Data Science, Code Game, Tutorials, Programming, Web D

# 2.2 Tag.children

In [16]:
#The structure of tags in a HTML script is hierarchical. The elements are nested one inside the other. 
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

tag = soup.ul
print(list(tag.children))

['\n', <li class="accent-nav__item">
<a class="accent-nav__link" href="/index.htm" title="Home"><i class="fa-sharp fa-light fa-home"></i> Home</a>
</li>, '\n', <li class="accent-nav__item mblockMenu">
<a class="accent-nav__link" href="/tutorialslibrary.htm"><svg fill="none" height="14" viewbox="0 0 15 14" width="15" xmlns="http://www.w3.org/2000/svg"><path d="M13.7675 0.781179C13.4295 0.49851 13.0333 0.293734 12.6072 0.18139C12.1811 0.0690464 11.7355 0.0518903 11.3021 0.13114L9.00377 0.548464C8.47315 0.645905 7.9907 0.918899 7.63394 1.32358C7.27624 0.9182 6.79241 0.645144 6.2605 0.548464L3.96582 0.13114C3.53235 0.0518209 3.08675 0.0687522 2.66056 0.180735C2.23436 0.292719 1.83799 0.497016 1.4995 0.779166C1.16101 1.06132 0.888682 1.41442 0.701786 1.81349C0.51489 2.21255 0.417998 2.64782 0.417969 3.08848L0.417969 9.57865C0.418003 10.2827 0.665102 10.9644 1.11619 11.5049C1.56729 12.0455 2.19376 12.4106 2.88643 12.5366L6.6664 13.2239C7.30618 13.3402 7.96169 13.3402 8.60148 13.2239L12.3844 

In [17]:
# we can use a for loop to traverse the children
for child in tag.children:
   print (child)



<li class="accent-nav__item">
<a class="accent-nav__link" href="/index.htm" title="Home"><i class="fa-sharp fa-light fa-home"></i> Home</a>
</li>


<li class="accent-nav__item mblockMenu">
<a class="accent-nav__link" href="/tutorialslibrary.htm"><svg fill="none" height="14" viewbox="0 0 15 14" width="15" xmlns="http://www.w3.org/2000/svg"><path d="M13.7675 0.781179C13.4295 0.49851 13.0333 0.293734 12.6072 0.18139C12.1811 0.0690464 11.7355 0.0518903 11.3021 0.13114L9.00377 0.548464C8.47315 0.645905 7.9907 0.918899 7.63394 1.32358C7.27624 0.9182 6.79241 0.645144 6.2605 0.548464L3.96582 0.13114C3.53235 0.0518209 3.08675 0.0687522 2.66056 0.180735C2.23436 0.292719 1.83799 0.497016 1.4995 0.779166C1.16101 1.06132 0.888682 1.41442 0.701786 1.81349C0.51489 2.21255 0.417998 2.64782 0.417969 3.08848L0.417969 9.57865C0.418003 10.2827 0.665102 10.9644 1.11619 11.5049C1.56729 12.0455 2.19376 12.4106 2.88643 12.5366L6.6664 13.2239C7.30618 13.3402 7.96169 13.3402 8.60148 13.2239L12.3844 12.5366C13

# 2.3 Tag.find_all()

In [None]:
#This method returns a result set of contents of all the tags matching with the argument tag provided.
resp = requests.get("https://www.tutorialspoint.com/index.htm")
soup = BeautifulSoup(resp.content, 'html.parser')

#find all a tag
result = soup.find_all("a")
print (result)

