# A tutorial to get familiar with Beautiful Soup (BS) operations.

In [1]:
## Fetch a live page and get its title
from bs4 import BeautifulSoup
import requests

url = "https://www.tutorialspoint.com/index.htm"
# requests applies no timeout by default; without one this cell can hang
# indefinitely if the server never responds.
req = requests.get(url, timeout=10)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
req.raise_for_status()

# Parse the raw response body with Python's built-in HTML parser.
soup = BeautifulSoup(req.content,'html.parser')

# Last expression of the cell: displays the page's <title> tag.
soup.title

<title>Quality Tutorials, Video Courses, and eBooks</title>

In [2]:
# Print every <a> (anchor) tag in the web page.
# Note: this prints the full markup of each tag, not just its URL.

for link in soup.find_all('a'):
    print(link)

<a class="accent-nav__link" href="/index.htm" title="Home">
<svg fill="white" height="1em" viewbox="0 0 576 512" xmlns="https://www.w3.org/2000/svg"><path d="M303.5 13.7L288 .5 272.5 13.7l-264 224 31.1 36.6L64 253.5V488v24H88 488h24V488 253.5l24.5 20.8 31.1-36.6-264-224zM112 464V212.8L288 63.5 464 212.8V464H384V296 272H360 216 192v24V464H112zm128 0V320h96V464H240z"></path></svg> Home
						</a>
<a class="accent-nav__link" href="/codingground.htm" title="Coding Ground">
<svg fill="white" height="1em" viewbox="0 0 640 512" xmlns="https://www.w3.org/2000/svg"><path d="M376.5 1.8l-7.3 22.8-144 448-7.3 22.8 45.7 14.7 7.3-22.8 144-448 7.3-22.8L376.5 1.8zm61.6 135.3l17.5 16.4L564.9 256 455.6 358.5l-17.5 16.4 32.8 35 17.5-16.4 128-120L635.1 256l-18.7-17.5-128-120-17.5-16.4-32.8 35zm-269-35l-17.5 16.4-128 120L4.9 256l18.7 17.5 128 120 17.5 16.4 32.8-35-17.5-16.4L75.1 256 184.4 153.5l17.5-16.4-32.8-35z"></path></svg> Coding Ground
						</a>
<a class="accent-nav__link" href="/job_search.php" tit

In [3]:
# Print the target URL of every link on the page.
# href=True limits the search to <a> tags that actually carry an href
# attribute, so anchors without one are skipped instead of printing "None".
for link in soup.find_all('a', href=True):
    print(link.get('href'))

/index.htm
/codingground.htm
/job_search.php
/whiteboard.htm
/online_dev_tools.htm
/articles/index.php
https://www.tutorialspoint.com/articles/write-and-earn.php
/market/login.jsp
https://www.facebook.com/tutorialspointindia
https://www.instagram.com/tutorialspoint_/
https://twitter.com/tutorialspoint
https://www.youtube.com/channel/UCVLbzhxVTiTLiVKeGV7WEBg
https://www.linkedin.com/company/tutorialspoint/
https://www.tutorialspoint.com
/tutorialslibrary.htm
artificial_intelligence/index.htm
machine_learning/index.htm
machine_learning_with_python/index.htm
data_science/index.htm
statistics/index.htm
natural_language_processing/index.htm
artificial_neural_network/index.htm
tensorflow/index.htm
pytorch/index.htm
matplotlib/index.htm
numpy/index.htm
python_pandas/index.htm
scipy/index.htm
big_data_analytics/index.htm
machine_learning_tutorials.htm
python/index.htm
java/index.htm
cplusplus/index.htm
cprogramming/index.htm
php/index.htm
go/index.htm
kotlin/index.htm
r/index.htm
asp.net/index

In [4]:
# Alternative: parse a web page stored locally (uncomment to run)

# from bs4 import BeautifulSoup
# 
# with open("index.html") as fp:
#     soup = BeautifulSoup(fp, 'html.parser')
# 
# print(soup)

## Beautiful Soup - Kinds of objects

#### Name (tag.name)

In [5]:
from bs4 import BeautifulSoup

# The markup is only a <b> fragment, yet soup.html exists after parsing with
# 'lxml': the parsed tree has <html>/<body> elements wrapped around the fragment.
soup = BeautifulSoup('<b class="boldest">TutorialsPoint</b>', 'lxml')
# `tag` is reused by the next cells.
tag = soup.html
print (type(tag))

<class 'bs4.element.Tag'>


In [6]:
# .name is the element's tag name, as a string.
tag.name

'html'

In [7]:
# Change the tag name. Assigning to .name renames the element in the parsed
# tree itself, which is why the displayed markup now starts with <strong>.
tag.name = 'strong'
tag

<strong><body><b class="boldest">TutorialsPoint</b></body></strong>

#### Attributes (tag.attrs)

In [8]:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<input type="text" name="name" value="Raju">', 'lxml')
tag = soup.input

# .attrs holds the tag's attributes as a dict of attribute name -> value.
print (tag.attrs)

{'type': 'text', 'name': 'name', 'value': 'Raju'}


In [9]:
# Modify an attribute: a tag supports dict-style item assignment.
tag['value']='ken'

# Printing the whole soup shows that the change is reflected in the document.
print(soup)

<html><body><input name="name" type="text" value="ken"/></body></html>


In [10]:
from bs4 import BeautifulSoup

# The 'class' attribute is returned as a list of class names, even when the
# element has a single class. Parse both variants and show the result of each.
for markup in ('<p class="body"></p>', '<p class="body bold"></p>'):
    css_soup = BeautifulSoup(markup, 'lxml')
    print("css_soup.p['class']:", css_soup.p['class'])

css_soup.p['class']: ['body']
css_soup.p['class']: ['body', 'bold']


#### NavigableString object

In [11]:
from bs4 import BeautifulSoup
soup = BeautifulSoup("<h2 id='message'>Hello, Tutorialspoint!</h2>", 'html.parser')

# The document holds a single <h2> containing a single piece of text, and
# soup.string yields that text here.
print (soup.string)

# The text is wrapped in Beautiful Soup's NavigableString type.
print (type(soup.string))

Hello, Tutorialspoint!
<class 'bs4.element.NavigableString'>


In [12]:
soup = BeautifulSoup("<h2 id='message'>Hello, Tutorialspoint!</h2>",'html.parser')

# `tag` is reused by the next cell.
tag = soup.h2
print(tag.string)  # the text inside the tag
print(tag['id'])  # a single attribute, looked up by name
print(tag.attrs)  # all attributes, as a dict

Hello, Tutorialspoint!
message
{'id': 'message'}


In [13]:
# replace_with() swaps the string inside the tag for new text;
# tag.string then returns the replacement.
tag.string.replace_with("OnLine Tutorials Library")
print (tag.string)

OnLine Tutorials Library


#### Comment object

In [14]:
# The only content of <b> is an HTML comment, so .string is a Comment object;
# printing it shows the comment text without the <!-- --> delimiters.
markup = "<b><!--This is a comment text in HTML--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
print (comment, type(comment))

This is a comment text in HTML <class 'bs4.element.Comment'>


## Navigating by Tags

#### soup.head

In [15]:
# Load the sample page from disk. The encoding is passed explicitly because
# open() otherwise uses the platform's locale encoding, which differs between
# machines. NOTE(review): assumes the file is UTF-8 -- confirm.
with open('../data/index.html', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp,'html.parser')

# Last expression of the cell: displays the <head> element.
soup.head

<head>
<title>TutorialsPoint</title>
<script>
         document.write("Welcome to TutorialsPoint");
      </script>
</head>

#### soup.body

In [16]:
# The document's <body> element, including everything nested inside it.
soup.body

<body>
<h1>Tutorialspoint Online Library</h1>
<p><b>It's all Free</b></p>
</body>

In [17]:
# Tag names can be chained as attributes to walk down the tree: body -> h1.
print(soup.body.h1)
# .string gives just the text inside the <h1>.
print(soup.body.h1.string)

<h1>Tutorialspoint Online Library</h1>
Tutorialspoint Online Library


#### soup.p

In [18]:
# soup.p returns the document's <p> element.
print(soup.p)
# The <p> contains only a <b>, which in turn contains only text;
# .string returns that text.
print(soup.p.string)

<p><b>It's all Free</b></p>
It's all Free


#### Tag.contents

In [19]:
# .contents is a list of the tag's direct children; the '\n' entries are the
# line breaks between the child tags.
soup.head.contents

['\n',
 <title>TutorialsPoint</title>,
 '\n',
 <script>
          document.write("Welcome to TutorialsPoint");
       </script>,
 '\n']

#### Tag.children

In [20]:
# Switch to the second sample page; `soup` now refers to index2.html.
# The encoding is passed explicitly because open() otherwise uses the
# platform's locale encoding, which differs between machines.
# NOTE(review): assumes the file is UTF-8 -- confirm.
with open("../data/index2.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# `tag` (the <ul> element) is reused by the next cell.
tag = soup.ul
# Wrap .children in list() so the children are displayed as a list.
list(tag.children)

['\n',
 <li>Accounts</li>,
 '\n',
 <ul>
 <li>Anand</li>
 <li>Mahesh</li>
 </ul>,
 '\n',
 <li>HR</li>,
 '\n',
 <ul>
 <li>Rani</li>
 <li>Ankita</li>
 </ul>,
 '\n']

In [21]:
# Iterate over the children. The blank lines in the output come from the
# '\n' strings that sit between the child tags.
for child in tag.children:
    print(child)



<li>Accounts</li>


<ul>
<li>Anand</li>
<li>Mahesh</li>
</ul>


<li>HR</li>


<ul>
<li>Rani</li>
<li>Ankita</li>
</ul>




#### Tag.find_all()

In [22]:
# find_all() collects every matching tag -- here, every <a> in the document.
results = soup.find_all('a')
results

[<a class="prog" href="https://www.tutorialspoint.com/java/java_overview.htm" id="link1">Java</a>,
 <a class="prog" href="https://www.tutorialspoint.com/cprogramming/index.htm" id="link2">C</a>,
 <a class="prog" href="https://www.tutorialspoint.com/python/index.htm" id="link3">Python</a>,
 <a class="prog" href="https://www.tutorialspoint.com/javascript/javascript_overview.htm" id="link4">JavaScript</a>,
 <a class="prog" href="https://www.tutorialspoint.com/ruby/index.htm" id="link5">C</a>]

## Find Elements by ID

In [23]:
# find() returns a single matching element; .attrs shows its attributes.
soup.find(id='nm').attrs

{'type': 'text', 'id': 'nm', 'name': 'name'}

In [24]:
# The same lookup with find_all(): the result is a list, here with one element.
soup.find_all(id='nm')

[<input id="nm" name="name" type="text"/>]

#### select() method

In [25]:
# select() takes a CSS selector; "#nm" selects by id. The result is a list.
obj = soup.select("#nm")
print (obj)

[<input id="nm" name="name" type="text"/>]


In [26]:
# select_one() returns the matching element itself rather than a list.
obj = soup.select_one("#nm")
print (obj)

<input id="nm" name="name" type="text"/>


## Find Elements by Class

In [27]:
# Match elements by class by passing an attrs dict: here, class "mainmenu".
soup.find_all(attrs={'class':'mainmenu'})

[<li class="mainmenu">Accounts</li>, <li class="mainmenu">HR</li>]

In [28]:
# A list of values matches elements having either class; results appear in
# document order.
soup.find_all(attrs={'class':['mainmenu','submenu']})

[<li class="mainmenu">Accounts</li>,
 <li class="submenu">Anand</li>,
 <li class="submenu">Mahesh</li>,
 <li class="mainmenu">HR</li>,
 <li class="submenu">Rani</li>,
 <li class="submenu">Ankita</li>]

In [29]:
# CSS class selector: ".heading" selects elements with class "heading".
soup.select('.heading')

[<h2 class="heading">Departmentwise Employees</h2>]

In [30]:
# select_one() returns a single element even though several elements have
# class "submenu" -- here, the first one in the document.
soup.select_one('.submenu')

<li class="submenu">Anand</li>

## Searching the Tree

In [31]:
# Sample document for the tree-searching examples; parsed in the next cell.
# Note: the markup ends without closing </body> and </html> tags.
html = """
<html><head><title>TutorialsPoint</title></head>
   <body>
      <p class="title"><b>Online Tutorials Library</b></p>

      <p class="story">TutorialsPoint has an excellent collection of tutorials on:
      <a href="https://tutorialspoint.com/Python" class="lang" id="link1">Python</a>,
      <a href="https://tutorialspoint.com/Java" class="lang" id="link2">Java</a> and
      <a href="https://tutorialspoint.com/PHP" class="lang" id="link3">PHP</a>;
      Enhance your Programming skills.</p>

      <p class="tutorial">...</p>
"""

In [33]:
# Parse the sample document held in `html`.
soup = BeautifulSoup(html,'html.parser')
# prettify() renders the tag with each element and string on its own line,
# indented according to nesting depth.
print(soup.head.prettify())

<head>
 <title>
  TutorialsPoint
 </title>
</head>

