# Request

In [1]:
from urllib.request import urlopen

In [2]:
# requesting html of page1.html
html = urlopen("http://pythonscraping.com/pages/page1.html")

In [3]:
html.getheaders()

[('Date', 'Mon, 04 May 2020 17:58:46 GMT'),
 ('Server', 'Apache'),
 ('Last-Modified', 'Sat, 09 Jun 2018 19:15:58 GMT'),
 ('ETag', '"4121bc8-234-56e3a58b39172"'),
 ('Accept-Ranges', 'bytes'),
 ('Content-Length', '564'),
 ('Cache-Control', 'max-age=1209600'),
 ('Expires', 'Mon, 18 May 2020 17:58:46 GMT'),
 ('Connection', 'close'),
 ('Content-Type', 'text/html')]

In [4]:
html.getheader("Date")

'Mon, 04 May 2020 17:58:46 GMT'

In [5]:
html.geturl()

'http://pythonscraping.com/pages/page1.html'

In [6]:
html.read()

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'

# BeautifulSoup

In [11]:
from urllib.request import urlopen

In [17]:
from bs4 import BeautifulSoup as bs

In [24]:
# request html of page1.html located at web_root/pages/page1.html
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")

In [25]:
html.status

200

In [26]:
# we can see by using beautiful soup obtained html is well formatted where as when we did only html.read it was so messy
bs(html.read())

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

In [29]:
# you cannot read html twice
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
obj = bs(html.read())

In [30]:
obj.h1

<h1>An Interesting Title</h1>

In [32]:
# we can also do
obj.html.body.h1

<h1>An Interesting Title</h1>

# Possible Errors
- The page is not found on the server (or there was some error in retrieving it) -> “404 Page Not Found,” “500 Internal Server Error”  (HTTPError)
- The server is not found -> returns null
if we apply method(find tag) to null we get AttributeError


we need to apply try and exception to prevent code break

In [34]:
type(obj.tag_do_not_exit)

NoneType

In [35]:
obj.tag_do_not_exit.another_tag

AttributeError: 'NoneType' object has no attribute 'another_tag'

# Handling all errors

In [37]:
from urllib.error import HTTPError

In [42]:
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        obj = bs(html.read())
        title = obj.title
    except AttributeError as e:
        return None
    return title

In [43]:
getTitle("http://www.pythonscraping.com/exercises/exercise1.html")

<title>A Useful Page</title>

# findAll 
 - used to find the tag with specific attribute for eg.  bsObj(tagname, attribute)
 
# Extracting text
 - get_text() method is used to find the text present in the extracted tag
 - try to keep data in tags as far as possible. converting them to text should be the last step

In [46]:
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")

In [47]:
obj = bs(html.read())

In [53]:
tag = "span"
attribute = {"class":"green"}
obj.findAll(name=tag, attrs=attribute)

[<span class="green">Anna
 Pavlovna Scherer</span>,
 <span class="green">Empress Marya
 Fedorovna</span>,
 <span class="green">Prince Vasili Kuragin</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">St. Petersburg</span>,
 <span class="green">the prince</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">the prince</span>,
 <span class="green">the prince</span>,
 <span class="green">the prince</span>,
 <span class="green">Prince Vasili</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">the prince</span>,
 <span class="green">Wintzingerode</span>,
 <span class="green">King of Prussia</span>,
 <span class="green">le Vicomte de Mortemart</span>,
 <span class="green">Montmorencys</span>,
 <span class="green">Rohans</span>,
 <span class="green">Abbe Morio</span>,
 <span class="green">the Emperor</span>,
 <span class="green">the prince</span>,
 

In [54]:
noun = obj.findAll(name=tag, attrs=attribute)
for i in noun:
    print(i.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


**If <code>recursion</code> is set to True, the findAll function looks into children, and children’s children, for tags that match your parameters. If it is false, it will look only at the toplevel
tags in your document.**

In [56]:
obj.findAll(id="text")  # keyword argument helps to select particular attribute

[<div id="text">
 "<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
 Buonapartes. But I warn you, if you don't tell me that this means war,
 if you still try to defend the infamies and horrors perpetrated by
 that Antichrist- I really believe he is Antichrist- I will have
 nothing more to do with you and you are no longer my friend, no longer
 my 'faithful slave,' as you call yourself! But how do you do? I see
 I have frightened you- sit down and tell me all the news.</span>"
 <p></p>
 It was in July, 1805, and the speaker was the well-known <span class="green">Anna
 Pavlovna Scherer</span>, maid of honor and favorite of the <span class="green">Empress Marya
 Fedorovna</span>. With these words she greeted <span class="green">Prince Vasili Kuragin</span>, a man
 of high rank and importance, who was the first to arrive at her
 reception. <span class="green">Anna Pavlovna</span> had had a cough for some days. She was, as
 she said, suffering from la gr

In [58]:
obj.findAll("", {"id":"text"})  # statement above is identical to this statement

[<div id="text">
 "<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
 Buonapartes. But I warn you, if you don't tell me that this means war,
 if you still try to defend the infamies and horrors perpetrated by
 that Antichrist- I really believe he is Antichrist- I will have
 nothing more to do with you and you are no longer my friend, no longer
 my 'faithful slave,' as you call yourself! But how do you do? I see
 I have frightened you- sit down and tell me all the news.</span>"
 <p></p>
 It was in July, 1805, and the speaker was the well-known <span class="green">Anna
 Pavlovna Scherer</span>, maid of honor and favorite of the <span class="green">Empress Marya
 Fedorovna</span>. With these words she greeted <span class="green">Prince Vasili Kuragin</span>, a man
 of high rank and importance, who was the first to arrive at her
 reception. <span class="green">Anna Pavlovna</span> had had a cough for some days. She was, as
 she said, suffering from la gr

In [64]:
obj.findAll(class="green")  # class is protected keyword in python

SyntaxError: invalid syntax (<ipython-input-64-04e2b7e62769>, line 1)

In [65]:
obj.findAll(class_="green")  # a clumsy solution

[<span class="green">Anna
 Pavlovna Scherer</span>,
 <span class="green">Empress Marya
 Fedorovna</span>,
 <span class="green">Prince Vasili Kuragin</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">St. Petersburg</span>,
 <span class="green">the prince</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">the prince</span>,
 <span class="green">the prince</span>,
 <span class="green">the prince</span>,
 <span class="green">Prince Vasili</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">the prince</span>,
 <span class="green">Wintzingerode</span>,
 <span class="green">King of Prussia</span>,
 <span class="green">le Vicomte de Mortemart</span>,
 <span class="green">Montmorencys</span>,
 <span class="green">Rohans</span>,
 <span class="green">Abbe Morio</span>,
 <span class="green">the Emperor</span>,
 <span class="green">the prince</span>,
 

### alternative

In [133]:
bs.findAll("", {"class":"green"})  

SyntaxError: unexpected EOF while parsing (<ipython-input-133-b29cda2772b9>, line 1)

# BeautifulSoup Object

1. BeautifulSoup Object
2. Tag Objects
3. Navigable Strings --- these are strings within the tags
4. Comment object

# Children, siblings And Parent

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs

In [3]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")

In [4]:
obj = bs(html)

In [6]:
# obj

In [12]:
list(obj.find("table", {"id":"giftList"}).children)

['\n',
 <tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>,
 '\n',
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 '\n',
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 '\n',
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>


In [16]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
for child in bsObj.find("table",{"id":"giftList"}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


## next_siblings, previous_siblings, parents -- Gives list
## next_sibling, previous_sibling, parent -- gives single element

In [24]:
(list(obj.find("table").tr.next_siblings))

['\n',
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 '\n',
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 '\n',
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td></tr>,
 '\n',
 <tr class="gift" id="

# Using regular expression

In [47]:
import re

In [41]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")

In [42]:
obj = bs(html)

In [53]:
tag = "img"
pattern = re.compile(r"^(\.)+/img/gifts/img[0-9].jpg")  # only find tags that matches this characteristic
attribute = {"src":pattern}
obj.findAll(tag, attribute)

[<img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

### A regular expression can be inserted as any argument in a BeautifulSoup expression,allowing you a great deal of flexibility in finding target elements.

---

# Accessing Attributes

In [62]:
obj.img.attrs

{'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}

In [63]:
obj.img.attrs["src"]

'../img/gifts/logo.jpg'

In [64]:
obj.img.attrs["style"]

'float:left;'

# Six Degrees of Wikipedia solution finder -- Project

In [66]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a",
                        href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Teen_Choice_Awards
/wiki/Rachel_Platten
/wiki/Adult_Top_40
/wiki/List_of_Billboard_number-one_Americana/folk_albums_of_the_2010%27s
/wiki/The_Head_and_the_Heart
/wiki/Conan_(talk_show)
/wiki/Eastern_Time_Zone
/wiki/North_Carolina
/wiki/Governor%27s_Palace,_New_Bern
/wiki/Brunswick_Town,_North_Carolina
/wiki/Pitch_(resin)
/wiki/Ogogoro
/wiki/Urhobo_people
/wiki/Nupe_people
/wiki/Itsekiri_people
/wiki/Igbo_people
/wiki/Mambila_people
/wiki/Chamba_people
/wiki/Banda_people
/wiki/ISBN_(identifier)
/wiki/Business_Process_Model_and_Notation
/wiki/ISO_15398
/wiki/Language_Of_Temporal_Ordering_Specification
/wiki/Film_speed
/wiki/View_camera
/wiki/C-41_process
/wiki/Monolight
/wiki/Guide_number
/wiki/Diagonal_method
/wiki/Color
/wiki/English_in_the_Commonwealth_of_Nations
/wiki/Institute_of_Commonwealth_Studies
/wiki/Westfield_College
/wiki/Adrian_Chiles
/wiki/Birmingham
/wiki/Luxembourg_in_the_Eurovision_Song_Contest
/wiki/Norway_in_the_Eurovision_Song_Contest
/wiki/Fly_on_the_Wings_of_

KeyboardInterrupt: 

# Finding all links of wikipedia article

In [67]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import re

In [69]:
html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon")

In [70]:
obj = bs(html)

In [130]:
tag = "a"
pattern = re.compile("^(/wiki/)[A-z]*(?!:)$")
attribute = {"href":pattern, "class":''}
links = obj.findAll(tag, attribute)

In [131]:
print(len(obj.findAll(tag, attribute)))

281


In [132]:
links[0]

{'href': '/wiki/Philadelphia', 'title': 'Philadelphia'}