# 02. Web scraping with BeautifulSoup

### 1. Basic usage of BeautifulSoup

In [1]:
# BeautifulSoup import
from bs4 import BeautifulSoup

#### Understanding html structure

In [2]:
html = '''
<html>
    <body>
        <h1>What is scraping?</h1>
        <p id="one">page1</p>
        <p id="two">page2</p>
        <span class="redColor">
            <p>page3</p>
        </span>

        <ul>
            <li><a href="www.daum.net">daum</a></li>
            <li><a href="www.naver.com">naver</a></li>
        </ul>
    </body>
</html>
'''


In [3]:
# create BS object with data and parser
soup = BeautifulSoup(html, "html.parser")

In [4]:
soup.html

<html>
<body>
<h1>What is scraping?</h1>
<p id="one">page1</p>
<p id="two">page2</p>
<span class="redColor">
<p>page3</p>
</span>
<ul>
<li><a href="www.daum.net">daum</a></li>
<li><a href="www.naver.com">naver</a></li>
</ul>
</body>
</html>

In [5]:
print(soup.html.h1)
print(soup.html.p)

<h1>What is scraping?</h1>
<p id="one">page1</p>


In [6]:
type(soup.html.p)

bs4.element.Tag

In [7]:
print(soup.html.p.next_sibling.next_sibling)
print(soup.html.span.p.string)

<p id="two">page2</p>
page3


In [8]:
print(soup.find(id="one"))
print(soup.find(id="one").string)

<p id="one">page1</p>
page1


In [9]:
'''
select()

- search with css selector
- return type: list
- use values with index
'''

print(soup.select(".redColor"))
print(soup.select('.redColor p')[0].string)

[<span class="redColor">
<p>page3</p>
</span>]
page3


In [10]:
# find_all: find all tags and save to list
links = soup.find_all("a")
print(links)

for a in links:
    href = a.attrs['href']
    text = a.string
    print("***", href, "---", text)

[<a href="www.daum.net">daum</a>, <a href="www.naver.com">naver</a>]
*** www.daum.net --- daum
*** www.naver.com --- naver


In [11]:
# <p id="two">page2</p>
print(soup.select_one('#two'))
print(soup.select_one('#two').string)

<p id="two">page2</p>
page2


In [12]:
# html file open
html = open("./sample_html/bs4_test.html", encoding="UTF-8-sig")

# create BS object with data and parser
soup = BeautifulSoup(html.read(), "html.parser")

In [13]:
# extract data with css selector
print(soup.select_one("li:nth-of-type(8)").string)
print(soup.select_one("#ve-list > li:nth-of-type(4)").string)

아보카도
아보카도


In [14]:
print(soup.select("#ve-list > li[data-lo='us']")[1].string)
print(soup.select("#ve-list > li.black")[1].string)

아보카도
아보카도


In [15]:
'''
extract data with find()

- Complicated document has many of duplicated attributes, so searching with multiple attributes would be needed.
- Compose with atrributes with dict type
'''
att_datas = {"data-lo":"us", "class":"black"}

In [16]:
print(soup.find("li", att_datas).string)
print(soup.find(id="ve-list").find("li", att_datas).string)

아보카도
아보카도


#### Search information from real-web

In [17]:
from urllib.request import urlopen

In [18]:
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bs_obj= BeautifulSoup(html.read(), "html.parser")

print(bs_obj.h1)

<h1>An Interesting Title</h1>


In [7]:
print(bs_obj.html.body.h1)

<h1>An Interesting Title</h1>


In [19]:
try:
    html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
except HTTPError as e:
    print(e)
else:
    pass

In [20]:
if html is None:
    print("URL is not found")
else:
    pass

In [21]:
try:
    bad_content = bs_obj.find("nonExisting").anotherTag
    print(bad_content)
except AttributeError as e:
    print("Tag was not found")
else:
    if badContent == None:
        print("Tag was not found")
    else:
        print(bad_content)

Tag was not found


In [22]:
from urllib.error import HTTPError

def get_title(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title

title = get_title("http://www.pythonscraping.com/exercises/exercise1.html")

if title == None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title</h1>


In [23]:
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs_obj = BeautifulSoup(html)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [24]:
name_list = bs_obj.findAll("span", {"class":"green"})

for name in name_list:
    print(name.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [27]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs_obj = BeautifulSoup(html)

In [28]:
for child in bs_obj.find("table", {"id": "giftList"}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [29]:
for child in bs_obj.find("table", {"id": "giftList"}).descendants:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<th>
Item Title
</th>

Item Title

<th>
Description
</th>

Description

<th>
Cost
</th>

Cost

<th>
Image
</th>

Image



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<td>
Vegetable Basket
</td>

Vegetable Basket

<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!

<span class="excitingNote">Now with super-colorful bell peppers!</span>
Now with super-colorful bell peppers!


<td>
$15.00
</td>

$15.00

<td>
<img src="../img/gifts/img1.jpg"

In [30]:
for sibling in bs_obj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [31]:
print(bs_obj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())


$15.00



#### BS with Regex

In [32]:
import re
images = bs_obj.findAll("img", {"src":re.compile("\.\./img/gifts/img.*\.jpg")})
print(images)

for image in images:
    print(image["src"])

[<img src="../img/gifts/img1.jpg"/>, <img src="../img/gifts/img2.jpg"/>, <img src="../img/gifts/img3.jpg"/>, <img src="../img/gifts/img4.jpg"/>, <img src="../img/gifts/img6.jpg"/>]
../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


### 2. Basic scraping samples

In [33]:
f1 = open('./sample_html/html_test.html', 'r', encoding='utf-8')
html = f1.read()
f1.close()

In [34]:
soup = BeautifulSoup(html, 'html.parser')
soup.find('a', attrs={'class':'BS_English'}).get('href')

'http://www.pythonforbeginners.com/python-on-the-web/beautifulsoup-4-python/'

In [35]:
title = soup.find('title').text
title

' Online Data Scraping Test'

In [36]:
f2 = open('./sample_html/html_test2.html', 'r', encoding='utf-8')
html2 = f2.read()

f2.close()

In [37]:
soup2 = BeautifulSoup(html2, 'html.parser')
soup2.find_all('p')[3]

<p> text 4 </p>

In [38]:
soup2.find('div', attrs={'class': 'table2'}).find_all('p')[1]

<p> text 5 </p>

In [39]:
import re
soup.find_all('a', {'class':re.compile(r'BS')})

[<a class="BS_English" href="http://www.pythonforbeginners.com/python-on-the-web/beautifulsoup-4-python/"> BS4 for beginners</a>,
 <a class="BS_Korean" href="http://coreapython.hosting.paran.com/etc/beautifulsoup4.html"> BS4 for beginners (한글버전)</a>]

In [40]:
soup.find_all('a', {'class':re.compile(r'English')})

[<a class="BS_English" href="http://www.pythonforbeginners.com/python-on-the-web/beautifulsoup-4-python/"> BS4 for beginners</a>]

In [41]:
soup.find('head').contents[1].text

' Online Data Scraping Test'

In [42]:
soup.find('title').parent

<head>
<title> Online Data Scraping Test</title>
</head>

In [43]:
soup.find('p').next_sibling.next_sibling

<p class="description">This code will be used as HTML source code for the BS lecture. For more detailed information about BS, you can click
			<a class="BS_English" href="http://www.pythonforbeginners.com/python-on-the-web/beautifulsoup-4-python/"> BS4 for beginners</a></p>

In [44]:
soup.find_all('p')[1].previous_sibling.previous_sibling

<p class="title">HTML code for the lecture</p>

### 3. Simple scraping with IMDB web site

In [45]:
import requests

url = 'http://www.imdb.com/title/tt1396484/?ref_inth_ov_tt'
r = requests.get(url)

In [46]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(r.text, 'html.parser')
print(soup.title)

movie_title = soup.title.text
movie_title = movie_title.split('-')[0]
print(movie_title)

<title>It (2017) - IMDb</title>
It (2017) 


In [47]:
import json

url ='http://www.imdb.com/title/tt1856101/?ref_=inth_ov_tt'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

user_rating = json.loads(soup.find('script', attrs={'type': 'application/ld+json'}).text)['aggregateRating']['ratingValue']
print(user_rating)

8


In [48]:
url ='http://www.imdb.com/title/tt1856101/?ref_=inth_ov_tt'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

user_rating = json.loads(soup.find('script', attrs={'type': 'application/ld+json'}).text)['aggregateRating']['ratingValue']
print(user_rating)

8


### 4. Move to more big site

In [49]:
import ssl

In [50]:
html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon", context=ssl._create_unverified_context())

bs_obj = BeautifulSoup(html)
for link in bs_obj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


/wiki/Wikipedia:Protection_policy#semi
#mw-head
#searchInput
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
#cite_note-1
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
http://baconbros.com/
#cite_note-2
#cite_note-actor-3
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/X-Men:_First_Class
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/w

In [51]:
import re

for link in bs_obj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy
/wiki/The_Guardian
/wi

In [52]:
import datetime
import random

random.seed(datetime.datetime.now())

def get_links(article_url):
    html = urlopen("http://en.wikipedia.org" + article_url, context=ssl._create_unverified_context())
    bs_obj = BeautifulSoup(html)
    return bs_obj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = get_links("/wiki/Kevin_Bacon")

while len(links) > 0:
    new_article = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(new_article)
    links = get_links(new_article)

/wiki/Jeff_Bridges
/wiki/Sacha_Baron_Cohen


KeyboardInterrupt: 

In [53]:
pages = set()

def get_links(page_url):
    global pages
    html = urlopen("http://en.wikipedia.org" + page_url, context=ssl._create_unverified_context())
    bs_obj = BeautifulSoup(html)
    for link in bs_obj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                new_page = link.attrs['href']
                print(new_page)
                pages.add(new_page)
                get_links(new_page)
                
get_links("")

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi


KeyboardInterrupt: 

In [54]:
pages = set()

def get_links(page_url):
    global pages
    
    html = urlopen("http://en.wikipedia.org" + page_url, context=ssl._create_unverified_context())
    bs_obj = BeautifulSoup(html)
    
    try:
        print(bs_obj.h1.get_text())
        print(bs_obj.find(id ="mw-content-text").findAll("p")[0])
        print(bs_obj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing something! No worries though!")
    
    for link in bs_obj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                new_page = link.attrs['href']
                print("----------------\n"+new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")

Main Page
<p>The <b><a href="/wiki/Indian_roller" title="Indian roller">Indian roller</a></b> is a bird of the family <a href="/wiki/Coraciidae" title="Coraciidae">Coraciidae</a>. It is 30–34 cm (12–13 in) long with a wingspan of 65–74 cm (26–29 in) and weighs 166–176 g (5.9–6.2 oz). The face and throat are pinkish, the head and back are brown, and the rump is blue. The brightly contrasting light and dark blue markings on the wings and tail are prominent in flight. The sexes appear similar. It  occurs widely from <a href="/wiki/Western_Asia" title="Western Asia">West Asia</a> to the <a href="/wiki/Indian_subcontinent" title="Indian subcontinent">Indian subcontinent</a>. Often found perched on roadside trees and wires, it is common in open grassland and scrub forest habitats, and has adapted well to human-modified landscapes. It mainly feeds on insects, especially <a href="/wiki/Beetle" title="Beetle">beetles</a>. The species is best known for the aerobatic displays of males during the 

KeyboardInterrupt: 

In [55]:
pages = set()
random.seed(datetime.datetime.now())

# Retrieves a list of all Internal links found on a page
def get_internal_links(bs_obj, include_url):
    internal_links = []
    
    #Finds all links that begin with a "/"
    for link in bs_obj.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links

# Retrieves a list of all external links found on a page
def get_external_links(bs_obj, exclude_url):
    external_links = []

    # Finds all links that start with "http" or "www" that do
    # not contain the current URL
    for link in bs_obj.findAll("a", href=re.compile("^(http|www)((?!" + exclude_url + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links

def split_address(address):
    address_parts = address.replace("http://", "").split("/")
    return address_parts

def get_random_external_link(starting_page):
    html = urlopen(starting_page, context=ssl._create_unverified_context())
    bs_obj = BeautifulSoup(html)
    external_links = get_external_links(bs_obj, split_address(starting_page)[0])
    if len(external_links) == 0:
        internal_links = get_internal_links(starting_page)
        return get_next_external_link(internal_links[random.randint(0, len(internal_links)-1)])

    else:
        return external_links[random.randint(0, len(external_links)-1)]

def follow_external_only(starting_site):
    external_link = get_random_external_link("http://oreilly.com")
    print("Random external link is: " + external_link)
    follow_external_only(external_link)
    
follow_external_only("http://oreilly.com")

Random external link is: https://channelstore.roku.com/details/c8a2d0096693eb9455f6ac165003ee06/oreilly
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.linkedin.com/company/oreilly-media
Random external link is: https://play.google.com/store/apps/details?id=com.safariflow.queue


KeyboardInterrupt: 

In [56]:
# Collects a list of all external URLs found on the site
all_external_links = set()
all_internal_links = set()

def get_all_external_links(site_url):
    try:
        html = urlopen(site_url, context=ssl._create_unverified_context())
        bs_obj = BeautifulSoup(html)

        internal_links = get_internal_links(bs_obj, split_address(site_url)[0])
        external_links = get_external_links(bs_obj, split_address(site_url)[0])
    
        for link in external_links:
            if link not in all_external_links:
                all_external_links.add(link)
                print(link)
            
        for link in internal_links:
            if link not in all_internal_links:
                print("About to get link: " + link)
                all_internal_links.add(link)
                get_all_external_links(link)
        
    except ValueError:
        print("Wrong address")
        

get_all_external_links("http://oreilly.com")

https://twitter.com/oreillymedia
https://www.facebook.com/OReilly/
https://www.linkedin.com/company/oreilly-media
https://www.youtube.com/user/OreillyMedia
https://itunes.apple.com/us/app/safari-to-go/id881697395
https://play.google.com/store/apps/details?id=com.safariflow.queue
https://channelstore.roku.com/details/c8a2d0096693eb9455f6ac165003ee06/oreilly
https://www.amazon.com/OReilly-Media-Inc/dp/B087YYHL5C/ref=sr_1_2?dchild=1&keywords=oreilly&qid=1604964116&s=mobile-apps&sr=1-2
About to get link: https://www.oreilly.com
https://www.oreilly.com
https://learning.oreilly.com/accounts/login-check/
https://www.oreilly.com/online-learning/try-now.html
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/business.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/online-learning/features.html
https://www.orei

KeyboardInterrupt: 