In [1]:
from urllib.parse import urljoin
from urllib.request import urlopen

In [2]:
# Profile page fetched in the first example.
url = "http://olympus.realpython.org/profiles/aphrodite"

In [3]:
# To open the web page, pass url to urlopen(). The returned response object is
# kept in `page` so that the following cells can inspect it and read its body.
page = urlopen(url)

In [4]:
# Inspect what urlopen() returned; the output shows an http.client.HTTPResponse.
page

<http.client.HTTPResponse at 0x23447c975b0>

In [5]:
# To extract the HTML from the page, first use the HTTPResponse object's .read() method,
# which returns a sequence of bytes. Then use .decode() to decode the bytes to a string using UTF-8.
# NOTE(review): the response body is presumably consumed by .read(); re-run the
# urlopen() cell before re-running this one - confirm.
html_bytes = page.read()
html = html_bytes.decode("utf-8")

In [6]:
# Now you can print the HTML to see the contents of the web page.
# print() renders the line breaks; a bare `html` expression would show the escaped repr instead.
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



## Extract Text From HTML With String Methods

In [7]:
# Let's extract the title of the web page you requested in the previous example.
# If you know the index of the first character of the title and the first character of the closing </title> tag,
# then you can use a string slice to extract the title.
#
# html.find("<title>") returns the index at which the opening tag starts (14 for this page).
# Caveat: .find() returns -1 when the exact substring "<title>" doesn't exist. Adding -1 to
# len("<title>"), which is 7, would then assign start_index the value 6, so the slice would
# begin at the wrong place. The Poseidon profile fetched further down shows exactly this failure.

title_index = html.find("<title>")
title_index

14

In [8]:
# The title text starts right after the opening tag, so skip past its 7 characters.
start_index = title_index + len("<title>")
start_index

21

In [9]:
# The title text ends where the closing </title> tag begins.
end_index = html.find("</title>")
end_index

39

In [10]:
# Slice out everything between the opening and the closing tag.
title = html[start_index:end_index]
title

'Profile: Aphrodite'

In [11]:
# Try the same approach on a second profile, whose markup is slightly messier
# (the next cell's output shows its opening tag written as "<title >").
url = "http://olympus.realpython.org/profiles/poseidon"

In [12]:
# Repeat the .find()-based extraction on the Poseidon profile.
# The `with` block closes the HTTP connection as soon as the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# This page writes its opening tag as "<title >" (note the space), so
# html.find("<title>") returns -1 and start_index becomes -1 + 7 = 6.
# The slice below therefore starts at the wrong place: the result is
# deliberately wrong and shows why exact-substring matching is fragile.
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
title

'\n<head>\n<title >Profile: Poseidon'

## A Primer on Regular Expressions

In [13]:
import re

In [14]:
# In the following example, you use findall() to find any text within a string that matches a given regular expression.
# "ab*c" means: an "a", then zero or more "b" characters, then a "c" - so "ac" itself matches.
re.findall("ab*c", "ac")

['ac']

In [15]:
# The match may sit inside a longer string: only the "abc" part of "abcd" is returned.
re.findall("ab*c","abcd")

['abc']

In [16]:
# Only the first "c" belongs to the match; the trailing "c" has no "a" in front of it.
re.findall("ab*c", "acc")

['ac']

In [17]:
# findall() returns every match it finds, in order of appearance.
re.findall("ab*c", "abcac")

['abc', 'ac']

In [18]:
# No match: "b*" cannot absorb the "d" that sits between "a" and "c".
re.findall("ab*c", "abdc")

[]

In [19]:
# No match: pattern matching is case-sensitive by default.
re.findall("ab*c", "ABC")

[]

In [20]:
# Passing re.IGNORECASE as the third argument makes the match case-insensitive.
re.findall("ab*c", "ABC", re.IGNORECASE)

['ABC']

In [21]:
# You can use a period (.) to stand for any single character in a regular expression.
# For instance, you could find all the strings that contain the letters "a" and "c"
# separated by a single character as follows:

re.findall("a.c","abc")

['abc']

In [22]:
# No match: "." stands for exactly one character, but two "b"s separate "a" and "c".
re.findall("a.c","abbc")

[]

In [23]:
# ".*" means "any character, repeated any number of times", so the two "b"s are now accepted.
re.findall("a.*c","abbc")

['abbc']

In [24]:
# ".*" is greedy: it stretches to the last "c", so the whole string "acc" is returned rather than "ac".
re.findall("a.*c","acc")

['acc']

In [25]:
# re.search() looks for the first match and returns a match object
# (None when nothing matches); group 0 of that object is the matched text.
match_result = re.search(pattern="ab*c", string="ABC", flags=re.IGNORECASE)
match_result.group(0)

'ABC'

In [26]:
# One more function in the re module is useful for parsing out text: re.sub(),
# short for "substitute", replaces the parts of a string that match a regular
# expression with new text. It behaves somewhat like the .replace() string method.
#
# "<.*>" is greedy: it matches from the first "<" all the way to the last ">",
# so the whole tail of the sentence is swallowed by a single replacement.
string = re.sub(
    pattern="<.*>",
    repl="ELEPHANTS",
    string="Everything is <replaced> if it is in <tag>",
)
string

'Everything is ELEPHANTS'

In [27]:
# Same substitution with the non-greedy "<.*?>": each match now stops at the
# first ">" it meets, so the two tags are replaced separately.
string = re.sub(
    pattern="<.*?>",
    repl="ELEPHANTS",
    string="Everything is <replaced> if it is in <tag>",
)
string

'Everything is ELEPHANTS if it is in ELEPHANTS'

## Extract Text From HTML With Regular Expressions

In [28]:
# Extract the page title with a regular expression instead of an exact substring search.
# Pattern walkthrough:
#   <title.*?>    the opening tag; .*? lazily matches any text after "<title" up to the first ">"
#   .*?           the title text, kept as short as possible
#   </title.*?>   the closing tag, tolerating stray characters before its ">"
# re.IGNORECASE accepts <TITLE>/<Title> as well; re.DOTALL lets "." cross line
# breaks, so a title that spans several lines is still found.

url = "http://olympus.realpython.org/profiles/dionysus"
# The `with` block closes the HTTP connection once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
if match_results is None:
    # re.search() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on .group().
    raise ValueError(f"no <title> element found in {url}")
title = match_results.group()
title = re.sub("<.*?>", "", title, flags=re.DOTALL)  # remove HTML tags
print(title)

Profile: Dionysus


In [29]:
# Exercise
# Write a program that grabs the full HTML from the following URL:
#     http://olympus.realpython.org/profiles/dionysus
# Then use .find() to display the text following "Name:" and "Favorite Color:"
# (not including any leading spaces or trailing HTML tags that might appear on the same line).

url = "http://olympus.realpython.org/profiles/dionysus"
# The `with` block closes the HTTP connection once the body has been read.
with urlopen(url) as html_page:
    html_text = html_page.read().decode("utf-8")

for string in ["Name: ", "Favorite Color:"]:
    string_start_idx = html_text.find(string)
    if string_start_idx == -1:
        # .find() signals "not found" with -1; slicing from that index would
        # silently print unrelated text, so report the miss and move on.
        print(f"{string!r} not found")
        continue
    text_start_idx = string_start_idx + len(string)

    # Search from text_start_idx directly rather than slicing a copy of the page.
    text_end_idx = html_text.find("<", text_start_idx)
    if text_end_idx == -1:
        # No tag follows the label: the value runs to the end of the document.
        text_end_idx = len(html_text)

    raw_text = html_text[text_start_idx:text_end_idx]
    clean_text = raw_text.strip(" \r\n\t")
    print(clean_text)

Dionysus
Wine


## Use an HTML Parser for Web Scraping in Python

#### Create a BeautifulSoup Object

In [30]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the Dionysus profile and parse it with Beautiful Soup.
url = "http://olympus.realpython.org/profiles/dionysus"
# The `with` block closes the HTTP connection once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# The second argument names the parser Beautiful Soup should use.
soup = BeautifulSoup(html, "html.parser")

In [31]:
# You can use the soup variable in the interactive window to parse the content of html in various ways.
# For example, BeautifulSoup objects have a .get_text() method
# that can be used to extract all the text from the document and automatically remove any HTML tags.

print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [32]:
# Suppose you want to retrieve the URLs for all the images on the page.
# These links are contained in the src attribute of <img> HTML tags.
# In this case, you can use find_all() to return a list of all instances of that particular tag:

soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [33]:
# Let's explore this a little by first unpacking the Tag objects from the list.
# Unpacking into two names works because this page has exactly two <img> tags
# (see the previous output).

image1, image2 = soup.find_all("img")

In [34]:
# Each Tag object exposes its tag name through .name.
image1.name

'img'

In [35]:
# Tag attributes are read with subscript notation, like dictionary keys.
image2["src"]

'/static/grapes.png'

In [36]:
# The <title> tag can be reached directly as an attribute of the soup.
soup.title

<title>Profile: Dionysus</title>

In [37]:
# .string gives the text inside the tag, without the surrounding markup.
soup.title.string

'Profile: Dionysus'

In [38]:
# find_all() can also filter on an attribute value: only <img> tags with this exact src are returned.
soup.find_all("img", src="/static/dionysus.jpg")

[<img src="/static/dionysus.jpg"/>]

In [39]:
# Exercise
# Using Beautiful Soup, print out a list of all the links on the page by looking for HTML tags
# with the name a and retrieving the value taken on by the href attribute of each tag.

base_url = "http://olympus.realpython.org"

# The `with` block closes the HTTP connection once the body has been read.
with urlopen(base_url + "/profiles") as html_page:
    html_text = html_page.read().decode("utf-8")

In [40]:
# With the HTML source downloaded and decoded, you can create a new BeautifulSoup object to parse the HTML.
# Note: this rebinds `soup`, which until now held the parsed Dionysus profile.

soup = BeautifulSoup(html_text, "html.parser")

In [41]:
# Each link's target is read through the "href" subscript. urljoin() resolves it
# against base_url, so site-relative paths ("/profiles/...") become full URLs
# while hrefs that are already absolute are left intact.
# href=True skips <a> tags that have no href attribute, which would otherwise
# raise KeyError on link["href"].

for link in soup.find_all("a", href=True):
    link_url = urljoin(base_url, link["href"])
    print(link_url)

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus
