In [2]:
import requests
import bs4

### requests - send HTTP/HTTPS requests. It’s one of the most popular third-party libraries for working with web APIs.



In [9]:
result = requests.get("http://www.example.com")
type(result)

requests.models.Response

In [10]:
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

### bs4(BeautifulSoup4) — used for parsing HTML and XML documents

In [14]:
soup = bs4.BeautifulSoup(result.text,"lxml")
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [16]:
soup.select('title')

[<title>Example Domain</title>]

In [18]:
soup.select('h1')

[<h1>Example Domain</h1>]

In [23]:
soup.select('title')[0].getText()

'Example Domain'

In [33]:
sp = soup.select('p')
sp[0].getText()

'\n'

### Grabbing class

In [34]:
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper')
soup = bs4.BeautifulSoup(res.text,"lxml")
soup.select(".vector-toc-text")

[<div class="vector-toc-text">(Top)</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1</span>
 <span>Early life and education</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2</span>
 <span>Career</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.1</span>
 <span>World War II</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.2</span>
 <span>UNIVAC</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.3</span>
 <span>COBOL</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.4</span>
 <span>Standards</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">3</span>
 <span>Retirement</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">4</span>
 <span>Post-retirement</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">5</span>
 <span>Anecdotes</span>
 </div>

In [36]:
for item in soup.select(".vector-toc-text"):
    print(item.text)

(Top)

1
Early life and education


2
Career


2.1
World War II


2.2
UNIVAC


2.3
COBOL


2.4
Standards


3
Retirement


4
Post-retirement


5
Anecdotes


6
Death


7
Dates of rank


8
Awards and honors


8.1
Military awards


8.2
Other awards


9
Legacy


9.1
Places


9.2
Programs


9.3
In popular culture


9.3.1
Grace Hopper Celebration of Women in Computing


10
See also


11
Notes


12
References


13
Obituary notices


14
Further reading


15
External links



### Grabbing Image

In [38]:
res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)")
soup = bs4.BeautifulSoup(res.text,'lxml')
image_info = soup.select('.mw-file-element')
image_info

[<img alt="This is a good article. Click here for more information." class="mw-file-element" data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/20px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/40px-Symbol_support_vote.svg.png 1.5x" width="19"/>,
 <img class="mw-file-element" data-file-height="601" data-file-width="400" decoding="async" height="376" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/375px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="250"/>,
 <img class="mw-file-element" data-file-height="64" data-file-width="64" decoding="async" height="150" src="//upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/150px-Chess_Programming.svg.png" 

In [40]:
print(len(image_info))
computer = image_info[1]
type(computer)

11


bs4.element.Tag

In [42]:
computer['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg'

In [74]:
image_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg')
#image_link.content

### Example Project - Working with Multiple Pages and Items

In [47]:
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

In [50]:
pg_num = 12
base_url.format(pg_num)

'http://books.toscrape.com/catalogue/page-12.html'

In [52]:
res = requests.get(base_url.format('1'))
soup = bs4.BeautifulSoup(res.text,"lxml")
soup.select(".product_pod")

[<article class="product_pod">
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="thumbnail" src="../media/cach

In [54]:
products = soup.select(".product_pod")
example = products[0]
type(example)

bs4.element.Tag

In [56]:
example.attrs

{'class': ['product_pod']}

In [58]:
list(example.children)

['\n',
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>,
 '\n',
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>,
 '\n',
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>,
 '\n',
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>,
 '\n']

In [60]:
example.select('.star-rating.Three')

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [64]:
example.select('a')

[<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>,
 <a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>]

In [66]:
example.select('a')[1]

<a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [68]:
example.select('a')[1]['title']

'A Light in the Attic'

In [72]:
two_star_titles = []
for n in range(1,51):

    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    
    soup = bs4.BeautifulSoup(res.text,"lxml")
    books = soup.select(".product_pod")
    
    for book in books:
        if len(book.select('.star-rating.Two')) != 0:
            two_star_titles.append(book.select('a')[1]['title'])

In [73]:
two_star_titles

['Starving Hearts (Triangular Trade Trilogy, #1)',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'How Music Works',
 'Maude (1883-1993):She Grew Up with the country',
 "You can't bury them all: Poems",
 'Reasons to Stay Alive',
 'Without Borders (Wanderlove #1)',
 'Soul Reader',
 'Security',
 'Saga, Volume 5 (Saga (Collected Editions) #5)',
 'Reskilling America: Learning to Labor in the Twenty-First Century',
 'Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics',
 'Obsidian (Lux #1)',
 'My Paris Kitchen: Recipes and Stories',
 'Masks and Shadows',
 'Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)',
 'Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)',
 'Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)',
 'I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)',
 'Giant Days, Vol. 2 (Giant Day