In [31]:
import re
from bs4 import BeautifulSoup

In [2]:
# Sample markup for a single product listing (one <li> wrapping an
# <article class="product_pod">). It is parsed into `soup` so the locator
# examples can be tried out on a small, fixed piece of HTML.
ITEM_HTML = '''<html><head></head><body>
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
            </div>
                <p class="star-rating Three">
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                </p>
            <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
            <div class="product_price">
        <p class="price_color">£51.77</p>
<p class="instock availability">
    <i class="icon-ok"></i>
        In stock
</p>
    <form>
        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
    </form>
            </div>
    </article>
</li>
</body></html>
'''

In [3]:
# Parse the sample markup with Python's built-in 'html.parser' backend.
# The module-level find_item_* functions below read this global `soup`.
soup = BeautifulSoup(ITEM_HTML, 'html.parser')

## CSS Locator search method
### The idea: describe where the element sits under its parent tags with a CSS selector, and store that selector in a locator variable

To find the article name, note that it is stored in the `title` attribute of the 'a' tag, which is inside the 'h3' tag, which in turn is
inside the 'article' tag.

In [27]:
def find_item_name():
    """Print the full title of the item held in the global ``soup``.

    The visible link text in the sample markup is truncated
    ("A Light in the ..."), so the name is read from the ``title``
    attribute of the ``<a>`` inside ``<h3>`` inside ``article.product_pod``.
    """
    # select_one() returns only the first element matching the selector;
    # use select() instead when every match in the hierarchy is needed.
    title_anchor = soup.select_one('article.product_pod h3 a')
    print(title_anchor.attrs['title'])

In [26]:
# Prints the title attribute of the first 'article.product_pod h3 a' match in `soup`.
find_item_name()

A Light in the Attic


Find the link to this item's page using the locator method

In [29]:
def find_item_link():
    """Print the ``href`` of the item's title link from the global ``soup``.

    The same ``<a>`` element that carries the title also carries the link
    to the product page, so the selector matches the one used for the name.
    """
    product_anchor = soup.select_one('article.product_pod h3 a')
    print(product_anchor.attrs['href'])

In [30]:
# Prints the href attribute of the first 'article.product_pod h3 a' match in `soup`.
find_item_link()

catalogue/a-light-in-the-attic_1000/index.html


Access tags by their class using a locator -
E.g. finding the item price in the above HTML - it is located in the **p.price_color** tag inside the **article.product_pod** tag.

In [54]:
def find_item_price():
    """Print the item's price, first with and then without the pound sign.

    Reads the text of ``p.price_color`` inside ``article.product_pod`` from
    the global ``soup`` and extracts the numeric part with a regular
    expression.
    """
    price_tag = soup.select_one('article.product_pod p.price_color')
    price = price_tag.string

    # Raw string: in a normal string literal `\.` is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    pattern = r'£([0-9]+\.[0-9]+)'
    matcher = re.search(pattern, price)
    print(matcher.group(0))  # whole match: price with the pound sign
    # First capture group: the digits only. This is still a string; wrap it
    # in float() if a number is needed.
    print(matcher.group(1))

In [55]:
# Prints two lines: the matched price with the pound sign, then the captured number alone.
find_item_price()

£51.77
51.77


Combine everything into one class

In [57]:
class ParsedItemLocators:
    """
    Locators for an item in the HTML page.
    This allows us to easily see what our code will be looking at
    as well as change it quickly if we notice it is now different.

    Every value is a CSS selector string intended for BeautifulSoup's
    select_one(); all of them are anchored at ``article.product_pod``.
    """
    # The <a> inside <h3> carries both the `title` attribute and the `href`,
    # which is why the name and link locators are the same selector.
    NAME_LOCATOR = 'article.product_pod h3 a'
    LINK_LOCATOR = 'article.product_pod h3 a'
    # <p class="price_color"> holds the price text, e.g. "£51.77".
    PRICE_LOCATOR = 'article.product_pod p.price_color'
    # <p class="star-rating ..."> element; in the sample markup the rating
    # word ("Three") appears as an extra class on this tag.
    RATING_LOCATOR = 'article.product_pod p.star-rating'


class ParsedItem:
    """Parse the HTML of an item and print its name, link and price.

    The CSS selectors used for each lookup come from ``ParsedItemLocators``.
    Each ``find_*`` method prints its result and returns ``None``.
    """

    def __init__(self, page):
        """Parse ``page`` (HTML markup) with the built-in 'html.parser'."""
        self.soup = BeautifulSoup(page, 'html.parser')

    def find_item_name(self):
        """Print the ``title`` attribute of the first NAME_LOCATOR match."""
        locator = ParsedItemLocators.NAME_LOCATOR
        # select_one() returns only the first matching element.
        item_link = self.soup.select_one(locator)
        item_name = item_link.attrs['title']
        print(item_name)

    def find_item_link(self):
        """Print the ``href`` attribute of the first LINK_LOCATOR match."""
        locator = ParsedItemLocators.LINK_LOCATOR
        item_link = self.soup.select_one(locator)
        link_to_product = item_link.attrs['href']
        print(link_to_product)

    def find_item_price(self):
        """Print the numeric part of the price (without the pound sign)."""
        locator = ParsedItemLocators.PRICE_LOCATOR
        # Use this instance's parsed document, not a module-level `soup`.
        price_tag = self.soup.select_one(locator)
        price = price_tag.string

        # Raw string so `\.` is passed to the regex engine as written
        # instead of being treated as an (invalid) string escape.
        pattern = r'£([0-9]+\.[0-9]+)'
        matcher = re.search(pattern, price)
        # group(1) is the first capture group: the digits without the pound
        # sign. It is still a string, not a float.
        print(matcher.group(1))

In [59]:
import requests

In [None]:
class quote_locators:
    """Locators for content inside a quote.

    Each value is a CSS selector; QuoteParser applies them relative to the
    parent tag of a single quote.
    """

    # Used with select_one(): the quote text element.
    CONTENT = 'span.text'
    # Used with select_one(): the author element.
    AUTHOR = 'small.author'
    # Used with select(): every tag link inside the tags container.
    TAGS = 'div.tags a.tag'


class quote_page_locators:
    """Locators to find quote divisions inside an html page"""

    # CSS selector passed to select() by QuotesPage; each match becomes the
    # parent tag handed to one QuoteParser.
    QUOTE = 'div.quote'
    

class QuoteParser:
    """Wrap the parent tag of one quote and expose its content, author and tags."""

    def __init__(self, parent):
        # Every lookup below is made relative to this tag.
        self.parent = parent

    def __repr__(self):
        return f'<Quote {self.content}, by {self.author}>'

    @property
    def content(self):
        """``.string`` of the first ``quote_locators.CONTENT`` match."""
        content_tag = self.parent.select_one(quote_locators.CONTENT)
        return content_tag.string

    @property
    def author(self):
        """``.string`` of the first ``quote_locators.AUTHOR`` match."""
        author_tag = self.parent.select_one(quote_locators.AUTHOR)
        return author_tag.string

    @property
    def tags(self):
        """Everything ``select()`` returns for ``quote_locators.TAGS``.

        These are the matched elements themselves, not their text.
        """
        return self.parent.select(quote_locators.TAGS)


class QuotesPage:
    """Parse an HTML page of quotes and expose every quote found on it."""

    def __init__(self, page):
        # Parsed once here; `quotes` runs its selector against this tree.
        self.soup = BeautifulSoup(page, 'html.parser')

    @property
    def quotes(self):
        """A list with one QuoteParser per ``quote_page_locators.QUOTE`` match."""
        quote_tags = self.soup.select(quote_page_locators.QUOTE)
        parsed = []
        for quote_tag in quote_tags:
            parsed.append(QuoteParser(quote_tag))
        return parsed


In [84]:
class ParsedItemLocators:
    """
    Locators for an item in the HTML page.
    This allows us to easily see what our code will be looking at
    as well as change it quickly if we notice it is now different.

    These selectors are relative: ProductParser applies them with
    select_one() on the tag of a single product, so they no longer start
    with ``article.product_pod``.
    """
    # NOTE(review): this re-defines a class name already used earlier in the
    # notebook; once this cell runs, the earlier definition is replaced.
    NAME_LOCATOR = 'h3 a'
    LINK_LOCATOR = 'h3 a'
    PRICE_LOCATOR = 'p.price_color'
    RATING_LOCATOR = 'p.star-rating'
    
class products_onpage_locators:
    """Locators to find product tags inside an html page"""

    # CSS selector passed to select() by ProductsPage; each match becomes the
    # parent tag handed to one ProductParser.
    PRODUCT = 'li article.product_pod'


class ProductParser:
    """Wrap the parent tag of one product and expose its name, link and price."""

    def __init__(self, parent):
        # Every lookup below is made relative to this tag.
        self.parent = parent

    def __repr__(self):
        return f'<Title {self.name} priced at {self.price}>'

    @property
    def name(self):
        """``title`` attribute of the first ``NAME_LOCATOR`` match."""
        name_anchor = self.parent.select_one(ParsedItemLocators.NAME_LOCATOR)
        return name_anchor.attrs['title']

    @property
    def link(self):
        """``href`` attribute of the first ``LINK_LOCATOR`` match."""
        link_anchor = self.parent.select_one(ParsedItemLocators.LINK_LOCATOR)
        return link_anchor.attrs['href']

    @property
    def price(self):
        """``.string`` of the first ``PRICE_LOCATOR`` match, returned unparsed."""
        price_tag = self.parent.select_one(ParsedItemLocators.PRICE_LOCATOR)
        return price_tag.string


class ProductsPage:
    """Parse an HTML page of products and expose every product found on it."""

    def __init__(self, page):
        # Parsed once here; `products` runs its selector against this tree.
        self.soup = BeautifulSoup(page, 'html.parser')

    @property
    def products(self):
        """A list with one ProductParser per ``products_onpage_locators.PRODUCT`` match."""
        product_tags = self.soup.select(products_onpage_locators.PRODUCT)
        parsed = []
        for product_tag in product_tags:
            parsed.append(ProductParser(product_tag))
        return parsed
    

In [85]:
# Fetch the live catalogue page. The timeout stops the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP
# errors here instead of silently parsing an error page.
response = requests.get('http://books.toscrape.com/', timeout=10)
response.raise_for_status()
# .content is the raw response body in bytes.
page_content = response.content
page = ProductsPage(page_content)

In [88]:
# print() falls back to ProductParser.__repr__, so each line shows a title and its price.
# NOTE(review): the loop variable holds a single product despite its plural name.
for products in page.products:
    print(products)

<Title A Light in the Attic priced at £51.77>
<Title Tipping the Velvet priced at £53.74>
<Title Soumission priced at £50.10>
<Title Sharp Objects priced at £47.82>
<Title Sapiens: A Brief History of Humankind priced at £54.23>
<Title The Requiem Red priced at £22.65>
<Title The Dirty Little Secrets of Getting Your Dream Job priced at £33.34>
<Title The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull priced at £17.93>
<Title The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics priced at £22.60>
<Title The Black Maria priced at £52.15>
<Title Starving Hearts (Triangular Trade Trilogy, #1) priced at £13.99>
<Title Shakespeare's Sonnets priced at £20.66>
<Title Set Me Free priced at £17.46>
<Title Scott Pilgrim's Precious Little Life (Scott Pilgrim #1) priced at £52.29>
<Title Rip it Up and Start Again priced at £35.02>
<Title Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991 pri