In [1]:
# Import re for RegEx build which is used to extract the availability of a book
import re

# Import urljoin for resolving the relative links found on the site into absolute URLs
from urllib.parse import urljoin

# Import pandas for a dataframe which will be used to store the book information prior to writing to a csv
import pandas as pd

# Import the requests package for sending HTTP requests
import requests

# Import the beautifulsoup package for parsing the website's HTML
from bs4 import BeautifulSoup

In [2]:
# Build a dictionary of the site's book categories from the site's URL, mapping each category name to the URL of that category
def get_book_categories(site_url):
    """Return the book categories listed in the sidebar of the site's page.

    Parameters
    ----------
    site_url : str
        URL of the page that shows the category sidebar
        (e.g. "https://books.toscrape.com/").

    Returns
    -------
    dict
        Maps each category name to the absolute URL of that category's page,
        in the order the links appear in the sidebar. The first sidebar link
        is the list header rather than a category, so it is left out.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no "side_categories" container.
    """
    # A timeout keeps a stalled connection from hanging the notebook indefinitely
    page = requests.get(site_url, timeout=30)

    # Stop here on an error response instead of trying to parse an error page
    page.raise_for_status()

    # Create soup object for parsing
    soup = BeautifulSoup(page.content, 'html.parser')

    # Look for the side_categories container which holds the links to every category
    category_container = soup.find("div", class_="side_categories")
    if category_container is None:
        raise ValueError(f"No 'side_categories' container found at {site_url}")

    # Every 'a' tag in the container; the first one is the header and is skipped below
    category_links = category_container.find_all("a", href=True)

    # urljoin resolves each relative href against the page URL, so the result is
    # correct whether or not site_url ends with a slash or names a page
    return {
        link.text.strip(): urljoin(site_url, link["href"])
        for link in category_links[1:]
    }

In [3]:
# Collect the URLs of all the books in a category, given the category's URL, following the pagination across every page of the category
def get_books_in_category(category_url):
    """Return the URLs of every book listed in a category, across all of its pages.

    Parameters
    ----------
    category_url : str
        Absolute URL of the category's first page.

    Returns
    -------
    list of str
        Absolute URL of each book's page, in listing order.

    Raises
    ------
    requests.HTTPError
        If any page of the category answers with a 4xx/5xx status.
    """
    # Create a list to store the extracted book URLs
    books_in_category_url = []

    # Pages already fetched; guards against a 'next' link that points back to
    # a page that was already processed, which would otherwise loop forever
    visited_pages = set()

    # Start on the category page itself and follow the 'next' links from there
    current_page_url = category_url

    while current_page_url is not None and current_page_url not in visited_pages:
        visited_pages.add(current_page_url)

        # A timeout keeps a stalled connection from hanging the notebook indefinitely
        page = requests.get(current_page_url, timeout=30)
        page.raise_for_status()

        # Create soup object for parsing
        soup = BeautifulSoup(page.content, 'html.parser')

        # Each book on the page is titled with an 'h3' that wraps the link to the book
        for heading in soup.find_all("h3"):
            book_link = heading.find("a", href=True)

            # Skip any heading that does not wrap a link
            if book_link is None:
                continue

            # The href is relative to the current page, so resolve it against the
            # page URL rather than substituting a hard-coded site prefix
            books_in_category_url.append(urljoin(current_page_url, book_link["href"]))

        # Check to see if there is another page for the category by looking for the 'next' list item
        next_page = soup.find("li", class_="next")
        next_link = next_page.find("a", href=True) if next_page is not None else None

        # Move on to the next page, or end the loop when there is none
        if next_link is not None:
            current_page_url = urljoin(current_page_url, next_link["href"])
        else:
            current_page_url = None

    return books_in_category_url

In [4]:
# Scrape the data for a single book from the URL of the book's page and return the extracted elements as a list
def scrape_book_data(book_page_url):
    """Scrape the details of a single book from its product page.

    Parameters
    ----------
    book_page_url : str
        Absolute URL of the book's product page.

    Returns
    -------
    list
        The extracted values, in this order: product page URL, UPC, book title,
        price including tax, price excluding tax, quantity available, product
        description, category, review rating, image URL. The quantity is the
        first number found in the "Availability" text, or "0" when that text
        contains no number. The description is "" when the page has none.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the product information table lacks the "UPC" or price rows.
    """
    # A timeout keeps a stalled connection from hanging the notebook indefinitely
    page = requests.get(book_page_url, timeout=30)
    page.raise_for_status()

    # Create soup object for parsing
    soup = BeautifulSoup(page.content, 'html.parser')

    # Some books have no description, so default to an empty string and only
    # overwrite it when both the heading container and the paragraph after it exist
    product_description = ""
    description_container = soup.find("div", id="product_description")
    if description_container is not None:
        description_paragraph = description_container.find_next("p")
        if description_paragraph is not None:
            product_description = description_paragraph.text.strip()

    # The breadcrumb holds both the category (3rd link) and the book title (active item)
    breadcrumb_container = soup.find("ul", class_="breadcrumb")
    category = breadcrumb_container.find_all("a")[2].text.strip()
    book_title = breadcrumb_container.find("li", class_="active").text.strip()

    # The rating is encoded in the class list of the star-rating paragraph:
    # the first class is "star-rating" and the second is the actual rating
    rating_container = soup.find("div", class_="col-sm-6 product_main")
    review_rating = rating_container.find("p", class_="star-rating").get("class")[1]

    # The Product Information table has one row per data point: header (TH) -> value (TD)
    product_information_container = soup.find("table", class_="table table-striped")
    product_information = {
        table_row.find("th").text.strip(): table_row.find("td").text.strip()
        for table_row in product_information_container.find_all("tr")
    }

    # Define the variables and gather the required information
    universal_product_code = product_information["UPC"]
    price_including_tax = product_information["Price (incl. tax)"]
    price_excluding_tax = product_information["Price (excl. tax)"]

    # The availability reads like 'In stock (XX available)'. Extract the number via RegEx,
    # falling back to "0" when the text holds no number instead of failing on the lookup
    quantity_match = re.search(r"\d+", product_information.get("Availability", ""))
    quantity_available = quantity_match.group() if quantity_match else "0"

    # The image src is relative to the book's page, so resolve it against the page URL
    thumbnail_container = soup.find("div", class_="thumbnail")
    image_url = urljoin(book_page_url, thumbnail_container.find("img")["src"])

    # Return the extracted values in the column order used for the output
    return [
        book_page_url,
        universal_product_code,
        book_title,
        price_including_tax,
        price_excluding_tax,
        quantity_available,
        product_description,
        category,
        review_rating,
        image_url,
    ]

In [5]:
# Column headers for the data elements being extracted, in the order scrape_book_data returns them
book_data_columns = ["product_page_url","universal_product_code","book_title","price_including_tax","price_excluding_tax","quantity_available","product_description","category","review_rating","image_url"]

# Define the URL to scrape data off of and define it as the url variable
site_page_url = "https://books.toscrape.com/"

# Gather list of categories on the site by calling the function
scraped_category_list = get_book_categories(site_page_url)

# Collect the scraped rows in a plain list and build the dataframe once at the end;
# adding rows to a dataframe one at a time gets slower with every book added
scraped_rows = []

# Track what has already been handled so a book is neither downloaded nor stored twice
seen_book_urls = set()
seen_product_codes = set()

# Gather list of books in a category by calling function and passing the category url
# Start by looping through each category gathered (only the URL is needed, not the name)
for category_url in scraped_category_list.values():
    # For each category call the function that creates the list of books in the category
    books_in_category_list = get_books_in_category(category_url)

    # Loop the list of books in the category
    for book_url in books_in_category_list:
        # Skip a book page that was already scraped to avoid a redundant request
        if book_url in seen_book_urls:
            continue
        seen_book_urls.add(book_url)

        # For each book, call the function which extracts the data for the book
        scraped_book_data = scrape_book_data(book_url)

        # The UPC (second element) identifies the book; only keep the first occurrence of each
        universal_product_code = scraped_book_data[1]
        if universal_product_code not in seen_product_codes:
            seen_product_codes.add(universal_product_code)
            scraped_rows.append(scraped_book_data)

# Create the pandas dataframe from all collected rows in a single step
book_data = pd.DataFrame(scraped_rows, columns=book_data_columns)

print("Completed")

Completed


In [6]:
# Save the scraped book records to a CSV file, leaving out the dataframe's row index
book_data.to_csv(path_or_buf="Phase_III_Extracted_Book_Data.csv", index=False)