In [8]:
# Import re for RegEx build which is used to extract the availability of a book
import re

# Import urljoin for resolving the book's relative image URL against the book's page URL
from urllib.parse import urljoin

# Import pandas for a dataframe which will be used to store the book information prior to writing to a csv
import pandas as pd

# Import the requests package for sending HTTP requests
import requests

# Import the beautifulsoup package for parsing the website's HTML
from bs4 import BeautifulSoup

In [9]:
# Scrape the data for a single book: takes the book's page URL and returns the book's extracted data elements
def scrape_book_data(book_page_url, timeout=30):
    """Scrape the details of a single book from its product page.

    Args:
        book_page_url: Absolute URL of the book's product page.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A list of ten values in this order: product page URL, UPC, book title,
        price including tax, price excluding tax, quantity available,
        product description, category, review rating, image URL.
        Scraped values are returned as text; prices and the quantity are not
        converted to numbers. The description is "" when the page has none,
        and the quantity is "0" when the availability text contains no number.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout` seconds.
    """
    # Get the book's page HTML; the timeout keeps an unresponsive server from hanging the notebook
    page = requests.get(book_page_url, timeout=timeout)

    # Stop on an HTTP error status (e.g. 404) instead of trying to parse an error page
    page.raise_for_status()

    # Create soup object for parsing
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the Product Description by finding the product description container.
    # Some books do not contain a description on the page, so both the container
    # and the paragraph that follows it are checked before reading the text
    description_container = soup.find("div", id="product_description")
    description_paragraph = None
    if description_container is not None:
        # The description text lives in the next 'p' tag after the description container
        description_paragraph = description_container.find_next("p")

    if description_paragraph is not None:
        product_description = description_paragraph.text.strip()
    else:
        # Return nothing if the description is not found on the page
        product_description = ""

    # Get book category by finding the breadcrumb
    breadcrumb_container = soup.find("ul", class_="breadcrumb")

    # Extract all of the 'a' tags from the breadcrumb; the category is the 3rd entry in the list
    breadcrumb_links = breadcrumb_container.find_all("a")
    category = breadcrumb_links[2].text.strip()

    # Get the book title from the active list item in the breadcrumb,
    # stripped like every other scraped text field
    book_title = breadcrumb_container.find("li", class_="active").text.strip()

    # Find the container that holds the book's rating
    rating_container = soup.find("div", class_="col-sm-6 product_main")

    # The star-rating element's class attribute is a list where the first item
    # is the class name and the second item is the actual rating
    review_rating = rating_container.find("p", class_="star-rating").get("class")[1]

    # Get the Product Information table, which holds multiple data points for the book
    product_information_container = soup.find("table", class_="table table-striped")

    # Map each row's header (TH) text to its value (TD) text
    product_information = {
        table_row.find("th").text.strip(): table_row.find("td").text.strip()
        for table_row in product_information_container.find_all("tr")
    }

    # Gather the required data points from the table
    universal_product_code = product_information["UPC"]
    price_including_tax = product_information["Price (incl. tax)"]
    price_excluding_tax = product_information["Price (excl. tax)"]

    # The availability text looks like 'In stock (XX available)'. Extract the number via RegEx,
    # and fall back to "0" when the text carries no number so the scrape does not crash
    quantity_match = re.search(r"\d+", product_information["Availability"])
    quantity_available = quantity_match.group() if quantity_match else "0"

    # Get the Image URL by finding the thumbnail container
    thumbnail_container = soup.find("div", class_="thumbnail")

    # The image URL on the page is relative, so resolve it against the book's page URL to make it absolute
    image_url = urljoin(book_page_url, thumbnail_container.find("img")["src"])

    # Return the extracted values in the same order as the output columns
    book_extracted_information = [
        book_page_url,
        universal_product_code,
        book_title,
        price_including_tax,
        price_excluding_tax,
        quantity_available,
        product_description,
        category,
        review_rating,
        image_url,
    ]

    return book_extracted_information

In [10]:
# Column headers, one per data element being extracted
book_data_columns = [
    "product_page_url",
    "universal_product_code",
    "book_title",
    "price_including_tax",
    "price_excluding_tax",
    "quantity_available",
    "product_description",
    "category",
    "review_rating",
    "image_url",
]

# Start from an empty dataframe that only carries the column headers
book_data = pd.DataFrame(columns=book_data_columns)

# Product page of the book to scrape
book_url = "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"

# Extract the book's data from its page
scraped_book_data = scrape_book_data(book_url)

# The UPC is the second value in the scraped list; use it to detect a book that was already stored
scraped_upc = scraped_book_data[1]
already_stored = book_data["universal_product_code"].eq(scraped_upc).any()

# Only append the book as a new row when its UPC is not in the dataframe yet
if not already_stored:
    book_data.loc[len(book_data)] = scraped_book_data

print("Completed")

Completed


In [11]:
# Save the collected book records to a CSV file, leaving out the dataframe's row index
output_csv_name = "Phase_I_Extracted_Book_Data.csv"
book_data.to_csv(output_csv_name, index=False)