In [1]:
# Import time to do some time tracking of the processing
import time

# Import urljoin to resolve the relative links found on scraped pages
from urllib.parse import urljoin

# Import pandas for a dataframe
import pandas as pd

# Import the requests package
import requests

# Import the beautifulsoup package
from bs4 import BeautifulSoup

In [2]:
#create function that creates a list of book categories by passing the url and returning a list of categories
def get_book_categories(site_url):
    page = requests.get(site_url)

    # create soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    #look for the side_categories container
    category_container = soup.find("div", class_="side_categories")

    #create a list to store all the extracted categories
    category_list = {}
    for category in category_container.find_all("a", href=True):
        key = category.text.strip()
        value = "https://books.toscrape.com/"+category["href"]
        category_list[key] = value

    #remove first item in the list since it is a header
    category_list.pop(list(category_list.keys())[0])
    
    return category_list

In [3]:
#get the books on a page
def get_books_in_category(category_url):
    books_in_category_url = []
    books_in_category_url.clear()

    base_url = category_url.split("index.html")[0]
    current_page_index = "index.html"

    #check to see if the page has pagination, if it does will loop the pages
    while True:
        current_page_url = base_url+current_page_index
        page = requests.get(current_page_url)

        # create soup object
        soup = BeautifulSoup(page.content, 'html.parser')

        #look for books in the table on the page
        books_container = soup.find_all("h3")

        #get the URL of the books on the current page
        for image_container in books_container:
            book_href = image_container.find("a", href=True)
            books_in_category_url.append("https://books.toscrape.com/catalogue/"+book_href["href"].replace("../../../",""))    

        #check to see if there is another page
        next_page = (soup.find("li", class_="next"))

        #if there is a next page, get the url of the page
        if next_page:
            current_page_index = next_page.find("a")["href"]
        else:
            break

    #print(books_in_category_url)

    return books_in_category_url

In [6]:
#create function which scrapes data for a single book, pass URL return elements
def scrape_book_data(book_page_url):
    print(book_page_url)
    # set the page variable to the extract of the url's HTML
    page = requests.get(book_page_url)
    
    # create soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the Product Description
    #find the description container by ID
    description_container = soup.find("div", id="product_description")
    #find the next P tag after the descrption container
    product_description =  description_container.find_next("p").text.strip()
    #print(product_description)

    # Get book category
    breadcrumb_container = soup.find("ul", class_="breadcrumb")
    #print(breadcrumb_container)

    #extract all the A from the container
    list_item = breadcrumb_container.find_all("a")
    #print(list_item)

    #get the category as the 3rd entry in the list
    category = list_item[2].text.strip()
    #print(category)

    # Get the book Title as the 4th entry in the list
    book_title = breadcrumb_container.find("li",class_="active").text
    #print(book_title)

    # Get the rating from the product_main div by extracting the class for star-rating
    rating_container = soup.find("div",class_="col-sm-6 product_main")
    review_rating = rating_container.find("p", class_="star-rating").get("class")[1]
    #print(review_rating)

    # Get the Product Information table data
    product_information_container = soup.find("table", class_="table table-striped")

    # since we need to extract multiple data points from the table, create a dictionary to store all the data points in the product information table
    product_information = {}

    # loop the rows in the container looking for the table row (TR) tag
    for table_row in product_information_container.find_all("tr"):
        # define the key as the extracted table header (TH)
        key = table_row.find("th").text.strip()
        # define the value as the extracted table description (TD)
        value = table_row.find("td").text.strip()

        #write the key:value pair to the dictionary
        product_information[key] = value

    #print(product_information)

    #define the variables of the required information
    universal_product_code = product_information["UPC"]
    price_including_tax = product_information["Price (incl. tax)"]
    price_excluding_tax = product_information["Price (excl. tax)"]
    quantity_available = product_information["Availability"]   #NEED TO EXTRACT JUST THE NUMBER ONLY this should be fine as is without extracting extra number. if extract number , do regex
    #print(quantity_available)

    # Get the Image URL
    thumbnail_container = soup.find("div",class_="thumbnail")
    #print(thumbnail_container)

    image_url = "https://books.toscrape.com"+thumbnail_container.find("img")["src"].replace("../..","")
    #print(image_url)

    book_extracted_information = [book_page_url,universal_product_code,book_title,price_including_tax,price_excluding_tax,quantity_available,product_description,category,review_rating,image_url]

    return book_extracted_information

In [7]:
# create a list of data elements which will be used as column headers in the final output
book_data = pd.DataFrame(columns=["product_page_url","universal_product_code","book_title","price_including_tax","price_excluding_tax","quantity_available","product_description","category","review_rating","image_url"])

# define the URL to scrape data off of and define it as the url variable
site_page_url = "https://books.toscrape.com/"

#gather list of categories by calling function
scraped_category_list = get_book_categories(site_page_url)

#gather list of books in a category by calling function and passing the category url
start_time_all = time.perf_counter()
for category, category_url in scraped_category_list.items():
    start_time = time.perf_counter()
    print("Starting category: "+category+"     URL: "+category_url+"     Start Time: "+str(start_time))
    books_in_category_list = get_books_in_category(category_url)
    print("Gathered book list in "+category+" category: "+str(len(books_in_category_list))+" books identified")

    #call the scrape_book_data which is a function that gathers all details about a single book
    for book_url in books_in_category_list:
        scraped_book_data = scrape_book_data(book_url)

        #check to see if the extracted book is already in the list of scraped book data, if it is not, add it to the list
        if not book_data["universal_product_code"].isin([scraped_book_data[1]]).any():
            book_data.loc[len(book_data)] = scraped_book_data
    end_time = time.perf_counter()
    print("Finished gathering book details for :"+category+"     End Time: "+str(end_time)+"     Total time: "+str(end_time-start_time))
end_time_all = time.perf_counter()
print("Completed!"+" Start Time: "+str(start_time_all)+" End Time: "+str(end_time_all)+" Total Time: "+str(end_time_all-start_time_all))

Starting category: Travel     URL: https://books.toscrape.com/catalogue/category/books/travel_2/index.html     Start Time: 8982.9822386
Gathered book list in Travel category: 11 books identified
https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html
https://books.toscrape.com/catalogue/full-moon-over-noahs-ark-an-odyssey-to-mount-ararat-and-beyond_811/index.html
https://books.toscrape.com/catalogue/see-america-a-celebration-of-our-national-parks-treasured-sites_732/index.html
https://books.toscrape.com/catalogue/vagabonding-an-uncommon-guide-to-the-art-of-long-term-world-travel_552/index.html
https://books.toscrape.com/catalogue/under-the-tuscan-sun_504/index.html
https://books.toscrape.com/catalogue/a-summer-in-europe_458/index.html
https://books.toscrape.com/catalogue/the-great-railway-bazaar_446/index.html
https://books.toscrape.com/catalogue/a-year-in-provence-provence-1_421/index.html
https://books.toscrape.com/catalogue/the-road-to-little-dribbling-adventures-

AttributeError: 'NoneType' object has no attribute 'find_next'

In [None]:
# write the book_data dataframe to CSV
book_data.to_csv("Phase_1_Output.csv", index=False)