# Acquire the Data

For this project, I will be web scraping thousands of books for their genres and cover blurbs. The overall goal of the project is to use NLP to predict a book's genre based on the brief summary usually available on the back of or inside the cover.

In [109]:
import numpy as np
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import re
import time

I will be getting the book information from Barnes and Noble's website. Based on the layout of the website, I believe I will need to first acquire the url's for each genre. From there, I will have to acquire the urls for each book inside that genre. Finally, I'll be able to access the book's individual url and scrape the book's cover blurb.

In [2]:
#To test whether or not I can scrape B&N, I will do a quick check here
url = 'https://www.barnesandnoble.com/b/books/fiction/horror/_/N-29Z8q8Z1d51'

In [3]:
response = get(url, headers = {'user-agent': 'Codeup DS Germain'})

In [34]:
response

200

Test Successful! Now begin getting the urls for all sub-genres of horror.

In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
#Find the list of sub-genres
sub_genres = soup.find('ul', id = 'sidebar-section-0')

In [10]:
#Now find all the anchor tags that contain the urls for each sub-genre
links = sub_genres.find_all('a')

In [14]:
#Now extract all of the href urls
links[0].attrs['href']

'/b/books/horror/ghost-stories/_/N-29Z8q8Z1d5e;jsessionid=2FF65370C9537EDACE18DAEDF6E7AFB7.prodny_store02-atgap13'

In [26]:
#Create empty list to store links
horror_links = []

In [27]:
#Loop through each item in the links list
for link in links:
    #Build the complete link
    complete_link = 'barnesandnoble.com' + link.attrs['href']
    
    #Everything after the semicolon in the url is not needed. Remove it
    complete_link = re.sub(r';.*', '', complete_link)
    
    #Append the new link to the list
    horror_links.append(complete_link)

In [28]:
horror_links

['barnesandnoble.com/b/books/horror/ghost-stories/_/N-29Z8q8Z1d5e',
 'barnesandnoble.com/b/books/horror/ghost-stories-other/_/N-29Z8q8Z1d52',
 'barnesandnoble.com/b/books/horror/gothic-horror/_/N-29Z8q8Z1d5b',
 'barnesandnoble.com/b/books/horror/gothic-horror-other/_/N-29Z8q8Z1d56',
 'barnesandnoble.com/b/books/horror/horror-inner-demons/_/N-29Z8q8Z1d5g',
 'barnesandnoble.com/b/books/horror/horror-lost-abandoned-and-alone/_/N-29Z8q8Z1d5h',
 'barnesandnoble.com/b/books/horror/horror-other/_/N-29Z8q8Z1d53',
 'barnesandnoble.com/b/books/horror/horror-sacred-and-profane/_/N-29Z8q8Z1d5i',
 'barnesandnoble.com/b/books/horror/horror-anthologies/_/N-29Z8q8Z1d5d',
 'barnesandnoble.com/b/books/horror/horror-anthologies-other/_/N-29Z8q8Z1d54',
 'barnesandnoble.com/b/books/horror/horror-classics/_/N-29Z8q8Z1d5f',
 'barnesandnoble.com/b/books/horror/horror-comics/_/N-29Z8q8Z1d5j',
 'barnesandnoble.com/b/books/horror/horror-short-stories/_/N-29Z8q8Z1d5c',
 'barnesandnoble.com/b/books/horror/psycholo

In [118]:
#Now build a function to do this for each over arching genre
#I'd like to get books in Horror, Romance, Mystery and Crime, and Sci-Fi and Fantasy 
def get_sub_genre_urls(genre_list):
    """
    This function takes in a list of dictionaries containing urls for the Horror, Romance, Sci-fi and Fantasy genres
    at barnesandnoble.com. Each dictionary also has the name of the genre for the link. 
    It then loops through each of them, web scrapes the urls for each
    sub genre, and finally returns a list of those urls.
    """
    #Create empty list to contain genre name and sub-genre url dicts
    sub_genres = []
    
    #Loop through each url in the genre_list
    for genre in genre_list:
        #Get the web content of the main genre
        response = get(genre['url'], headers = {'user-agent': 'Codeup DS Germain'})
        
        #Check status code, print message
        if response.status_code == 200:
            print(f'Response status code: {response.status_code}. The {genre["genre"]} genre is good to go.')
        else:
            print(f'Response status code: {response.status_code}. Something went wrong with the {genre["genre"]} genre!')
            
        #Parse the response
        soup = BeautifulSoup(response.content, 'html.parser')
        
        #Find the list of sub-genres
        sub_genre_list = soup.find('ul', id = 'sidebar-section-0')
        
        #Now find all the anchor tags that contain the urls for each sub-genre
        links = sub_genre_list.find_all('a')
        
        #Loop through each item in the links list
        for link in links:
            #Build the complete link
            complete_link = 'https://www.barnesandnoble.com' + link.attrs['href']

            #Everything after the semicolon in the url is not needed. Remove it
            complete_link = re.sub(r';.*', '', complete_link)

            #Build the dict to store the data
            temp_dict = {'genre': genre['genre'],
                         'sub-genre': complete_link.split('/')[-3],
                         'url': complete_link }
            
            #Append the new link to the list
            sub_genres.append(temp_dict)
            
    return sub_genres
        

In [57]:
#Test the above function
#Since there are only 4 overall genres, I will manually grab those urls.
genres = [{
            'genre': 'Horror',
            'url' : 'https://www.barnesandnoble.com/b/books/fiction/horror/_/N-29Z8q8Z1d51'
          },
          {
            'genre': 'Romance',
            'url': 'https://www.barnesandnoble.com/b/books/romance/_/N-29Z8q8Z17y3'
          },
          {
              'genre': 'Mystery and Crime',
              'url' : 'https://www.barnesandnoble.com/b/books/mystery-crime/_/N-29Z8q8Z16g4'
          },
          {
              'genre': 'Sci-Fi and Fantasy',
              'url': 'https://www.barnesandnoble.com/b/books/science-fiction-fantasy/_/N-29Z8q8Z180l'
          }]

In [119]:
#Now run the function and get the list of dictionaries containing the sub-genre urls
sub_genres = get_sub_genre_urls(genres)

Response status code: 200. The Horror genre is good to go.
Response status code: 200. The Romance genre is good to go.
Response status code: 200. The Mystery and Crime genre is good to go.
Response status code: 200. The Sci-Fi and Fantasy genre is good to go.


In [120]:
sub_genres[0]

{'genre': 'Horror',
 'sub-genre': 'ghost-stories',
 'url': 'https://www.barnesandnoble.com/b/books/horror/ghost-stories/_/N-29Z8q8Z1d5e'}

In [121]:
len(sub_genres)

43

Now, I have 43 different links. From here, I will create a process to gather all of the individual book links from each sub-genre. This will involve going through every page associated with the sub-genre.

In [66]:
response = get(sub_genres[0]['url'], headers = {'user-agent':'Codeup DS Germain'})

In [67]:
response

<Response [200]>

In [68]:
soup = BeautifulSoup(response.content, 'html.parser')

In [70]:
#First find out how many pages there are for the sub-genre
page_list = soup.find('ul', class_ = 'pagination search-pagination')

In [75]:
#Now gather the anchor tags in the list
a_tags = page_list.find_all('a')

In [77]:
#Since the last anchor tag is for the very next page,
#Select the second to last anchor tag. It will have the number associated with 
#the last page in the sub-genre
a_tags[-2]

<a href="https://www.barnesandnoble.com/b/books/horror/ghost-stories/_/N-29Z8q8Z1d5e?Nrpp=20&amp;page=29">
<span class="sr-only">Page</span>
29</a>

In [84]:
last_page = int(a_tags[-2].text.split('\n')[2])
last_page

29

Now, that we have the last page, we can loop through each page of the sub-genre without worry.

In [85]:
#Now grab the section that contains all of the books on the page
book_container = soup.find('div', class_ = 'product-shelf-grid')

In [97]:
#Now get all the links for each book
book_anchors = book_container.find_all('a', class_ = 'pImageLink')

In [98]:
book_anchors[0].attrs['href']

'/w/a-christmas-carol-charles-dickens/1116639901;jsessionid=5F788B4D0EBF133BDB386599E415C617.prodny_store02-atgap12?ean=9780486268651'

In [99]:
#Create list to hold all of the individual book urls
book_urls = []

In [100]:
#Now complete the href
#Loop through each item in the links list
for anchor in book_anchors:
    #Build the complete link
    complete_link = 'https://www.barnesandnoble.com' + anchor.attrs['href']
    
    #Everything after the semicolon in the url is not needed. Remove it
    complete_link = re.sub(r';.*', '', complete_link)
    
    #Create a temp_dict to store info
    temp_dict = {'genre': 'Horror',
                 'url': complete_link}
    
    #Append the new link to the list
    book_urls.append(temp_dict)

In [103]:
book_urls

[{'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/a-christmas-carol-charles-dickens/1116639901'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/we-have-always-lived-in-the-castle-shirley-jackson/1100733872'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/haunting-of-hill-house-shirley-jackson/1100285791'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/nothing-but-blackened-teeth-cassandra-khaw/1138427401'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/horrorstor-grady-hendrix/1117004682'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/the-whispering-dead-darcy-coates/1137754132'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/classic-ghost-stories-various/1134585800'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/a-christmas-carol-kareen-taylerson/1104780873'},
 {'genre': 'Horror',
  'url': 'https://www.barnesandnoble.com/w/heart-shaped-box-joe-h

In [104]:
#Create a function that gets the total number of pages available for a sub-genre
def get_last_page(soup):
    """
    This function takes in the content of an html page. It should be the first page for 
    any sub-genre. It will find and return the number corresponding to the last page 
    available for the sub-genre.
    """
    #Now, find out how many pages there are in the sub-genre
    page_list = soup.find('ul', class_ = 'pagination search-pagination')

    #Now gather the anchor tags in the list
    a_tags = page_list.find_all('a')
    
    #Since the last anchor tag is for the very next page,
    #Select the second to last anchor tag. It will have the number associated with 
    #the last page in the sub-genre
    last_page = int(a_tags[-2].text.split('\n')[2])
    
    return last_page

In [124]:
#Now build a function that gathers the book urls from all available pages in the sub-genre
def get_book_urls(sub_genres):
    """
    This function takes in a dictionary of sub-genre urls. It will loop through each of them
    and gather the individual book urls from all available pages in the sub-genre.
    It will return a list of dictionaries, with each dictionary containing the book's 
    overall genre and the individual book's url.
    """
    #First create an empty list to hold the book dictionaries
    book_urls = []
    
    #Now loop through each sub-genre
    for sub_genre in sub_genres:
        #Get the initial page for the sub-genre
        response = get(sub_genre['url'], headers = {'user-agent':'Codeup DS Germain'})
        
        #Check response
        if response.status_code == 200:
            print(f'Got first page of sub-genre: {sub_genre["genre"]}, {sub_genre["sub-genre"]}')
        else:
            print(f'Something went wrong at {sub_genre["genre"]}, {sub_genre["sub-genre"]}. Status code: {response.status_code}')
        
        #Parse it 
        soup = BeautifulSoup(response.content, 'html.parser')
        
        #Get the number corresponding to the last available page for the sub-genre
        last_page = get_last_page(soup)
        
        #Now increment through the pages
        for i in range(2, last_page + 1):
            
            #Scrape all book urls on the current page
            #Grab the section that contains all of the books on the page
            book_container = soup.find('div', class_ = 'product-shelf-grid')
            
            #Now get all the links for each book
            book_anchors = book_container.find_all('a', class_ = 'pImageLink')
    
            #Loop through each item in the links list
            for anchor in book_anchors:
                #Build the complete link
                complete_link = 'https://www.barnesandnoble.com' + anchor.attrs['href']

                #Everything after the semicolon in the url is not needed. Remove it
                complete_link = re.sub(r';.*', '', complete_link)

                #Create a temp_dict to store info
                temp_dict = {'genre': sub_genre['genre'],
                             'sub-genre': sub_genre['sub-genre'],
                             'url': complete_link}

                #Append the new link to the list
                book_urls.append(temp_dict)
                
            #Now create the url for the next page
            #I believe Nrpp stands for Num results per page, so we can change that if we want
            next_page = sub_genre['url'] + '?Nrpp=20&page=' + str(i)
            
            #Now get the content of the next page
            response = get(next_page, headers = {'user-agent': 'Codeup DS Germain'})
            
            #Check the response code again
            if response.status_code == 200:
                print(f'Acquired page {i} of {last_page} pages.')
            else:
                print(f'Something went wrong at page {i} of {last_page} pages! Status Code: {response.status_code}')
            
            #Now parse and update the soup variable for the next loop iteration
            soup = BeautifulSoup(response.content, 'html.parser')
            
    return book_urls

In [125]:
#Now run the function above and get the individual book urls
#Hopefully this is not going to take long
#Note: This took about half an hour
book_urls = get_book_urls(sub_genres)

Got first page of sub-genre: Horror, ghost-stories
Acquired page 2 of 29 pages.
Acquired page 3 of 29 pages.
Acquired page 4 of 29 pages.
Acquired page 5 of 29 pages.
Acquired page 6 of 29 pages.
Acquired page 7 of 29 pages.
Acquired page 8 of 29 pages.
Acquired page 9 of 29 pages.
Acquired page 10 of 29 pages.
Acquired page 11 of 29 pages.
Acquired page 12 of 29 pages.
Acquired page 13 of 29 pages.
Acquired page 14 of 29 pages.
Acquired page 15 of 29 pages.
Acquired page 16 of 29 pages.
Acquired page 17 of 29 pages.
Acquired page 18 of 29 pages.
Acquired page 19 of 29 pages.
Acquired page 20 of 29 pages.
Acquired page 21 of 29 pages.
Acquired page 22 of 29 pages.
Acquired page 23 of 29 pages.
Acquired page 24 of 29 pages.
Acquired page 25 of 29 pages.
Acquired page 26 of 29 pages.
Acquired page 27 of 29 pages.
Acquired page 28 of 29 pages.
Acquired page 29 of 29 pages.
Got first page of sub-genre: Horror, ghost-stories-other
Acquired page 2 of 50 pages.
Acquired page 3 of 50 pages.
Ac

Acquired page 16 of 28 pages.
Acquired page 17 of 28 pages.
Acquired page 18 of 28 pages.
Acquired page 19 of 28 pages.
Acquired page 20 of 28 pages.
Acquired page 21 of 28 pages.
Acquired page 22 of 28 pages.
Acquired page 23 of 28 pages.
Acquired page 24 of 28 pages.
Acquired page 25 of 28 pages.
Acquired page 26 of 28 pages.
Acquired page 27 of 28 pages.
Acquired page 28 of 28 pages.
Got first page of sub-genre: Horror, psychological-horror
Acquired page 2 of 27 pages.
Acquired page 3 of 27 pages.
Acquired page 4 of 27 pages.
Acquired page 5 of 27 pages.
Acquired page 6 of 27 pages.
Acquired page 7 of 27 pages.
Acquired page 8 of 27 pages.
Acquired page 9 of 27 pages.
Acquired page 10 of 27 pages.
Acquired page 11 of 27 pages.
Acquired page 12 of 27 pages.
Acquired page 13 of 27 pages.
Acquired page 14 of 27 pages.
Acquired page 15 of 27 pages.
Acquired page 16 of 27 pages.
Acquired page 17 of 27 pages.
Acquired page 18 of 27 pages.
Acquired page 19 of 27 pages.
Acquired page 20 of 

Acquired page 12 of 50 pages.
Acquired page 13 of 50 pages.
Acquired page 14 of 50 pages.
Acquired page 15 of 50 pages.
Acquired page 16 of 50 pages.
Acquired page 17 of 50 pages.
Acquired page 18 of 50 pages.
Acquired page 19 of 50 pages.
Acquired page 20 of 50 pages.
Acquired page 21 of 50 pages.
Acquired page 22 of 50 pages.
Acquired page 23 of 50 pages.
Acquired page 24 of 50 pages.
Acquired page 25 of 50 pages.
Acquired page 26 of 50 pages.
Acquired page 27 of 50 pages.
Acquired page 28 of 50 pages.
Acquired page 29 of 50 pages.
Acquired page 30 of 50 pages.
Acquired page 31 of 50 pages.
Acquired page 32 of 50 pages.
Acquired page 33 of 50 pages.
Acquired page 34 of 50 pages.
Acquired page 35 of 50 pages.
Acquired page 36 of 50 pages.
Acquired page 37 of 50 pages.
Acquired page 38 of 50 pages.
Acquired page 39 of 50 pages.
Acquired page 40 of 50 pages.
Acquired page 41 of 50 pages.
Acquired page 42 of 50 pages.
Acquired page 43 of 50 pages.
Acquired page 44 of 50 pages.
Acquired p

Acquired page 31 of 50 pages.
Acquired page 32 of 50 pages.
Acquired page 33 of 50 pages.
Acquired page 34 of 50 pages.
Acquired page 35 of 50 pages.
Acquired page 36 of 50 pages.
Acquired page 37 of 50 pages.
Acquired page 38 of 50 pages.
Acquired page 39 of 50 pages.
Acquired page 40 of 50 pages.
Acquired page 41 of 50 pages.
Acquired page 42 of 50 pages.
Acquired page 43 of 50 pages.
Acquired page 44 of 50 pages.
Acquired page 45 of 50 pages.
Acquired page 46 of 50 pages.
Acquired page 47 of 50 pages.
Acquired page 48 of 50 pages.
Acquired page 49 of 50 pages.
Acquired page 50 of 50 pages.
Got first page of sub-genre: Mystery and Crime, multicultural-detectives-fiction
Acquired page 2 of 50 pages.
Acquired page 3 of 50 pages.
Acquired page 4 of 50 pages.
Acquired page 5 of 50 pages.
Acquired page 6 of 50 pages.
Acquired page 7 of 50 pages.
Acquired page 8 of 50 pages.
Acquired page 9 of 50 pages.
Acquired page 10 of 50 pages.
Acquired page 11 of 50 pages.
Acquired page 12 of 50 page

Acquired page 49 of 50 pages.
Acquired page 50 of 50 pages.
Got first page of sub-genre: Sci-Fi and Fantasy, fantasy-fiction
Acquired page 2 of 50 pages.
Acquired page 3 of 50 pages.
Acquired page 4 of 50 pages.
Acquired page 5 of 50 pages.
Acquired page 6 of 50 pages.
Acquired page 7 of 50 pages.
Acquired page 8 of 50 pages.
Acquired page 9 of 50 pages.
Acquired page 10 of 50 pages.
Acquired page 11 of 50 pages.
Acquired page 12 of 50 pages.
Acquired page 13 of 50 pages.
Acquired page 14 of 50 pages.
Acquired page 15 of 50 pages.
Acquired page 16 of 50 pages.
Acquired page 17 of 50 pages.
Acquired page 18 of 50 pages.
Acquired page 19 of 50 pages.
Acquired page 20 of 50 pages.
Acquired page 21 of 50 pages.
Acquired page 22 of 50 pages.
Acquired page 23 of 50 pages.
Acquired page 24 of 50 pages.
Acquired page 25 of 50 pages.
Acquired page 26 of 50 pages.
Acquired page 27 of 50 pages.
Acquired page 28 of 50 pages.
Acquired page 29 of 50 pages.
Acquired page 30 of 50 pages.
Acquired page

Acquired page 31 of 50 pages.
Acquired page 32 of 50 pages.
Acquired page 33 of 50 pages.
Acquired page 34 of 50 pages.
Acquired page 35 of 50 pages.
Acquired page 36 of 50 pages.
Acquired page 37 of 50 pages.
Acquired page 38 of 50 pages.
Acquired page 39 of 50 pages.
Acquired page 40 of 50 pages.
Acquired page 41 of 50 pages.
Acquired page 42 of 50 pages.
Acquired page 43 of 50 pages.
Acquired page 44 of 50 pages.
Acquired page 45 of 50 pages.
Acquired page 46 of 50 pages.
Acquired page 47 of 50 pages.
Acquired page 48 of 50 pages.
Acquired page 49 of 50 pages.
Acquired page 50 of 50 pages.
Got first page of sub-genre: Sci-Fi and Fantasy, social-science-fiction
Acquired page 2 of 50 pages.
Acquired page 3 of 50 pages.
Acquired page 4 of 50 pages.
Acquired page 5 of 50 pages.
Acquired page 6 of 50 pages.
Acquired page 7 of 50 pages.
Acquired page 8 of 50 pages.
Acquired page 9 of 50 pages.
Acquired page 10 of 50 pages.
Acquired page 11 of 50 pages.
Acquired page 12 of 50 pages.
Acquir

In [126]:
len(book_urls)

29340

Awesome, now that I have the URLs for each individual book, I can begin scraping their cover blurbs. However, I will first convert 'book_urls' to a df, remove any duplicates, and save it as a .csv.

In [127]:
#Convert to data frame
url_df = pd.DataFrame(book_urls)

In [128]:
url_df.head()

Unnamed: 0,genre,sub-genre,url
0,Horror,ghost-stories,https://www.barnesandnoble.com/w/a-christmas-c...
1,Horror,ghost-stories,https://www.barnesandnoble.com/w/we-have-alway...
2,Horror,ghost-stories,https://www.barnesandnoble.com/w/haunting-of-h...
3,Horror,ghost-stories,https://www.barnesandnoble.com/w/nothing-but-b...
4,Horror,ghost-stories,https://www.barnesandnoble.com/w/horrorstor-gr...


In [131]:
url_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29340 entries, 0 to 29339
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   genre      29340 non-null  object
 1   sub-genre  29340 non-null  object
 2   url        29340 non-null  object
dtypes: object(3)
memory usage: 687.8+ KB


In [132]:
#Remove any duplicate urls
url_df = url_df.drop_duplicates(subset = ['url'])

In [133]:
url_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21640 entries, 0 to 29339
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   genre      21640 non-null  object
 1   sub-genre  21640 non-null  object
 2   url        21640 non-null  object
dtypes: object(3)
memory usage: 676.2+ KB


We started with 29,340 entries, but after removing duplicates, we are left with 21,640 entries.

How many are in each genre?

In [136]:
url_df.genre.value_counts(normalize = True)

Sci-Fi and Fantasy    0.317329
Horror                0.303235
Mystery and Crime     0.205499
Romance               0.173937
Name: genre, dtype: float64

I don't think these proportions are ridiculous.

In [137]:
#Now save url_df as a .csv
url_df.to_csv('book_urls.csv', index = False)

In [138]:
url_df = url_df.to_dict()

In [160]:
#Now get the cover blurbs
response = get(book_urls[1]['url'], headers = {'user-agent': 'Codeup DS Germain'})

In [161]:
response

<Response [200]>

In [162]:
soup = BeautifulSoup(response.content, 'html.parser')

In [163]:
#Get the description
blurb = soup.find('div', itemprop = 'description')

In [164]:
blurb.text.strip().replace('\xa0 ', '')

'Part of the Penguin Orange Collection, a limited-run series of twelve influential and beloved American classics in a bold series design offering a modern take on the iconic Penguin paperbackWinner of the 2016 AIGA + Design Observer 50 Books | 50 Covers competitionFor the seventieth anniversary of Penguin Classics, the Penguin Orange Collection celebrates the heritage of Penguin’s iconic book design with twelve influential American literary classics representing the breadth and diversity of the Penguin Classics library. These collectible editions are dressed in the iconic orange and white tri-band cover design, first created in 1935, while french flaps, high-quality paper, and striking cover illustrations provide the cutting-edge design treatment that is the signature of Penguin Classics Deluxe Editions today.We Have Always Lived in the Castle Taking readers deep into a labyrinth of dark neurosis, We Have Always Lived in the Castle is perhaps the crowning achievement of Shirley Jackson

In [166]:
#Now create a function that loops through all the urls and retrieves the description of each book
def get_book_blurbs(book_urls):
    """
    This function takes in a list of dictionaries that contain a book's overall genre, its sub-genre,
    and its unique url. This function will loop through each url in the given list of dictionaries,
    and scrape the description of each book. It will finally return a new list of dictionaries with 
    the books' descriptions included. It will also keep track of how long the function runs.
    """
    #Start the timer
    time_start = time.perf_counter()
    
    #Create the empty list to store the dictionaries
    book_blurbs = []
    
    #Now loop through each book url in the 'book_urls'
    #Enumerate so that you can track progress with 'i'
    for i, book in enum(book_urls):
        #Get the page content
        response = get(book['url'], headers = {'user-agent': 'Codeup DS Germain'})
        
        #Check the status code. If there is a problem, leave a message
        if response.status_code != 200:
            print(f'Something went wrong at book url #{i}! Status Code: {response.status_code}')
            print(f'Genre, Subgenre: {book["genre"]}, {book["sub-genre"]}')
            print(f'Book URL: {book["url"]}')
            break
        
        #parse it
        soup = BeautifulSoup(response.content, 'html.parser')
        
        #Get the description
        blurb = soup.find('div', itemprop = 'description')
        
        #Remove leading and trailing whitespace and a set of characters I found 
        #That seem to appear occasionally
        blurb = blurb.text.strip().replace('\xa0 ', '')
        
        #Create temp dict
        temp_dict = {'genre': book['genre'],
                     'sub-genre': book['sub-genre'],
                     'blurb': blurb
                    }
        
        #Append the temp_dict to the book_blurbs list
        book_blurbs.append(temp_dict)
        
        #Every 500 entries, leave a progress message
        if i % 500 == 0:
            print(f'Total Entries So Far: {i}')
            print(f'Total Entries Left: {len(book_urls) - i}')
            print(f'Total Time So Far: { (time.perf_counter() - time_start) / 60} minutes\n')
        
    #Inform the user that the function is done. Provide total time
    print(f'Function Complete! Total Time: { (time.perf_counter() - time_start) / 60} minutes')
    
    return book_blurbs

In [None]:
#Now use the function above to retrieve all the individual book blurbs
book_blurbs = get_book_blurbs(book_urls)