In [1]:
!pip install pandas



You should consider upgrading via the 'python -m pip install --upgrade pip' command.




In [2]:
##Necessary Imports

import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
##function to -
##a) fetch the HTML of a web page using requests
##b) parse the HTML content with BeautifulSoup so that it can be searched easily

def getAndParseURL(url, timeout=30):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed page, or ``None`` (after printing "URL Not Accessible")
        when the request fails or the server answers with an HTTP error status.
    """
    try:
        result = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        result.raise_for_status()
    except requests.exceptions.RequestException:
        # Only network/HTTP problems are handled here; programming errors and
        # KeyboardInterrupt are no longer swallowed.
        print("URL Not Accessible")
        return None
    return BeautifulSoup(result.text, 'html.parser')

In [5]:
##function to find the URLs of all the books listed on a page

def getBooksURLs(url):
    """Return the absolute URL of every book listed on the page at ``url``.

    Each ``<article class="product_pod">`` element on the page contributes the
    href of the first link inside its first ``<div>``. The hrefs are resolved
    against ``url`` itself, so the trailing file name of ``url`` (for example
    ``index.html``) is dropped automatically.

    Parameters
    ----------
    url : str
        Address of a listing page.

    Returns
    -------
    list of str
        One absolute URL per book, in page order. The list is empty when the
        page could not be fetched.
    """
    soup = getAndParseURL(url)
    if soup is None:
        # The fetch failed (getAndParseURL returned nothing): no books to report.
        return []
    return [
        urljoin(url, article.div.a.get('href'))
        for article in soup.find_all("article", class_="product_pod")
    ]

In [6]:
##website link to be scraped (first listing page; the crawl starts here)
main_url = "http://books.toscrape.com/index.html"

In [9]:
# store all the results into a list
pages_urls = [main_url]

soup = getAndParseURL(pages_urls[0])

# Links whose href contains the word "page" are the pager buttons:
#   - two matches -> the page has both a 'previous' and a 'next' button
#   - one match   -> we are either on the first page ('next' only)
#                    or on the last page ('previous' only)
# We keep following the 'next' button and stop when we get to the last page.
# The loop also stops when a page cannot be fetched (soup is None) or when a
# page has no pager links at all, instead of raising an error.

while soup is not None:

    # look the pager links up once per page
    page_links = soup.find_all("a", href=re.compile("page"))
    on_first_page = len(pages_urls) == 1

    # no pager at all, or a single 'previous' link on a later page: we are done
    if not page_links or not (len(page_links) == 2 or on_first_page):
        break

    # get the new complete url by resolving the href of the 'next' link
    # against the URL of the page it was found on
    new_url = urljoin(pages_urls[-1], page_links[-1].get("href"))

    # add the URL to the list
    pages_urls.append(new_url)

    # parse the next page
    soup = getAndParseURL(new_url)


# Explanation of the code where new_url is being created is below

# urljoin(pages_urls[-1], href) -> resolves the relative href against the URL of the current page:
# the last path segment of the current URL ("index.html" or "page-x.html") is replaced by the href

# page_links[-1].get("href") -> Suppose a page has links to "previous page" and "next page"
# page_links holds all <a> tags whose href has the word "page" in it. The list has 2 links
# First link would correspond to the link of the previous page
# Second link corresponds to the link of the next page
# We are taking the link of the next page hence selecting the last link ([-1])

In [19]:
# Display whatever page was most recently parsed into `soup` (full HTML, for inspection only)
soup


<!DOCTYPE html>

<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    1,000 Places to See Before You Die | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="
    Around the World, continent by continent, here is the best the world has to offer: 1,000 places guaranteed to give travelers the shivers. Sacred ruins, grand hotels, wildlife preserves, hilltop villages, snack shacks, castles, festivals, reefs, restaurants, cathedrals, hidden islands, opera houses, museums, and more. Each entry tells exactly why it's essential to visit. Th Around the World, continent by continent, here 

In [10]:
## Gather the product URLs of every listing page, keeping page order

booksURLs = []
for page in pages_urls:
    # in-place concatenation of this page's book links
    booksURLs += getBooksURLs(page)

In [17]:
# Preview the first ten collected book URLs
booksURLs[:10]

['http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'http://books.toscrape.com/catalogue/soumission_998/index.html',
 'http://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'http://books.toscrape.com/catalogue/the-requiem-red_995/index.html',
 'http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'http://books.toscrape.com/catalogue/the-black-maria_991/index.html']

In [11]:
##Scrape the Name, Price, Availability, Image, Category and Rating of every product


%%time

# One list per output column; index i of every list describes the same book.
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []
# Book pages that could not be fetched, kept so they can be inspected or retried.
failed_urls = []

# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)
    if soup is None:
        # getAndParseURL gives back None when the request fails; skip this book
        # rather than aborting a long crawl part-way through.
        failed_urls.append(url)
        continue

    # product name
    name = soup.find("div", class_=re.compile("product_main")).h1.text
    # product price: keep only digits and the decimal point, so the currency
    # sign is removed however many characters it was decoded into
    price = re.sub(r"[^0-9.]", "", soup.find("p", class_="price_color").text)
    # number of available products: get rid of non numerical characters
    stock = re.sub("[^0-9]", "", soup.find("p", class_="instock availability").text)
    # image url: the src of the first image, appended to the book's folder URL
    img_url = url.replace("index.html", "") + soup.find("img").get("src")
    # product category: 4th path segment of the "../category/books/<category>/..." link
    # (dots escaped so they match literal dots, not any character)
    category = soup.find("a", href=re.compile(r"\.\./category/books/")).get("href").split("/")[3]
    # ratings: second CSS class of the star-rating paragraph
    rating = soup.find("p", class_=re.compile("star-rating")).get("class")[1]

    # Append only once every field has been extracted, so that the lists can
    # never end up with different lengths.
    names.append(name)
    prices.append(price)
    nb_in_stock.append(stock)
    img_urls.append(img_url)
    categories.append(category)
    ratings.append(rating)

Wall time: 11min 3s


In [12]:
## Assemble the scraped columns into a Pandas DataFrame (one row per book)

scraped_data = pd.DataFrame(
    dict(
        name=names,
        price=prices,
        nb_in_stock=nb_in_stock,
        url_img=img_urls,
        product_category=categories,
        rating=ratings,
    )
)
scraped_data.head()

Unnamed: 0,name,price,nb_in_stock,url_img,product_category,rating
0,A Light in the Attic,51.77,22,http://books.toscrape.com/catalogue/a-light-in...,poetry_23,Three
1,Tipping the Velvet,53.74,20,http://books.toscrape.com/catalogue/tipping-th...,historical-fiction_4,One
2,Soumission,50.1,20,http://books.toscrape.com/catalogue/soumission...,fiction_10,One
3,Sharp Objects,47.82,20,http://books.toscrape.com/catalogue/sharp-obje...,mystery_3,Four
4,Sapiens: A Brief History of Humankind,54.23,20,http://books.toscrape.com/catalogue/sapiens-a-...,history_32,Five


In [20]:
##Exporting Data to CSV
## The file is written relative to the notebook's working directory so that the
## export does not depend on a user-specific absolute path.

OUTPUT_CSV = "books_scraped_data.csv"
scraped_data.to_csv(OUTPUT_CSV, index=False, header=True)

