# Requirements Preparation
Install requirements for the session and save them for reuse (Environment recreation)

# Requirements Import
Import base requirements for this section
*It's advisable to import just what you need to reduce the data held in memory*

In [1]:
from bs4 import BeautifulSoup as bs #for scraping
import pandas as pd #data manipulation
import requests #http requests
import re #regex manipulation
import os #system access

# Data Discovery and Testing
Let's perform some basic data discovery

In [2]:
main_url="http://books.toscrape.com/"
result = requests.get(main_url)
print(result.text[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
    <head>
        <title>
    All products | Books to Scrape - Sandbox
</title>

        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
        <meta name="created" content="24th Jun 2016 09:29" />
        <meta name="description" content="" />
        <meta name="viewport" content="width=device-width" />
        <meta name="robots" content="NOARCHIVE,NOCACHE" />

        <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
        <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->

        
            <link rel="shortcut icon" href="static/oscar/favicon.

In [3]:
soup = bs(result.text, 'html.parser')

print(soup.prettify()[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

Function to fetch and parse any page

In [4]:
def getAndParseURL(url):
    result = requests.get(url)
    soup = bs(result.text, 'html.parser')
    return soup

We continue some data discovery and testing

In [5]:
soup.find("article", class_ = "product_pod") #get first product section

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

Get the actual product URL

In [6]:
soup.find("article", class_ = "product_pod").div.a.get('href')

'catalogue/a-light-in-the-attic_1000/index.html'

Get all the product URLs from the first page

In [7]:
main_page_products_urls = [x.div.a.get('href') for x in soup.findAll("article", class_ = "product_pod")]

print(str(len(main_page_products_urls)) + " fetched products URLs")
main_page_products_urls

20 fetched products URLs


['catalogue/a-light-in-the-attic_1000/index.html',
 'catalogue/tipping-the-velvet_999/index.html',
 'catalogue/soumission_998/index.html',
 'catalogue/sharp-objects_997/index.html',
 'catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'catalogue/the-requiem-red_995/index.html',
 'catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'catalogue/the-black-maria_991/index.html',
 'catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html',
 'catalogue/shakespeares-sonnets_989/index.html',
 'catalogue/set-me-free_988/index.html',
 'catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html',
 'catalogue/rip-it-up-and-start-again_986/index.html',
 'catalogue/our-band-could-be-your-life-scene

Function to get the product urls and reformat part of the url

In [8]:

def getBooksURLs(url):
    soup = getAndParseURL(url)
    # remove the index.html part of the base url before returning the results
    return ["/".join(url.split("/")[:-1]) + "/" + x.div.a.get('href') for x in soup.findAll("article", class_ = "product_pod")]

Get a list of the category urls

In [9]:
categories_urls = [main_url + x.get('href') for x in soup.find_all("a", href=re.compile("catalogue/category/books"))]
categories_urls = categories_urls[1:] # we remove the first one because it corresponds to all the books

print(str(len(categories_urls)) + " fetched categories URLs")
categories_urls

50 fetched categories URLs


['http://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html',
 'http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html',
 'http://books.toscrape.com/catalogue/category/books/romance_8/index.html',
 'http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html',
 'http://books.toscrape.com/catalogue/category/books/fiction_10/index.html',
 'http://books.toscrape.com/catalogue/category/books/childrens_11/index.html',
 'http://books.toscrape.com/catalogue/category/books/religion_12/index.html',
 'http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html',
 'http://books.toscrape.com/catalogue/category/boo

Get all the catalogue urls we shall parse and fetch product urls

In [10]:
# store all the results into a list
pages_urls = [main_url]

soup = getAndParseURL(pages_urls[0])

# while we get two matches, this means that the webpage contains a 'previous' and a 'next' button
# if there is only one button, this means that we are either on the first page or on the last page
# we stop when we get to the last page

while len(soup.findAll("a", href=re.compile("page"))) == 2 or len(pages_urls) == 1:
    
    # get the new complete url by adding the fetched URL to the base URL (and removing the .html part of the base URL)
    new_url = "/".join(pages_urls[-1].split("/")[:-1]) + "/" + soup.findAll("a", href=re.compile("page"))[-1].get("href")
    
    # add the URL to the list
    pages_urls.append(new_url)
    
    # parse the next page
    soup = getAndParseURL(new_url)
    

print(str(len(pages_urls)) + " fetched URLs")

pages_urls

50 fetched URLs


['http://books.toscrape.com/',
 'http://books.toscrape.com/catalogue/page-2.html',
 'http://books.toscrape.com/catalogue/page-3.html',
 'http://books.toscrape.com/catalogue/page-4.html',
 'http://books.toscrape.com/catalogue/page-5.html',
 'http://books.toscrape.com/catalogue/page-6.html',
 'http://books.toscrape.com/catalogue/page-7.html',
 'http://books.toscrape.com/catalogue/page-8.html',
 'http://books.toscrape.com/catalogue/page-9.html',
 'http://books.toscrape.com/catalogue/page-10.html',
 'http://books.toscrape.com/catalogue/page-11.html',
 'http://books.toscrape.com/catalogue/page-12.html',
 'http://books.toscrape.com/catalogue/page-13.html',
 'http://books.toscrape.com/catalogue/page-14.html',
 'http://books.toscrape.com/catalogue/page-15.html',
 'http://books.toscrape.com/catalogue/page-16.html',
 'http://books.toscrape.com/catalogue/page-17.html',
 'http://books.toscrape.com/catalogue/page-18.html',
 'http://books.toscrape.com/catalogue/page-19.html',
 'http://books.toscrape

Alternatively, as we know the limits for the pages, we can generate and test the urls 

In [11]:
pages_urls = []

new_page = "http://books.toscrape.com/catalogue/page-1.html"
while requests.get(new_page).status_code == 200:
    pages_urls.append(new_page)
    new_page = pages_urls[-1].split("-")[0] + "-" + str(int(pages_urls[-1].split("-")[1].split(".")[0]) + 1) + ".html"
    

print(str(len(pages_urls)) + " fetched URLs")

pages_urls

50 fetched URLs


['http://books.toscrape.com/catalogue/page-1.html',
 'http://books.toscrape.com/catalogue/page-2.html',
 'http://books.toscrape.com/catalogue/page-3.html',
 'http://books.toscrape.com/catalogue/page-4.html',
 'http://books.toscrape.com/catalogue/page-5.html',
 'http://books.toscrape.com/catalogue/page-6.html',
 'http://books.toscrape.com/catalogue/page-7.html',
 'http://books.toscrape.com/catalogue/page-8.html',
 'http://books.toscrape.com/catalogue/page-9.html',
 'http://books.toscrape.com/catalogue/page-10.html',
 'http://books.toscrape.com/catalogue/page-11.html',
 'http://books.toscrape.com/catalogue/page-12.html',
 'http://books.toscrape.com/catalogue/page-13.html',
 'http://books.toscrape.com/catalogue/page-14.html',
 'http://books.toscrape.com/catalogue/page-15.html',
 'http://books.toscrape.com/catalogue/page-16.html',
 'http://books.toscrape.com/catalogue/page-17.html',
 'http://books.toscrape.com/catalogue/page-18.html',
 'http://books.toscrape.com/catalogue/page-19.html',
 '

We now fetch and store all the product URLS

In [13]:
%%time
booksURLs = []
for page in pages_urls:
    booksURLs.extend(getBooksURLs(page))
print(str(len(booksURLs)) + " fetched URLs")
booksURLs

1000 fetched URLs
Wall time: 24.4 s


',
 'http://books.toscrape.com/catalogue/the-blind-side-evolution-of-a-game_231/index.html',
 'http://books.toscrape.com/catalogue/the-autobiography-of-malcolm-x_230/index.html',
 'http://books.toscrape.com/catalogue/the-art-of-simple-food-notes-lessons-and-recipes-from-a-delicious-revolution_229/index.html',
 'http://books.toscrape.com/catalogue/the-art-of-fielding_228/index.html',
 'http://books.toscrape.com/catalogue/surely-youre-joking-mr-feynman-adventures-of-a-curious-character_227/index.html',
 'http://books.toscrape.com/catalogue/stiff-the-curious-lives-of-human-cadavers_226/index.html',
 'http://books.toscrape.com/catalogue/spilled-milk-based-on-a-true-story_225/index.html',
 'http://books.toscrape.com/catalogue/something-borrowed-darcy-rachel-1_224/index.html',
 'http://books.toscrape.com/catalogue/something-blue-darcy-rachel-2_223/index.html',
 'http://books.toscrape.com/catalogue/soldier-talon-3_222/index.html',
 'http://books.toscrape.com/catalogue/shopaholic-baby-shopahol

# Actual Data scraping

In [14]:
%%time
names = []
prices = []
nb_in_stock = []
categories = []
ratings = []
# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)
    # product name
    names.append(soup.find("div", class_ = re.compile("product_main")).h1.text)
    # product price
    prices.append(soup.find("p", class_ = "price_color").text[2:]) # get rid of the pound sign
    # product category
    categories.append(soup.find("a", href = re.compile("../category/books/")).get("href").split("/")[3])
    # ratings
    ratings.append(soup.find("p", class_ = re.compile("star-rating")).get("class")[1])
    # number of available products
    nb_in_stock.append(re.sub("[^0-9]", "", soup.find("p", class_ = "instock availability").text)) # get rid of non numerical characters
    
# add data into pandas df
scraped_data = pd.DataFrame({'Name': names, 'Price': prices, "Category": categories, "Rating": ratings, 'Stock': nb_in_stock})
scraped_data

Wall time: 15min 40s


Unnamed: 0,Name,Price,Category,Rating,Stock
0,A Light in the Attic,51.77,poetry_23,Three,22
1,Tipping the Velvet,53.74,historical-fiction_4,One,20
2,Soumission,50.10,fiction_10,One,20
3,Sharp Objects,47.82,mystery_3,Four,20
4,Sapiens: A Brief History of Humankind,54.23,history_32,Five,20
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,classics_6,One,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,sequential-art_5,Four,1
997,A Spy's Devotion (The Regency Spies of London #1),16.97,historical-fiction_4,Five,1
998,1st to Die (Women's Murder Club #1),53.98,mystery_3,One,1


# Post Processing

In [15]:
scraped_data["Category"]=scraped_data["Category"].apply(lambda x: x.split("_")[0])
scraped_data["Category"]

0                  poetry
1      historical-fiction
2                 fiction
3                 mystery
4                 history
              ...        
995              classics
996        sequential-art
997    historical-fiction
998               mystery
999                travel
Name: Category, Length: 1000, dtype: object

In [16]:
categories=scraped_data["Category"].unique()
print(categories)
print(len(categories))

['poetry' 'historical-fiction' 'fiction' 'mystery' 'history' 'young-adult'
 'business' 'default' 'sequential-art' 'music' 'science-fiction'
 'politics' 'travel' 'thriller' 'food-and-drink' 'romance' 'childrens'
 'nonfiction' 'art' 'spirituality' 'philosophy' 'new-adult' 'contemporary'
 'fantasy' 'add-a-comment' 'science' 'health' 'horror' 'self-help'
 'religion' 'christian' 'crime' 'autobiography' 'christian-fiction'
 'biography' 'womens-fiction' 'erotica' 'cultural' 'psychology' 'humor'
 'historical' 'novels' 'short-stories' 'suspense' 'classics' 'academic'
 'sports-and-games' 'adult-fiction' 'parenting' 'paranormal']
50


In [17]:

rating_map={
    "One":1,
    "Two":2,
    "Three":3,
    "Four":4,
    "Five":5
}

# print(type(scraped_data["Rating"][0]) is str)
scraped_data["Rating"]=[rating_map.get(x) for x in scraped_data["Rating"] if type(x) is str]

In [18]:
scraped_data

Unnamed: 0,Name,Price,Category,Rating,Stock
0,A Light in the Attic,51.77,poetry,3,22
1,Tipping the Velvet,53.74,historical-fiction,1,20
2,Soumission,50.10,fiction,1,20
3,Sharp Objects,47.82,mystery,4,20
4,Sapiens: A Brief History of Humankind,54.23,history,5,20
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,classics,1,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,sequential-art,4,1
997,A Spy's Devotion (The Regency Spies of London #1),16.97,historical-fiction,5,1
998,1st to Die (Women's Murder Club #1),53.98,mystery,1,1


In [19]:
scraped_data=scraped_data.astype({"Price": "float32"}).astype({"Stock": "int32"})

In [1]:
data_dir="./data/"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
scraped_data.to_csv(f"{data_dir}/csv/scraped_books.csv")

NameError: name 'scraped_data' is not defined

In [21]:
!ls -lah data

'ls' is not recognized as an internal or external command,
operable program or batch file.


Lets try some data analysis

In [22]:
scraped_data.dtypes

Name         object
Price       float32
Category     object
Rating        int64
Stock         int32
dtype: object

In [23]:
scraped_data.describe()

Unnamed: 0,Price,Rating,Stock
count,1000.0,1000.0,1000.0
mean,35.070351,2.923,8.585
std,14.44669,1.434967,5.654622
min,10.0,1.0,1.0
25%,22.107501,2.0,3.0
50%,35.98,3.0,7.0
75%,47.457499,4.0,14.0
max,59.990002,5.0,22.0


In [24]:
scraped_data["Category"].value_counts()

default               152
nonfiction            110
sequential-art         75
add-a-comment          67
fiction                65
young-adult            54
fantasy                48
romance                35
mystery                32
food-and-drink         30
childrens              29
historical-fiction     26
poetry                 19
classics               19
history                18
horror                 17
womens-fiction         17
science-fiction        16
science                14
music                  13
business               12
philosophy             11
travel                 11
thriller               11
humor                  10
autobiography           9
art                     8
psychology              7
religion                7
new-adult               6
christian-fiction       6
spirituality            6
biography               5
self-help               5
sports-and-games        5
health                  4
christian               3
contemporary            3
politics    

Thats all for this section, onto the next one!