# Scraping Books to Scrape with BeautifulSoup

In [3]:
# import librairies
from bs4 import BeautifulSoup
import pandas as pd
import requests

## Downloading the webpage

In [4]:
url = 'https://books.toscrape.com'

In [5]:
response = requests.get(url)

In [6]:
page_contents = response.text

In [7]:
#Creating a file and loading the page contents in it.
with open('Bookswebpage.html','w') as f:
    f.write(page_contents)

## Parse and extract informations with beautifulSoup

In [8]:
doc = BeautifulSoup(page_contents,'html.parser')

In [9]:
doc

<!DOCTYPE html>

<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="s

## Grabbing book titles

In [10]:
# get_book_titles(doc) grabs the text from a tag within the h3 tag
def get_book_titles(doc):
    Book_title_tags = doc.find_all('h3')
    Book_titles = []
    for tags in Book_title_tags:
        Book_titles.append(tags.text)
    return Book_titles

In [11]:
get_book_titles(doc)

['A Light in the ...',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History ...',
 'The Requiem Red',
 'The Dirty Little Secrets ...',
 'The Coming Woman: A ...',
 'The Boys in the ...',
 'The Black Maria',
 'Starving Hearts (Triangular Trade ...',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little ...",
 'Rip it Up and ...',
 'Our Band Could Be ...',
 'Olio',
 'Mesaerion: The Best Science ...',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

## Grabbing book prices

In [12]:
def get_book_price(doc):
    Book_price_tags = doc.find_all('p', class_ = 'price_color')
    Book_price = []
    for tags in Book_price_tags:
        Book_price.append(tags.text.replace('Â',''))
    return Book_price

In [13]:
get_book_price(doc)

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17']

## Grabbing stock availability

In [14]:
def get_stock_availability(doc):
    Book_stock_tags = doc.find_all('p', class_ = 'instock availability')
    Book_stock = []
    for tags in Book_stock_tags:
        Book_stock.append(tags.text.strip())
    return Book_stock

In [15]:
get_stock_availability(doc)

['In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock']

## Grabbing rating

In [43]:
def get_rating(doc):
    
    Book_rating_tags = doc.find_all('p', class_="star-rating")
    Book_rating = []
    #print(Book_raiting_tags)
    for tags in Book_rating_tags:
        Book_rating.append(tags.get('class')[1])
    return Book_rating

In [44]:
get_rating(doc)

['Three',
 'One',
 'One',
 'Four',
 'Five',
 'One',
 'Four',
 'Three',
 'Four',
 'One',
 'Two',
 'Four',
 'Five',
 'Five',
 'Five',
 'Three',
 'One',
 'One',
 'Two',
 'Two']

In [117]:
def string_to_number(str):
    number_dict = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}
    return number_dict[str]
    

In [123]:
def test(doc):
    
    product_pods = doc.find_all('article', class_="product_pod")
    pod_dict = []
    
    for pod in product_pods:
        
        title = pod.find('h3').text
        price = float(pod.find('p', class_ ='price_color').text.replace('Â£',''))
        availability = pod.find('p', class_='instock availability').text.strip()
        rating = string_to_number(pod.find('p', class_="star-rating").get('class')[1])
        image_link = pod.find('img').get('src')
        pod_dict.append({'title': title, 'price_pound': price, 'availability': availability, 'rating': rating, 'image_link': image_link})
        
   
    return pod_dict
    

In [113]:
print_1_pod = doc.find_all('article', class_="product_pod")[0]
print_1_pod

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [124]:
a =test(doc)

In [125]:
b = pd.DataFrame(a)

In [126]:
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         20 non-null     object 
 1   price_pound   20 non-null     float64
 2   availability  20 non-null     object 
 3   rating        20 non-null     int64  
 4   image_link    20 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 928.0+ bytes


In [127]:
b

Unnamed: 0,title,price_pound,availability,rating,image_link
0,A Light in the ...,51.77,In stock,3,media/cache/2c/da/2cdad67c44b002e7ead0cc35693c...
1,Tipping the Velvet,53.74,In stock,1,media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f...
2,Soumission,50.1,In stock,1,media/cache/3e/ef/3eef99c9d9adef34639f51066202...
3,Sharp Objects,47.82,In stock,4,media/cache/32/51/3251cf3a3412f53f339e42cac213...
4,Sapiens: A Brief History ...,54.23,In stock,5,media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c...
5,The Requiem Red,22.65,In stock,1,media/cache/68/33/68339b4c9bc034267e1da611ab3b...
6,The Dirty Little Secrets ...,33.34,In stock,4,media/cache/92/27/92274a95b7c251fea59a2b8a7827...
7,The Coming Woman: A ...,17.93,In stock,3,media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78...
8,The Boys in the ...,22.6,In stock,4,media/cache/66/88/66883b91f6804b2323c8369331cb...
9,The Black Maria,52.15,In stock,1,media/cache/58/46/5846057e28022268153beff6d352...


## Collecting data from multiple pages

In [20]:
def get_doc(url):
    response = requests.get(url)
    doc = BeautifulSoup(response.text,'html.parser')
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(response))
    return doc

In [21]:
#
def scrape_multiple_pages(n):
    URL = 'https://books.toscrape.com/catalogue/page-'
    titles,prices,stocks_availability,raiting = [],[],[],[]
    
    for page in range(1,n+1):
        doc = get_doc(URL + str(page)+ '.html')
        titles.extend(get_book_titles(doc))
        prices.extend(get_book_price(doc))
        stocks_availability.extend(get_stock_availability(doc))
        raiting.extend(get_raiting(doc))
        
# a variable book_dict1 is created to store the information as dictionary
# scraped information is stored into Pandas DataFrame.
    book_dict1 = {
                'TITLE':titles,
                'PRICE':prices,
                'STOCK AVAILABILTY':stocks_availability,
                'RAITING':raiting}
    return pd.DataFrame(book_dict1)

In [84]:
#
def test_multi(n):
    URL = 'https://books.toscrape.com/catalogue/page-'
    pod_dict = []
    
    for page in range(1,n+1):
        doc = get_doc(URL + str(page)+ '.html')
        pod_dict.extend(test(doc))
        
# a variable book_dict1 is created to store the information as dictionary
# scraped information is stored into Pandas DataFrame.

    return pd.DataFrame(pod_dict)

In [85]:
test_multi(50)

Unnamed: 0,title,price,availability,rating,image_link
0,A Light in the ...,£51.77,In stock,Three,../media/cache/2c/da/2cdad67c44b002e7ead0cc356...
1,Tipping the Velvet,£53.74,In stock,One,../media/cache/26/0c/260c6ae16bce31c8f8c95dadd...
2,Soumission,£50.10,In stock,One,../media/cache/3e/ef/3eef99c9d9adef34639f51066...
3,Sharp Objects,£47.82,In stock,Four,../media/cache/32/51/3251cf3a3412f53f339e42cac...
4,Sapiens: A Brief History ...,£54.23,In stock,Five,../media/cache/be/a5/bea5697f2534a2f86a3ef27b5...
...,...,...,...,...,...
995,Alice in Wonderland (Alice's ...,£55.53,In stock,One,../media/cache/96/ee/96ee77d71a31b7694dac6855f...
996,"Ajin: Demi-Human, Volume 1 ...",£57.06,In stock,Four,../media/cache/09/7c/097cb5ecc6fb3fbe1690cf0cb...
997,A Spy's Devotion (The ...,£16.97,In stock,Five,../media/cache/1b/5f/1b5ff86f3c75e51e24c573d3f...
998,1st to Die (Women's ...,£53.98,In stock,One,../media/cache/2b/41/2b4161c5b72a4ae386b644682...


In [67]:
scrape_multiple_pages(50)

Unnamed: 0,TITLE,PRICE,STOCK AVAILABILTY,RAITING
0,A Light in the ...,£51.77,In stock,Three
1,Tipping the Velvet,£53.74,In stock,One
2,Soumission,£50.10,In stock,One
3,Sharp Objects,£47.82,In stock,Four
4,Sapiens: A Brief History ...,£54.23,In stock,Five
...,...,...,...,...
995,Alice in Wonderland (Alice's ...,£55.53,In stock,One
996,"Ajin: Demi-Human, Volume 1 ...",£57.06,In stock,Four
997,A Spy's Devotion (The ...,£16.97,In stock,Five
998,1st to Die (Women's ...,£53.98,In stock,One


# Creating a CSV file

In [None]:
#conversion of information stored to a CSV file named SCB.csv
#serial numbers or the index values are removed using index = None
scrape_multiple_pages(50).to_csv('SCB.csv',index = None)