## Web scraping

Install BeautifulSoup

In [1]:
!pip install beautifulsoup4



Import libraries

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

Load the URL and store it

In [3]:
# Download the first "new releases" page. The context manager closes the HTTP
# response (and its underlying socket) as soon as the body has been read.
with urlopen("https://www.amazon.ca/gp/new-releases/books") as url:
    content = url.read()

Create soup object and print it

In [4]:
# Build the parse tree from the downloaded bytes using the lxml-backed parser,
# then write the whole document to the cell output.
soup = BeautifulSoup(markup=content, features="lxml")
print(soup)

<!DOCTYPE html>
<html>
<head>
<link href="https://images-na.ssl-images-amazon.com/images/I/316PqWKEjJL.css?AUIClients/DigitalMusicWebsiteCartAssets" rel="stylesheet"/>
<link href="https://images-na.ssl-images-amazon.com/images/I/31lgsyNHOIL.css?AUIClients/DigitalMusicWebsitePrimeAssets" rel="stylesheet"/>
<link href="https://images-na.ssl-images-amazon.com/images/I/51o-Qd-LvxL._RC|01evdoiemkL.css,01K+Ps1DeEL.css,31yErFkQitL.css,01kivkxD60L.css,11UGC+GXOPL.css,21LK7jaicML.css,11L58Qpo0GL.css,21EuGTxgpoL.css,01Xl9KigtzL.css,21ygesff1yL.css,019SHZnt8RL.css,01qy9K8SDEL.css,11vZhCgAHbL.css,21uiGhnhrlL.css,11WgRxUdJRL.css,01dU8+SPlFL.css,115DrTlDqNL.css,01SHjPML6tL.css,111-D2qRjiL.css,01QrWuRrZ-L.css,31Wkf2OUteL.css,0147N8LucwL.css,01Alnvtt1zL.css,01oZl+VEzRL.css_.css?AUIClients/AmazonUI#not-trident" rel="stylesheet"/>
<script>
(function(g,h,K,la){function U(a){t&&t.tag&&t.tag(q(":","aui",a))}function u(a,b){t&&t.count&&t.count("aui:"+a,0===b?0:b||(t.count("aui:"+a)||0)+1)}function m(a){try{

## Get the hyperlinks of the child webpages
The following shows the HTML structure that contains the hyperlinks to the pages:

    <html>
        .....
        .....
        <div id = 'zg_paginationWrapper' >
            ..... 
            <!-- the URLs of the result pages are in this section -->
           <a href="https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=1">Page 1</a>
            .....
        </div>
        .....
    </html>

In [5]:
# The top-100 list is spread over 5 result pages. Gather the URL of every page
# from the anchors found inside the pagination wrapper <div>.
links = [
    anchor.get('href')
    for wrapper in soup.find_all("div", {"id": "zg_paginationWrapper"})
    for anchor in wrapper.find_all('a')
]

links  # last expression of the cell, so the list is displayed

['https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=1',
 'https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=2',
 'https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=3',
 'https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=4',
 'https://www.amazon.ca/gp/new-releases/books?ie=UTF8&pg=5']

## Get product information from each hyperlink (webpage)

The following shows the HTML structure of each item on a page, to help us find which HTML tags hold the product information:

    <html>
        .....
        <div class = 'zg_itemImmersion' >
                ..... 
               <span class="zg_rankNumber">Rank </span>
               <div class="p13n-sc-line-clamp-1">Book title goes here </div>
               .....
               <span class="p13n-sc-price">CDN$ Price </span>
                .....
               <div class="a-color-base">Author Name </div>
                .....
                <div class="zg_releaseDate">Release Date </div>
                .....
                <div class="a-color-secondary">Format </div>
        </div>
        .....
    </html>


Extract the data from each hyperlink and store it in a dictionary

In [26]:
# Scrape every result page and collect one record per book.
# my_dict maps the rank text (e.g. "1") to a dict with the keys
# "Name", "Price", "Author", "Release Date" and "Format".
my_dict = {}

for link in links:
    # Download and parse the page. The context manager closes the HTTP
    # response once the body has been read.
    with urlopen(link) as url:
        content = url.read()
    soup = BeautifulSoup(content, "lxml")

    # Each book sits in its own <div class="zg_itemImmersion">.
    # books_data is rebound on every iteration, so after the loop it holds
    # the items of the last page only.
    books_data = soup.find_all("div", {"class": "zg_itemImmersion"})

    for item in books_data:
        # Rank: surrounding whitespace and the trailing "." are removed
        # (e.g. "82." -> "82"); the result is the dictionary key.
        key = item.find(class_="zg_rankNumber").get_text().strip(' \t\n\r').rstrip('.')
        my_dict[key] = {}

        # Name
        my_dict[key]["Name"] = item.find(class_="p13n-sc-line-clamp-1").get_text().strip(' \t\n\r')

        # Price: skip the 5-character currency prefix ("CDN$ ") and remove
        # thousands separators so values such as "1,234.56" convert cleanly.
        price = item.find(class_="p13n-sc-price").get_text()[5:].replace(",", "")
        my_dict[key]["Price"] = float(price)

        # Author
        my_dict[key]["Author"] = item.find(class_="a-color-base").get_text().rstrip('\n')

        # Release date: when the element is missing, find() returns None and
        # .get_text() raises AttributeError, which is mapped to "N/A".
        try:
            # Skip the first 14 characters (the length of "Release Date: ").
            unformatedDate = item.find(class_="zg_releaseDate").get_text()[14:]
            # Remove a leftover "te:" fragment; presumably the label is
            # sometimes longer than 14 characters -- TODO confirm against the page.
            my_dict[key]["Release Date"] = unformatedDate.replace("te:", "")
        except AttributeError:
            my_dict[key]["Release Date"] = "N/A"

        # Format
        my_dict[key]["Format"] = item.find(class_="a-color-secondary").get_text()
my_dict

{'1': {'Author': 'Michael Wolff',
  'Format': 'Hardcover',
  'Name': 'Fire and Fury: Inside the Trump White House',
  'Price': 23.51,
  'Release Date': 'Jan. 5 2018'},
 '10': {'Author': 'Daniel H. Pink',
  'Format': 'Hardcover',
  'Name': 'When: The Scientific Secrets of Perfect Timing',
  'Price': 23.01,
  'Release Date': 'Jan. 9 2018'},
 '100': {'Author': 'Kristie Sullivan',
  'Format': 'Paperback',
  'Name': 'Keto Living Day-by-Day: An Inspirational Guide to the Ketogenic Diet, with 130 Deceptively Simple Recipes',
  'Price': 37.31,
  'Release Date': 'March 27 2018'},
 '11': {'Author': 'Gary Vaynerchuk',
  'Format': 'Hardcover',
  'Name': 'Crushing It!: How Great Entrepreneurs Build Their Business and Influence-and How You Can, Too',
  'Price': 31.89,
  'Release Date': 'Jan. 30 2018'},
 '12': {'Author': 'Leila Slimani',
  'Format': 'Paperback',
  'Name': 'The Perfect Nanny: A Novel',
  'Price': 11.19,
  'Release Date': 'Jan. 9 2018'},
 '13': {'Author': 'A. J Finn',
  'Format': 'Hard

Take a look at the raw HTML of the second book on the last scraped page

In [27]:
# Index 1 is the second item of books_data. The scraping loop rebinds
# books_data for every page, so this is an item from the last page processed.
books_data[1]

<div class="zg_itemImmersion" style="height:330px">
<div class="zg_rankDiv">
<span class="zg_rankNumber">
        82.
      </span>
</div>
<div class="zg_itemWrapper">
<div class="a-section a-spacing-none p13n-asin" data-p13n-asin-metadata='{"ref":"zg_bsnr_books_82","asin":"0765388243"}'>
<a class="a-link-normal" href="/Shroud-Eternity-Sister-Darkness-Chronicles/dp/0765388243?_encoding=UTF8&amp;psc=1"><div class="a-section a-spacing-mini"><img alt="Shroud of Eternity: Sister of Darkness: The Nicci Chronicles, Volume II" height="160" src="https://images-na.ssl-images-amazon.com/images/I/A17WXwC9wqL._SL500_SR106,160_.jpg" width="106"/></div>
<div aria-hidden="true" class="p13n-sc-truncate p13n-sc-truncated-hyphen p13n-sc-line-clamp-1" data-rows="1" data-truncate-mix-weblab="true">
            Shroud of Eternity: Sister of Darkness: The Nicci Chronicles, Volume II
        </div>
</a>
<div class="a-row a-size-small"><span class="a-size-small a-color-base">Terry Goodkind</span></div><div cl

Write CSV File

In [43]:
# Write one CSV row per book, ordered by numeric rank.
# newline="" hands line-ending control to the csv module; without it, platforms
# that translate "\n" on write produce a blank line after every row.
with open("prices.csv", "w", encoding="utf-8", newline="") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    # NOTE(review): "Realease Date" is misspelled. It is kept byte-for-byte
    # because it is the column name that readers of prices.csv see.
    writer.writerow([ "Rank","Name", "Author", "Realease Date","Format", "Price"])
    # The keys are rank strings; sort them numerically so "10" follows "9".
    rank = sorted(my_dict, key=int)
    for x in rank:
        book = my_dict[x]
        writer.writerow([x, book['Name'], book['Author'], book["Release Date"], book["Format"], book["Price"]])

Read the CSV file back with pandas and display it

In [44]:
import pandas

# Read prices.csv into a DataFrame and show it as the cell's output.
books_prices = pandas.read_csv(filepath_or_buffer='prices.csv')
books_prices

Unnamed: 0,Rank,Name,Author,Realease Date,Format,Price
0,1,Fire and Fury: Inside the Trump White House,Michael Wolff,Jan. 5 2018,Hardcover,23.51
1,2,The Legend of Zelda: Breath of the Wild: Expan...,Piggyback,Feb. 13 2018,Hardcover,49.99
2,3,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,Jan. 23 2018,Hardcover,21.78
3,4,Trumpocracy: The Corruption of the American Re...,David Frum,Jan. 16 2018,Hardcover,28.79
4,5,Dog Man and Cat Kid,Dav Pilkey,Dec 26 2017,Hardcover,9.90
5,6,Fire and Fury: Inside the Trump White House Pa...,Michael Wolff,2018,Paperback,29.37
6,7,Saga Volume 8,Brian K Vaughan,Jan. 2 2018,Paperback,12.22
7,8,Judgment Detox: Release the Beliefs That Hold ...,Gabrielle Bernstein,Jan. 2 2018,Hardcover,21.87
8,9,The Woman in the Window: A Novel,A. J Finn,Jan. 2 2018,Paperback,14.84
9,10,When: The Scientific Secrets of Perfect Timing,Daniel H. Pink,Jan. 9 2018,Hardcover,23.01


In [45]:
from IPython.display import HTML

# Render the live "new releases" page in the cell output. IPython treats a
# string argument that starts with "http" as a URL to fetch, not as literal markup.
page_url = 'https://www.amazon.ca/gp/new-releases/books'
HTML(page_url)

0,1,2,3,4,5,6
Get to Know UsCareersAmazon and Our PlanetInvestor RelationsPress Releases,,Make Money with UsSell on AmazonAmazon AssociatesAdvertise Your ProductsIndependently Publish with Us,,Amazon Payment ProductsAmazon.ca Rewards Visa CardmembersShop with PointsCredit Card MarketplaceReload Your BalanceAmazon Currency ConverterGift CardsAmazon Cash,,Let Us Help YouShipping Rates & PoliciesAmazon PrimeReturns Are EasyManage your Content and DevicesCustomer Service

0,1,2,3,4,5,6
Amazon Music  Stream millions  of songs,,Amazon Web Services  Scalable Cloud  Computing Services,,Book Depository  Books With Free  Delivery Worldwide,,Goodreads  Book reviews  & recommendations
,,,,,,
"IMDb  Movies, TV  & Celebrities",,Shopbop  Designer  Fashion Brands,,Warehouse Deals  Open-Box  Discounts,,Whole Foods Market  We Believe in  Real Food
