### Crawler for Amazon Bestsellers of 2020
##### written by Jiyoung Kim

#### Import libraries required for scraping

In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

#### Scraping the Amazon Best Sellers of 2020 list from the website

In [3]:
# Number of list pages to scrape; the scraping loop requests pages 1..no_pages inclusive.
no_pages = 2

def _text_or(tag, default):
    """Return ``tag.text`` when the element was found, otherwise ``default``."""
    return tag.text if tag is not None else default


def get_data(pageNo):
    """Scrape one page of the Amazon 2020 best-selling books list.

    Parameters
    ----------
    pageNo
        Page number. It is rendered with ``str()`` into both the ``ref`` path
        segment and the ``pg`` query parameter of the request URL.

    Returns
    -------
    list of list of str
        One ``[title, author, rating, users_rated, price]`` row per product
        card found on the page. Fields that are missing from a card are
        replaced by sentinels: ``"unknown-product"`` for the title, ``'-1'``
        for the rating and ``'0'`` for the author, review count and price.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of silently
        returning an empty list parsed from an error page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    url = ('https://www.amazon.com/gp/bestsellers/2020/books/ref=zg_bsar_cal_ye'
           + str(pageNo) + '?ie=UTF8&pg=' + str(pageNo))

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('div', attrs={'class': 'a-section a-spacing-none aok-relative'}):
        # The title is taken from the alt text of the first image inside the
        # title span; both the span and the image may be absent.
        name = d.find('span', attrs={'class': 'zg-text-center-align'})
        covers = name.find_all('img', alt=True) if name is not None else []
        title = covers[0]['alt'] if covers else "unknown-product"

        author = d.find('a', attrs={'class': 'a-size-small a-link-child'})
        if author is None:
            # Fall back to the plain-span markup when there is no author link.
            author = d.find('span', attrs={'class': 'a-size-small a-color-base'})

        rating = d.find('span', attrs={'class': 'a-icon-alt'})
        users_rated = d.find('a', attrs={'class': 'a-size-small a-link-normal'})
        price = d.find('span', attrs={'class': 'p13n-sc-price'})

        alls.append([
            title,
            _text_or(author, '0'),
            _text_or(rating, '-1'),
            _text_or(users_rated, '0'),
            _text_or(price, '0'),
        ])
    return alls

#### Save the scraped result as a CSV file

In [4]:
def flatten(l):
    """Concatenate a list of lists into a single flat list."""
    return [item for sublist in l for item in sublist]


# One list of rows per scraped page, for pages 1..no_pages inclusive.
results = [get_data(page) for page in range(1, no_pages + 1)]

df2 = pd.DataFrame(
    flatten(results),
    columns=['Book Title', 'Author', 'Rating', 'Num_Customers_Rated', 'Price($)'],
)


In [5]:
# Write the raw scraped table to CSV as UTF-8, without the index column.
df2.to_csv('amazon_bestsellers_2020.csv', index=False, encoding='utf-8')

#### Open the CSV file

In [6]:
# Reload the table from the CSV file; pandas re-infers each column's dtype on read.
df2 = pd.read_csv("amazon_bestsellers_2020.csv")

In [7]:
# (rows, columns) of the reloaded table.
df2.shape

(100, 5)

In [8]:
# Preview up to the first 100 rows; the rendered output may elide the middle rows.
df2.head(100)

Unnamed: 0,Book Title,Author,Rating,Num_Customers_Rated,Price($)
0,A Promised Land,Barack Obama,4.9 out of 5 stars,34872,$27.00
1,Too Much and Never Enough: How My Family Creat...,Mary L. Trump Ph.D.,4.6 out of 5 stars,84017,$19.58
2,Where the Crawdads Sing,Delia Owens,4.8 out of 5 stars,126630,$4.55
3,My First Learn to Write Workbook: Practice for...,Crystal Radke,4.8 out of 5 stars,38406,$5.39
4,Midnight Sun,Stephenie Meyer,4.8 out of 5 stars,53509,$13.32
...,...,...,...,...,...
95,"Relationship Goals: How to Win at Dating, Marr...",Michael Todd,4.8 out of 5 stars,13777,$10.49
96,The Happy in a Hurry Cookbook: 100-Plus Fast a...,Steve Doocy,4.7 out of 5 stars,7929,$20.98
97,Learn to Read: A Magical Sight Words and Phoni...,Modern Kid Press,4.7 out of 5 stars,9459,$6.99
98,P is for Potty! (Sesame Street) (Lift-the-Flap),Naomi Kleinberg,4.7 out of 5 stars,16343,$5.68


#### Data Preprocessing 

In [9]:
# Add the best-seller rank as the first column, derived from the row index (index + 1).
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if 'Rank' not in df2.columns:
    df2.insert(0, 'Rank', df2.index + 1)

In [10]:
# Add a constant Year column (the string '2020') as the first column.
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if 'Year' not in df2.columns:
    df2.insert(0, 'Year', '2020')

In [11]:
# Keep only the leading number of values such as '4.9 out of 5 stars'.
# astype(str) plus the vectorised .str accessor replaces a per-row Python lambda
# and does not crash on non-string values (e.g. NaN, or floats when this cell
# is re-run after the numeric conversion).
df2['Rating'] = df2['Rating'].astype(str).str.split().str[0]

In [12]:
# Convert the Rating column from text to a numeric dtype.
df2['Rating'] = df2['Rating'].pipe(pd.to_numeric)

In [13]:
# Remove the dollar sign from the Price column values, then convert to float.
# regex=False is explicit because '$' is a regex end-of-string anchor and the
# default of `regex` differs between pandas versions; astype(str) keeps the
# cell working if the column is already numeric (e.g. on a re-run).
df2['Price($)'] = df2['Price($)'].astype(str).str.replace('$', '', regex=False)
df2['Price($)'] = df2['Price($)'].astype(float)

In [14]:
# Remove thousands separators from Num_Customers_Rated, then convert to a number.
# astype(str) is needed because read_csv may already have inferred a numeric
# dtype for this column (when no value contains a comma), in which case the
# .str accessor would raise AttributeError.
df2["Num_Customers_Rated"] = (
    df2["Num_Customers_Rated"].astype(str).str.replace(',', '', regex=False)
)
df2['Num_Customers_Rated'] = pd.to_numeric(df2['Num_Customers_Rated'])

In [15]:
# Inspect the first rows after cleaning.
df2.head()

Unnamed: 0,Year,Rank,Book Title,Author,Rating,Num_Customers_Rated,Price($)
0,2020,1,A Promised Land,Barack Obama,4.9,34872,27.0
1,2020,2,Too Much and Never Enough: How My Family Creat...,Mary L. Trump Ph.D.,4.6,84017,19.58
2,2020,3,Where the Crawdads Sing,Delia Owens,4.8,126630,4.55
3,2020,4,My First Learn to Write Workbook: Practice for...,Crystal Radke,4.8,38406,5.39
4,2020,5,Midnight Sun,Stephenie Meyer,4.8,53509,13.32


In [16]:
# Save the cleaned table as comma-separated values, without the index column.
# NOTE(review): this is the same path the raw scrape was written to and reloaded
# from earlier in the notebook, so the raw CSV is replaced by the cleaned one;
# re-running the preprocessing cells against this file will not start from raw
# data — confirm the overwrite is intended.
df2.to_csv("amazon_bestsellers_2020.csv", sep=",", index = False)