Amazon web scraper

In [1]:
import csv
from bs4 import BeautifulSoup

In [2]:
# firefox and Chrome
from selenium import webdriver

 Startup the webdriver

In [3]:
pip install webdriver-manager




In [4]:
# Start a Chrome session. webdriver-manager resolves a chromedriver binary
# for the installed Chrome (downloading it or reusing its cache) and
# returns the path to it.
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Selenium 4 deprecated passing the driver path as a positional argument;
# the path is handed over through a Service object instead.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\bikem\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [5]:
# Using webdriver we'll now open the Amazon website in chrome
url = 'https://www.amazon.in'

# We'll use the get method of driver and pass in the URL
driver.get(url)

In [8]:
def get_url(search_term):
    '''
    Build the Amazon India search URL for a search term.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. 'nike shoes'.

    Returns
    -------
    str
        The search-results URL with the encoded term in the ``k`` parameter.
    '''
    # Imported locally so the function keeps working when copied on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=UOAX8JTJZ8XJ&ref=nb_sb_noss_1'
    # quote_plus turns every space into '+' (the pattern used in the URL) and
    # additionally percent-encodes characters such as '&' or '#' that would
    # otherwise be read as URL structure instead of as part of the term.
    return template.format(quote_plus(search_term))

In [9]:
# Checking whether the function is working properly or not
url = get_url('nike shoes')
print(url)

https://www.amazon.in/s?k=nike+shoes&crid=UOAX8JTJZ8XJ&ref=nb_sb_noss_1


In [10]:
driver.get(url)

**Extract the collection**

In [11]:
#taking the page source and trying to extract from html
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [12]:
# Each product on the results page is wrapped in a <div> whose
# data-component-type attribute is 's-search-result'. That block holds the
# name, price, rating, etc. of one product (the shoes in this case),
# so we collect every such block.
results = soup.find_all('div', {'data-component-type': 's-search-result'})

In [13]:
len(results)

60

In [14]:
# prototype the results
item = results[0]

In [16]:
item.find('div', 'a-section a-spacing-medium a-text-center').text

'Amazon\'s Choicefor "nike shoes"  +7 colors/patternsNikeMen\'s Flex Experience Rn 10 Sport Performance  4.1 out of 5 stars 155  ₹3,240₹3,240 ₹4,995₹4,995  (35% off) FREE Delivery by Amazon'

In [17]:
''' while we try and extract the first most obvious thing to select would be the name of the product
    in order to select that we see that the name component was under the h2 tag and under a, hence 
    we extract that and assign to atag'''
atag = item.h2.a

In [18]:
atag.text.strip()

"Men's Flex Experience Rn 10 Sport Performance"

In [19]:
# we select that text from the atag and strip to select only the name text component
description = atag.text.strip()

In [20]:
# we need the exact url point of this component to be extracted properly
atag.get('href')

'/Nike-Experience-Crimson-Lt-Running-CI9960-008/dp/B096CJXNJ6/ref=sr_1_1?crid=UOAX8JTJZ8XJ&keywords=nike+shoes&qid=1642171385&sr=8-1'

In [21]:
# additionally we need this to be prefixed with the https amazon tag
url = 'https://www.amazon.in' + atag.get('href')

In [22]:
# now we need to extract the price of the product
item.find('span', 'a-price-whole').text

'3,240'

In [23]:
# now we need to extract the price of the product
price = item.find('span', 'a-price-whole').text

In [24]:
item.i.text

'4.1 out of 5 stars'

In [25]:
rating = item.i.text

In [26]:
# to extract the number of reviews given to the product
item.find('span', 'a-size-base').text

'155'

In [27]:
rating_count = item.find('span', 'a-size-base').text

## Generalize the pattern now

In [28]:
def extract_records(item):
    '''Pull the fields of one search result out of its markup.

    Returns the tuple (description, price, rating, rating_count, url).
    Nothing is guarded here: when one of the looked-up elements is absent
    the lookup yields None and reading ``.text`` raises AttributeError.
    '''
    # the product name is the link nested inside the result's <h2>
    link = item.h2.a
    title = link.text.strip()
    product_url = "https://www.amazon.in" + link.get("href")

    # price text taken from the 'a-price-whole' span
    cost = item.find('span', 'a-price-whole').text

    # rating text is read from the first <i> element,
    # the review count from the first 'a-size-base' span
    stars = item.i.text
    review_total = item.find('span', 'a-size-base').text

    return (title, cost, stars, review_total, product_url)

In [29]:
# now we try to apply the above to the url and try to extract
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    records.append(extract_records(item))

AttributeError: 'NoneType' object has no attribute 'text'

**We encounter an `AttributeError` because not every search result on the page contains all of the elements we look up; some products have no price or no rating, for example. For such a result the lookup returns `None`, and reading `.text` from `None` raises the error. We need to handle this exception in the code.**

## Error Handling

In [34]:
def extract_records(item):
    '''Extract and return data from a single search result.

    Parameters
    ----------
    item
        The parsed markup of one search result (an element returned by
        ``soup.find_all``).

    Returns
    -------
    tuple
        (description, price, rating, rating_count, url). ``price``,
        ``rating`` and ``rating_count`` are '' when their element is absent.

    Raises
    ------
    AttributeError
        If the result has no ``<h2>`` containing a link; the title is
        treated as mandatory and is not guarded.
    '''
    def text_or_blank(tag):
        # lookups yield None when nothing matched; map that to ''
        return tag.text if tag is not None else ''

    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = "https://www.amazon.in" + atag.get("href")

    # price: some results carry no price element
    price = text_or_blank(item.find('span', 'a-price-whole'))

    # rating: read from the first <i> element of the result
    rating_tag = item.i
    rating = text_or_blank(rating_tag)

    # review count: only looked up when a rating exists, so an unrated result
    # still reports '' for both fields; a missing count no longer discards
    # a rating that was found.
    if rating_tag is not None:
        rating_count = text_or_blank(item.find('span', 'a-size-base'))
    else:
        rating_count = ''

    return (description, price, rating, rating_count, url)

In [35]:
# additionally we need to check whether the results give any empty records
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
   records.append(extract_records(item))

In [36]:
records[0]

("Men's Flex Experience Rn 10 Sport Performance",
 '3,240',
 '4.1 out of 5 stars',
 '155',
 'https://www.amazon.in/Nike-Experience-Crimson-Lt-Running-CI9960-008/dp/B096CJXNJ6/ref=sr_1_1?crid=UOAX8JTJZ8XJ&keywords=nike+shoes&qid=1642171385&sr=8-1')

In [37]:
for row in records:
    print(row[1])

3,240

3,763
5,596
1,560
4,103
5,598
1,729
2,580
2,412
5,072
3,652
3,614
5,493
6,293
4,224
1,399
1,599
1,599
1,399
3,101

2,506
3,294
4,273
2,464
2,654
3,694
2,397
3,287
5,995
2,747
5,852
3,596
1,896
2,700
2,346
2,995
6,942
2,694
2,018
3,795
3,477
2,036
3,357
2,408
8,588
3,326
5,396
5,260
8,109
3,179
13,019
6,999
449
1,399
1,399
1,499
9,899
1,599


In [38]:
# Next step: fetch further result pages so that more products are collected.
# Rather than clicking the "Next" button, we append a page-number placeholder to the search URL.
def get_url(search_term):
    '''
    Build the Amazon India search URL template for a search term.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. 'nike shoes'.

    Returns
    -------
    str
        The search URL ending in ``&page={}``; call ``.format(page_number)``
        on it to obtain the URL of a specific results page.
    '''
    # Imported locally so the function keeps working when copied on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=UOAX8JTJZ8XJ&ref=nb_sb_noss_1'

    # quote_plus turns spaces into '+' and percent-encodes everything else
    # that is unsafe in a query value. This also covers '{' and '}', which
    # would otherwise collide with the page placeholder when the caller
    # runs .format(page) on the result.
    url = template.format(quote_plus(search_term))

    # add page query placeholder ('page=<n>'; the '=' is required for the
    # parameter to be recognised as key and value)
    url += '&page={}'

    return url

In [39]:
## Putting all the pieces of code together
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def get_url(search_term):
    '''
    Build the Amazon India search URL template for a search term.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. 'nike shoes'.

    Returns
    -------
    str
        The search URL ending in ``&page={}``; call ``.format(page_number)``
        on it to obtain the URL of a specific results page.
    '''
    # Imported locally so the function keeps working when copied on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=UOAX8JTJZ8XJ&ref=nb_sb_noss_1'

    # quote_plus turns spaces into '+' and percent-encodes everything else
    # that is unsafe in a query value. This also covers '{' and '}', which
    # would otherwise collide with the page placeholder when the caller
    # runs .format(page) on the result.
    url = template.format(quote_plus(search_term))

    # add page query placeholder ('page=<n>'; the '=' is required for the
    # parameter to be recognised as key and value)
    url += '&page={}'

    return url

def extract_records(item):
    '''Extract and return data from a single search result.

    Parameters
    ----------
    item
        The parsed markup of one search result (an element returned by
        ``soup.find_all``).

    Returns
    -------
    tuple
        (description, price, rating, rating_count, url). ``price``,
        ``rating`` and ``rating_count`` are '' when their element is absent.

    Raises
    ------
    AttributeError
        If the result has no ``<h2>`` containing a link; the title is
        treated as mandatory and is not guarded.
    '''
    def text_or_blank(tag):
        # lookups yield None when nothing matched; map that to ''
        return tag.text if tag is not None else ''

    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = "https://www.amazon.in" + atag.get("href")

    # price: some results carry no price element
    price = text_or_blank(item.find('span', 'a-price-whole'))

    # rating: read from the first <i> element of the result
    rating_tag = item.i
    rating = text_or_blank(rating_tag)

    # review count: only looked up when a rating exists, so an unrated result
    # still reports '' for both fields; a missing count no longer discards
    # a rating that was found.
    if rating_tag is not None:
        rating_count = text_or_blank(item.find('span', 'a-size-base'))
    else:
        rating_count = ''

    return (description, price, rating, rating_count, url)

def main(search_term, max_pages=5, output_path='results.csv'):
    """Scrape Amazon search results for ``search_term`` and save them as CSV.

    Parameters
    ----------
    search_term : str
        Query handed to ``get_url``.
    max_pages : int, optional
        Number of result pages to visit, starting at page 1 (default 5,
        the value that used to be hard-coded).
    output_path : str, optional
        Destination of the CSV file (default 'results.csv').

    The CSV gets one header row followed by one row per tuple returned
    by ``extract_records``.
    """
    # Selenium 4 deprecated passing the driver path positionally; it is
    # wrapped in a Service object instead. Imported locally so this
    # function does not depend on another cell's imports.
    from selenium.webdriver.chrome.service import Service

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # Local accumulator. The original initialised ``record`` but appended to
    # ``records``, which fails on a fresh kernel or silently extends a
    # leftover global list from an earlier cell.
    records = []
    url = get_url(search_term)

    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_records(item)
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole browser session, and runs even when a page
        # load or parse step raised.
        driver.quit()

    # save the data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # A list keeps the header in the same order as the record tuples;
        # the original passed a set, whose iteration order is arbitrary.
        writer.writerow(['Description', 'Price_in_INR', 'Reviews', 'Review_count', 'Url'])
        writer.writerows(records)

In [40]:
main('nike shoes')



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\bikem\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
