# Amazon Web Scraper

## Requirements
- Selenium
- BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import csv
import requests
import pandas as pd

In [2]:
from selenium import webdriver

## Start up the webdriver

In [3]:
# Launch a Chrome browser session driven by Selenium; later cells reuse `driver`.
driver = webdriver.Chrome()

In [4]:
# Open the Amazon home page in the Selenium-controlled browser.
url='http://www.amazon.com'
driver.get(url)

In [5]:
def get_url(search_term):
    '''Build the Amazon search-results URL for a search term.

    Every space in ``search_term`` becomes ``+``; no other escaping is
    applied. The result is the fixed search template with the term
    substituted into its ``k=`` query parameter.
    '''
    query = '+'.join(search_term.split(' '))
    base = 'https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix=ultrawide+monitor%2Caps%2C434&ref=nb_sb_noss_2'
    return base.format(query)

In [6]:
# Build and display the search URL for a sample query.
# NOTE(review): 'moniter' looks like a misspelling of 'monitor'; the term is
# placed in the URL exactly as typed.
url=get_url('ultrawide moniter')
print(url)

https://www.amazon.com/s?k=ultrawide+moniter&crid=2EH2C75C82P0M&sprefix=ultrawide+monitor%2Caps%2C434&ref=nb_sb_noss_2


In [7]:
# Navigate the browser to the search URL built above.
driver.get(url)

# Extract the collection

In [8]:
# Parse the HTML currently loaded in the browser with Python's built-in HTML parser.
soup=BeautifulSoup(driver.page_source,'html.parser')

In [9]:
# Collect every <div> whose data-component-type attribute is 's-search-result';
# each one is the container for a single search result.
results=soup.find_all('div',{'data-component-type':'s-search-result'})

In [10]:
# Only 16 of these are regular product listings; the count comes out as 22
# because advertising entries are matched as well.
len(results)

22

# Prototype the record

In [12]:
# Take the first result as a sample to prototype the extraction against,
# and display its markup.
item=results[0]
item

<div class="s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 AdHolder sg-col s-widget-spacing-small sg-col-12-of-16" data-asin="B097NJ43WW" data-cel-widget="search_result_2" data-component-id="45" data-component-type="s-search-result" data-index="2" data-uuid="d4b6bc30-15b0-499b-9186-4632d3e983a0"><div class="sg-col-inner">
<div cel_widget_id="MAIN-SEARCH_RESULTS-2" class="s-widget-container s-spacing-small s-widget-container-height-small celwidget slot=MAIN template=SEARCH_RESULTS widgetId=search-results_1" data-cel-widget="MAIN-SEARCH_RESULTS-2" data-csa-c-id="uetoon-u60eni-1y6okz-y12bkx">
<div class="rush-component" data-component-id="46" data-component-props='{"percentageShownToFire":"50","batchable":true,"requiredElementSelector":".s-image:visible","url":"https://unagi-na.amazon.com/1/events/com.amazon.eel.SponsoredProductsEventTracking.prod?qualifier=1649049271&amp;id=4629323023341950&amp;widgetName=sp_atf&amp;adId=200087116504321&amp;eventType=1&amp;adIndex=0"}' data-componen

In [27]:
# The title link: the first <a> inside the result's first <h2>.
atag=item.h2.a

In [29]:
# The link text, stripped of surrounding whitespace, serves as the product description.
description=atag.text.strip()

In [31]:
# The link's href is appended to the site root to form the full product URL.
url='http://www.amazon.com' + atag.get('href')

In [33]:
# First <span> with CSS class 'a-price'; None when the result has no such span.
price_parent=item.find('span','a-price')

In [36]:
# The price text is read from the nested <span class="a-offscreen">.
price=price_parent.find('span','a-offscreen').text

In [39]:
# The rating is taken as the text of the first <i> tag inside the result.
rating=item.i.text

# Generalize the pattern

In [46]:
def extract_record(item):
    '''Pull the description, price, rating and URL out of one search result.

    Returns a ``(description, price, rating, url)`` tuple. Nothing is
    guarded here: a result that lacks any of the expected tags raises
    AttributeError.
    '''
    # Title link: its text is the description, its href completes the URL.
    link = item.h2.a
    title = link.text.strip()
    product_url = 'http://www.amazon.com' + link.get('href')

    # Price text sits in the 'a-offscreen' span nested in the 'a-price' span.
    price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Rating text is whatever the first <i> tag contains.
    stars = item.i.text

    return title, price_text, stars, product_url

In [47]:
# Gather the result containers again and build one record per container.
records = []
results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

In [48]:
# This raises an error because some products have no price data.

# Next, write a version that also copes with results where such data is missing.

# Error Handling

In [49]:
def extract_record(item):
    '''Pull the description, price, rating and URL out of one search result.

    Returns a ``(description, price, rating, url)`` tuple, or ``None`` when
    the price lookup fails. A failed rating lookup yields an empty string
    for the rating instead.
    '''
    # Title link: its text is the description, its href completes the URL.
    link = item.h2.a
    title = link.text.strip()
    product_url = 'http://www.amazon.com' + link.get('href')

    # A result without a price is dropped entirely.
    try:
        price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # A result without a rating is kept, with a blank rating.
    try:
        stars = item.i.text
    except AttributeError:
        stars = ""

    return (title, price_text, stars, product_url)

In [51]:
# Rebuild the record list, keeping only the results that produced a record.
records = []
results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})

for item in results:
    record = extract_record(item)
    if not record:
        continue  # nothing was extracted for this result
    records.append(record)

In [52]:
# Inspect the first extracted record: (description, price, rating, url).
records[0]

('ViewSonic VG3456 34 Inch 21:9 UltraWide WQHD 1440p Monitor with Ergonomics Design USB Type C Docking Built-in Gigabit Ethernet for Home and Office',
 '$599.99',
 '4.2 out of 5 stars',
 'http://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A091041722USLSNU4D44A&url=%2FViewSonic-VG3456-Inch-Ergonomics-Built%2Fdp%2FB0912L63P2%2Fref%3Dsr_1_1_sspa%3Fcrid%3D2EH2C75C82P0M%26keywords%3Dultrawide%2Bmonitor%26qid%3D1643943322%26sprefix%3Dultrawide%2Bmonitor%252Caps%252C434%26sr%3D8-1-spons%26psc%3D1&qualifier=1643943322&id=1458293824992456&widgetName=sp_atf')

In [53]:
# Print the price (index 1 of each record tuple) for every extracted record.
for row in records:
    print(row[1])

$599.99
$449.99
$429.97
$1,199.99
$246.99
$529.98
$349.99
$499.99
$479.97
$349.99
$459.99
$449.99
$799.99
$459.99
$359.95
$459.99
$209.99
$549.99
$299.99


# Getting the next page

In [54]:
def get_url(search_term):
    '''Generate a paginated search url template from search term.

    Args:
        search_term: text to search for; spaces are replaced by ``+``.

    Returns:
        The search URL ending in ``&page={}``. Call ``.format(page)`` on it
        to obtain the URL of a specific results page.
    '''
    template='https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix=ultrawide+monitor%2Caps%2C434&ref=nb_sb_noss_2'
    # Literal braces in the term are doubled so that the caller's later
    # .format(page) call restores them instead of treating them as fields.
    search_term = search_term.replace(' ', '+').replace('{', '{{').replace('}', '}}')

    # add term query to url
    url = template.format(search_term)

    # add page query placeholder (filled in once per page by the caller)
    url += '&page={}'
    return url

## Putting it all together

In [56]:
from bs4 import BeautifulSoup
import csv

def get_url(search_term):
    '''Generate a paginated search url template from search term.

    Args:
        search_term: text to search for; spaces are replaced by ``+``.

    Returns:
        The search URL ending in ``&page={}``. Call ``.format(page)`` on it
        to obtain the URL of a specific results page.
    '''
    template='https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix=ultrawide+monitor%2Caps%2C434&ref=nb_sb_noss_2'
    # Literal braces in the term are doubled so that the caller's later
    # .format(page) call restores them instead of treating them as fields.
    search_term = search_term.replace(' ', '+').replace('{', '{{').replace('}', '}}')

    # add term query to url
    url = template.format(search_term)

    # add page query placeholder (filled in once per page by the caller)
    url += '&page={}'
    return url

def extract_record(item):
    '''Extract and return data from a single record.

    Args:
        item: one search-result element, as yielded by ``soup.find_all``.

    Returns:
        A ``(description, price, rating, url)`` tuple, or ``None`` when the
        result has no title link or no price. A missing rating is stored as
        an empty string.
    '''
    # description and url
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        # No <h2>/<a> title link: skip this result rather than abort the
        # whole multi-page scrape.
        return None
    if not href:
        # A link without a target cannot produce a usable product url.
        return None
    url = 'http://www.amazon.com' + href

    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        # Results without a price are not recorded.
        return None

    try:
        # rating
        rating = item.i.text
    except AttributeError:
        rating = ""

    return (description, price, rating, url)

def main(search_term, pages=20, output_file='results.csv'):
    '''run main program routine

    Opens a Chrome webdriver, loads ``url.format(page)`` for each page number
    from 1 to ``pages``, extracts a record from every search result found and
    writes all records to a CSV file.

    Args:
        search_term: text to search for; passed to ``get_url``.
        pages: number of result pages to request (default 20).
        output_file: path of the CSV file to write (default 'results.csv').

    Note:
        Pagination only takes effect if ``get_url`` returns a URL containing
        a ``{}`` placeholder for the page number.
    '''
    records = []
    url = get_url(search_term)

    # startup the webdriver
    driver = webdriver.Chrome()
    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # Always end the browser session, even when a page load or parse
        # fails part-way through, so no browser process is left behind.
        driver.quit()

    # save data to csv file
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'price', 'rating', 'url'])
        writer.writerows(records)


In [57]:
# Run the full scrape for the sample query.
main('ultrawide monitor')