# Amazon Web Scraper

## Requirements
- Selenium
- BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import csv
import requests
import pandas as pd

In [2]:
# pip install selenium

In [4]:
from selenium import webdriver

## Start up the webdriver

In [6]:
# Launch a Chrome browser session controlled by Selenium; the cells below
# reuse this `driver` to load pages and read their source.
driver = webdriver.Chrome()

In [7]:
# Open the Amazon home page in the controlled browser.
# `url` is reassigned to a search url in a later cell.
url='http://www.amazon.com'
driver.get(url)

In [8]:
def get_url(search_term):
    '''Generate an Amazon search-results url from a search term.

    Args:
        search_term: Free-text query, e.g. 'honor mobile'.

    Returns:
        The search url with the URL-encoded term substituted into both the
        `k` and the `sprefix` query parameters.
    '''
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # NOTE(review): the literal 'r' after the second placeholder ends up
    # appended to the term inside `sprefix` -- looks unintended; confirm.
    template = 'https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix={}r%2Caps%2C434&ref=nb_sb_noss_2'

    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&' or '#' that would otherwise corrupt the query.
    encoded_term = quote_plus(search_term)
    return template.format(encoded_term, encoded_term)

In [10]:
# Build the search url for a sample query and print it to check the result.
url=get_url('honor mobile')
print(url)

https://www.amazon.com/s?k=honor+mobile&crid=2EH2C75C82P0M&sprefix=honor+mobiler%2Caps%2C434&ref=nb_sb_noss_2


In [11]:
# Load the search-results page in the browser.
driver.get(url)

# Extract the collection

In [12]:
# Parse the page source currently held by the browser, using the
# 'html.parser' backend.
soup=BeautifulSoup(driver.page_source,'html.parser')

In [12]:
# Collect every <div> whose data-component-type attribute is 's-search-result'.
results=soup.find_all('div',{'data-component-type':'s-search-result'})

In [13]:
# The author expected only 16 products, but 22 results are returned; the
# author attributes the extra entries to sponsored (advertising) results.
len(results)

22

# Prototype the record

In [14]:
# Take the first search result as a sample to prototype the extraction on.
item=results[0]

In [15]:
# The <a> tag inside the result's <h2>; its text and href are read next.
atag=item.h2.a

In [16]:
# Link text with surrounding whitespace removed, used as the description.
description=atag.text.strip()

In [17]:
# Prefix the link's href with the site root to form the product url.
url='http://www.amazon.com' + atag.get('href')

In [18]:
# First <span> with CSS class 'a-price' (BeautifulSoup treats a string second
# argument as a class filter); find returns None when there is no match.
price_parent=item.find('span','a-price')

In [19]:
# The nested 'a-offscreen' span holds the price text, e.g. '$189.99'.
price=price_parent.find('span','a-offscreen').text

In [20]:
# Text of the first <i> tag in the result, e.g. '4.3 out of 5 stars'.
rating=item.i.text

# Generalize the pattern

In [21]:
def extract_record(item):
    '''Build a (description, price, rating, url) tuple from one search result.

    No missing-data handling is done here: if the result lacks the title
    link, the price spans or the <i> tag, the attribute access on None
    raises AttributeError.
    '''
    # Title link: its text is the description, its href the product path.
    link = item.h2.a
    title = link.text.strip()
    href = link.get('href')
    product_url = 'http://www.amazon.com' + href

    # Price text sits in the 'a-offscreen' span nested inside 'a-price'.
    price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Rating text comes from the first <i> tag of the result.
    stars = item.i.text

    return title, price_text, stars, product_url

In [22]:
# First attempt: run the extractor over every search result.
# As the error shown below this cell indicates, the loop stops with
# AttributeError when a lookup inside extract_record returns None.
records =[]
results=soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

In [23]:
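# The loop above raises AttributeError because some products have no price
# data, so item.find('span','a-price') returns None.

# The next version of extract_record handles missing data as well: results
# without a price are skipped, and a missing rating becomes an empty string.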

# Error Handling

In [24]:
def extract_record(item):
    '''Extract and return data from a single record.

    Args:
        item: One parsed search-result element (a BeautifulSoup tag in this
            notebook).

    Returns:
        A (description, price, rating, url) tuple, or None when the result
        has no title link or no price, so that callers can skip it.
        `rating` is an empty string when the result has no <i> tag.
    '''
    # description and url: a result without an <h2><a href=...> cannot be
    # linked to, so skip it instead of raising on the None attribute access.
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        return None
    description = atag.text.strip()
    url = 'http://www.amazon.com' + href

    # price: some results carry no price; those are skipped.
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating: optional, falls back to an empty string.
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ""

    return (description, price, rating, url)

In [25]:
# Re-collect the result containers, then keep only the records that
# extract_record could build (falsy results are dropped).
results = soup.find_all('div', {'data-component-type': 's-search-result'})
records = [rec for rec in map(extract_record, results) if rec]

In [26]:
# Display the first extracted record as a sanity check.
records[0]

('TCL 20 SE 6.82" Unlocked Cellphone, 4GB RAM + 128GB ROM, US Version Android 11 Smartphone with 48MP Rear AI Quad-Camera, 5000mAh Big Battery, Dual Speaker, OTG Reverse Charging Octa-Core, Aurora Green',
 '$189.99',
 '4.3 out of 5 stars',
 'http://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A088291415ZHLA05YGTI&url=%2FTCL-Cellphone-Smartphone-Quad-Camera-Octa-Core%2Fdp%2FB0949744D3%2Fref%3Dsr_1_1_sspa%3Fcrid%3D2EH2C75C82P0M%26keywords%3Dhonor%2Bmobile%26qid%3D1643986247%26sprefix%3Dhonor%2Bmobiler%252Caps%252C434%26sr%3D8-1-spons%26psc%3D1%26smid%3DA30Q8ROZNHHS8N&qualifier=1643986247&id=2426495670451488&widgetName=sp_atf')

In [27]:
# Show only the price field of every (description, price, rating, url) record.
for _description, price, _rating, _url in records:
    print(price)

$189.99
$499.99
$189.99
$249.99
$57.85
$179.99
$159.99
$119.99
$69.99
$71.99
$228.88
$159.99
$289.00
$199.99
$77.77
$159.99
$169.99


# Getting the next page

In [28]:
def get_url(search_term):
    '''Generate a paginated search url from a search term.

    Args:
        search_term: Free-text query, e.g. 'honor mobile'.

    Returns:
        A url string ending in '&page={}'. Call `.format(page_number)` on it
        to obtain the url of a specific results page.
    '''
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # The crid and sprefix values are fixed literals; only `k` varies.
    template = 'https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix=ultrawide+monitor%2Caps%2C434&ref=nb_sb_noss_2'

    # add term query to url; quote_plus maps spaces to '+' and percent-encodes
    # reserved characters (including '{' and '}', which keeps the later
    # .format(page) call safe).
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'
    return url

## Putting it all together

In [45]:
from bs4 import BeautifulSoup
import csv

def get_url(search_term):
    '''Generate a paginated search url from a search term.

    Args:
        search_term: Free-text query, e.g. 'honor mobile'.

    Returns:
        A url string ending in '&page={}'. Call `.format(page_number)` on it
        to obtain the url of a specific results page.
    '''
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    # The crid and sprefix values are fixed literals; only `k` varies.
    template = 'https://www.amazon.com/s?k={}&crid=2EH2C75C82P0M&sprefix=honor+mobiler%2Caps%2C434&ref=nb_sb_noss_2'

    # add term query to url; quote_plus maps spaces to '+' and percent-encodes
    # reserved characters (including '{' and '}', which keeps the later
    # .format(page) call safe).
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'
    return url

def extract_record(item):
    '''Extract and return data from a single record.

    Args:
        item: One parsed search-result element (a BeautifulSoup tag in this
            notebook).

    Returns:
        A (description, price, rating, url) tuple, or None when the result
        has no title link or no price, so that callers can skip it.
        `rating` is an empty string when the result has no <i> tag.
    '''
    # description and url: a result without an <h2><a href=...> cannot be
    # linked to, so skip it instead of raising on the None attribute access.
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        return None
    description = atag.text.strip()
    url = 'http://www.amazon.com' + href

    # price: some results carry no price; those are skipped.
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating: optional, falls back to an empty string.
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ""

    return (description, price, rating, url)

def main(search_term, max_pages=1, output_path='output.csv'):
    '''Run the main program routine: scrape search results and save them as csv.

    Args:
        search_term: Query handed to get_url.
        max_pages: Number of result pages to request, starting at page 1.
            Defaults to 1, which matches the previous hard-coded behaviour.
            Page N is requested via `url.format(N)`, so get_url must return
            a url containing a '{}' page placeholder for values above 1 to
            reach distinct pages.
        output_path: Destination csv file. Defaults to 'output.csv'.
    '''
    # startup the webdriver
    driver = webdriver.Chrome()

    records = []
    try:
        url = get_url(search_term)

        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                # extract_record returns a falsy value for results to skip
                if record:
                    records.append(record)
    finally:
        # Always end the browser session, even when scraping fails;
        # quit() shuts down the whole driver rather than just one window.
        driver.quit()

    # save data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['description', 'price', 'rating', 'url'])
        writer.writerows(records)


In [46]:
# Run the full scrape for the sample query; main writes the rows to output.csv.
main('honor mobile')

In [39]:
import csv

In [48]:
# NOTE(review): scratch arithmetic; its purpose is not stated in the notebook
# -- confirm whether this cell is still needed.
767/20

38.35