# Python script to scrape Amazon search results for a given search term
Extracted fields: product name, price, rating, review count, and product URL.

## Import modules

In [1]:
import csv
from bs4 import BeautifulSoup
#chrome
from selenium import webdriver

## Start up the webdriver

In [17]:
driver= webdriver.Chrome()

In [19]:
# Set the driver's implicit wait to 30, then open the Amazon home page.
driver.implicitly_wait(30)
url = 'https://www.amazon.com'
driver.get(url)

## Conduct Product Search

In [20]:
# Conduct the product search.
# Automate only what is absolutely needed: the search term is embedded in the
# results URL, so build that URL directly from the search term.
def get_url(search_term):
    """Generate the Amazon search-results URL for a search term.

    Args:
        search_term: Plain-text search term, e.g. ``'necklace for women'``.

    Returns:
        The search URL with the term URL-encoded into the ``k`` parameter.
    """
    # Imported locally so this cell stays runnable on its own.
    from urllib.parse import quote_plus

    # NOTE(review): crid/sprefix/ref look like leftovers copied from one
    # browser session; they are kept unchanged here -- confirm whether needed.
    template = 'https://www.amazon.com/s?k={}&crid=1Q9SL57XPI391&sprefix=necklace+for+women%2Caps%2C65&ref=nb_sb_noss_1'

    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    return template.format(quote_plus(search_term))


In [25]:
url= get_url('necklace for woman')
print(url)

https://www.amazon.com/s?k=necklace+for+woman&crid=1Q9SL57XPI391&sprefix=necklace+for+women%2Caps%2C65&ref=nb_sb_noss_1


In [26]:
driver.get(url)

In [30]:
#Extract content from page
soup=BeautifulSoup(driver.page_source, 'html.parser')

In [32]:
results= soup.find_all('div',{'data-component-type':'s-search-result'})

In [33]:
len(results)

60

In [35]:
#prototype the record
item=results[0]
#extract record header
atag=item.h2.a
atag.text

'Sterling Silver Cubic Zirconia Halo Pendant Necklace (Round & Princess) '

In [36]:
description=atag.text.strip()
description

'Sterling Silver Cubic Zirconia Halo Pendant Necklace (Round & Princess)'

In [38]:
url ='https://www.amazon.com' + atag.get('href')

In [39]:
#get price by inspecting
price_parent= item.find('span', 'a-price')

In [40]:
price = price_parent.find('span', 'a-offscreen').text

'$20.80'

In [48]:
# star reviews
rating= item.i.text
rating

'4.4 out of 5 stars'

In [51]:
review_count = item.find('span',{'class':'a-size-base s-underline-text'}).text


## Generalize the Pattern

In [52]:

def extract_record(item):
    """Return (description, price, rating, review_count, url) for one result.

    ``item`` is a single search-result element. This prototype does no error
    handling: any missing element raises ``AttributeError``.
    """
    # Title link: its text is the description, its href the relative URL.
    link = item.h2.a
    description = link.text.strip()
    url = 'https://www.amazon.com' + link.get('href')

    # Price text sits in the 'a-offscreen' span nested under the 'a-price' span.
    price = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Star rating text and review count.
    rating = item.i.text
    review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text

    return (description, price, rating, review_count, url)



In [55]:
# Run the prototype extractor over every search result on the current page.
results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})
records = []
for item in results:
    records.append(extract_record(item))
    

## Error handling


In [56]:
def extract_record(item):
    """Extract one product record from a single search-result element.

    Args:
        item: Element for one search result (a ``div`` with
            ``data-component-type="s-search-result"``).

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of
        strings, or ``None`` when the result has no title link or no price.
        ``rating`` and ``review_count`` are each ``''`` when absent.
    """
    # Description and URL come from the title link. A result without one
    # cannot be recorded, so skip it instead of raising.
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        return None
    url = ('https://www.amazon.com' + href) if href else ''

    # Price: results without a price are skipped.
    try:
        price = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are looked up independently so that a missing
    # review count does not throw away a rating that was found.
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''
    try:
        review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)



In [57]:
# Collect a record for every search result, dropping results that the
# extractor rejected (it returns None for those).
results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})
records = []
for item in results:
    record = extract_record(item)
    if not record:
        continue
    records.append(record)

In [58]:
records[0]

('Sterling Silver Cubic Zirconia Halo Pendant Necklace (Round & Princess)',
 '$20.80',
 '4.4 out of 5 stars',
 '1,250',
 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02173901HT2FVKE9UH8A&url=%2FAmazon-Essentials-Sterling-Zirconia-Necklace%2Fdp%2FB00OZKEKAM%2Fref%3Dsr_1_1_sspa%3Fcrid%3D1Q9SL57XPI391%26keywords%3Dnecklace%2Bfor%2Bwomen%26qid%3D1648499243%26sprefix%3Dnecklace%2Bfor%2Bwomen%252Caps%252C65%26sr%3D8-1-spons%26psc%3D1&qualifier=1648499243&id=5787439452785647&widgetName=sp_atf')

In [59]:
# Show the price column (second field) of every collected record.
for row in records:
    price = row[1]
    print(price)

$20.80
$39.99
$79.99
$26.34
$48.00
$15.40
$70.00
$99.00
$12.95
$17.50
$115.00
$129.80
$36.00
$30.60
$27.00
$69.99
$18.50
$49.99
$99.99
$38.00
$37.00
$65.10
$250.00
$125.00
$7.98
$225.00
$25.30
$28.30
$105.11
$74.20
$20.80
$69.99
$12.99
$37.98
$17.60
$32.00
$16.60
$24.20
$44.20
$32.00
$65.00
$28.00
$134.90
$49.00
$38.00
$54.95
$35.00
$35.00
$279.00
$12.95
$28.30
$9.69
$14.99
$17.60
$26.99
$29.99
$23.40
$18.17
$18.60
$59.99


## Getting to Next Page

In [60]:
# The page number is a URL query parameter, so add it using string formatting.
# Modify the get_url function to leave a placeholder for the page number.

def get_url(search_term):
    """Generate a paged Amazon search URL template for a search term.

    Args:
        search_term: Plain-text search term, e.g. ``'necklace for women'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)``
        on it to obtain the URL of a specific results page.
    """
    # Imported locally so this cell stays runnable on its own.
    from urllib.parse import quote_plus

    # NOTE(review): crid/sprefix/ref look like leftovers copied from one
    # browser session; they are kept unchanged here -- confirm whether needed.
    template = 'https://www.amazon.com/s?k={}&crid=1Q9SL57XPI391&sprefix=necklace+for+women%2Caps%2C65&ref=nb_sb_noss_1'

    # Add the term query to the URL. quote_plus turns spaces into '+' and
    # escapes '&', '{', '}' etc., so the term can neither corrupt the query
    # string nor interfere with the later .format(page) call.
    url = template.format(quote_plus(search_term))

    # Add the page query placeholder.
    url += '&page={}'

    return url

# Put it all together and Scrape

In [66]:
import csv
from bs4 import BeautifulSoup
#chrome
from selenium import webdriver

def get_url(search_term):
    """Generate a paged Amazon search URL template for a search term.

    Args:
        search_term: Plain-text search term, e.g. ``'necklace for women'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)``
        on it to obtain the URL of a specific results page.
    """
    # Imported locally so the function does not depend on edits elsewhere.
    from urllib.parse import quote_plus

    # NOTE(review): crid/sprefix/ref look like leftovers copied from one
    # browser session; they are kept unchanged here -- confirm whether needed.
    template = 'https://www.amazon.com/s?k={}&crid=1Q9SL57XPI391&sprefix=necklace+for+women%2Caps%2C65&ref=nb_sb_noss_1'

    # Add the term query to the URL. quote_plus turns spaces into '+' and
    # escapes '&', '{', '}' etc., so the term can neither corrupt the query
    # string nor interfere with the later .format(page) call.
    url = template.format(quote_plus(search_term))

    # Add the page query placeholder ('page=<n>' selects the results page).
    url += '&page={}'

    return url

def extract_record(item):
    """Extract one product record from a single search-result element.

    Args:
        item: Element for one search result (a ``div`` with
            ``data-component-type="s-search-result"``).

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of
        strings, or ``None`` when the result has no title link or no price.
        ``rating`` and ``review_count`` are each ``''`` when absent.
    """
    # Description and URL come from the title link. A result without one
    # cannot be recorded, so skip it instead of aborting the whole scrape.
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        return None
    url = ('https://www.amazon.com' + href) if href else ''

    # Price: results without a price are skipped.
    try:
        price = item.find('span', 'a-price').find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are looked up independently so that a missing
    # review count does not throw away a rating that was found.
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''
    try:
        review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

def main(search_term, max_pages=20, output_path='results.csv'):
    """Scrape Amazon search results for a search term and save them as CSV.

    Args:
        search_term: Plain-text search term.
        max_pages: Number of result pages to visit, starting at page 1.
        output_path: Path of the CSV file to write (overwritten if present).
    """
    # The list must be named `records`: it is appended to inside the loop
    # and written out at the end.
    records = []
    url = get_url(search_term)

    # Start up the webdriver and always shut it down, even if a page fails.
    driver = webdriver.Chrome()
    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                # extract_record returns None for results it cannot use.
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()

    # Save data to CSV.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

In [67]:
main('necklace for women')