# Scraper to Monitor Amazon Prices

In [1]:
import csv
from datetime import datetime
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge, EdgeOptions

In [7]:
_AMAZON_BASE_URL = 'https://www.amazon.com'


def _find_next_page(soup):
    """Return the absolute URL of the next results page, or None if there is none."""
    last_item = soup.find('li', 'a-last')
    anchor = last_item.a if last_item is not None else None
    href = anchor.get('href') if anchor is not None else None
    # A missing 'a-last' item, anchor or href all mean there is nothing to follow.
    return _AMAZON_BASE_URL + href if href else None


def _parse_review_count(row):
    """Return the review count text, 0 if the text is not numeric, '' if absent."""
    tag = row.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    if tag is None:
        return ''
    # Drop thousands separators so counts such as "1,234" are not mistaken
    # for non-numeric text and zeroed out.
    count_text = tag.text.replace(',', '')
    try:
        # sometimes this picks up coupons... this will filter that out.
        int(count_text)
    except ValueError:
        return 0
    return count_text


def _parse_price(row):
    """Return the item price as a float, or None when it is missing or unparsable."""
    price_tag = row.find('span', 'a-price')
    amount_tag = price_tag.span if price_tag is not None else None
    if amount_tag is None:
        return None
    try:
        return float(amount_tag.text.replace('$', '').replace(',', ''))
    except ValueError:
        # Price text that is not a plain dollar amount cannot be converted.
        return None


def _parse_item(row):
    """Build the result dict for one search-result row, or None to skip the row."""
    desc_tag = row.find('span', 'a-size-medium')
    if desc_tag is None:
        return None
    item_desc = desc_tag.text.strip()

    rating_tag = row.find('span', 'a-icon-alt')
    rating = rating_tag.text if rating_tag is not None else ''
    # filter out sponsored ads
    if 'sponsored' in rating.lower():
        return None

    review_count = _parse_review_count(row)

    item_price = _parse_price(row)
    if item_price is None:
        return None

    # The link and image are optional: fall back to '' instead of letting a
    # single incomplete row abort the whole page.
    link_tag = row.find('a', 'a-link-normal')
    href = link_tag.get('href') if link_tag is not None else None
    item_link = _AMAZON_BASE_URL + href if href else ''

    img_tag = row.img
    img_url = img_tag.get('src', '') if img_tag is not None else ''

    return {
        'item_desc': item_desc,
        'rating': rating,
        'review_count': review_count,
        'item_price': item_price,
        'item_link': item_link,
        'img_url': img_url,
    }


def scrape_page(raw_html):
    """Scrape results from page.

    Args:
        raw_html: HTML source of one search-results page.

    Returns:
        A ``(results, next_page)`` tuple. ``results`` is a list of dicts with
        the keys ``item_desc``, ``rating``, ``review_count``, ``item_price``,
        ``item_link`` and ``img_url``. Rows without a description or a
        parsable price, and rows whose rating text contains "sponsored", are
        skipped. ``next_page`` is the absolute URL of the next results page,
        or ``None`` when no next-page link is found.
    """
    soup = BeautifulSoup(raw_html, 'lxml')

    # get the next navigation page
    next_page = _find_next_page(soup)

    results = []
    for row in soup.find_all('div', 's-result-item'):
        item = _parse_item(row)
        if item is not None:
            results.append(item)

    return results, next_page

def get_search_url(term):
    """Generate a url based on the search term.

    Args:
        term: Free-text search phrase, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with ``term`` as the ``k`` query parameter.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and also percent-encodes
    # characters such as '&', '#' or '=' that would otherwise corrupt the
    # query string.
    adjusted_term = quote_plus(term)
    base = "https://www.amazon.com/s?k={}&ref=nb_sb_noss_2"
    return base.format(adjusted_term)

In [4]:
# Launch Microsoft Edge (Chromium)
options = EdgeOptions()
options.use_chromium = True
options.add_argument('-headless')
options.add_argument('-disable-gpu')
driver = Edge(options=options)

# save search results
search_results = []
search_term = 'ultrawide monitor'
url = get_search_url(search_term)

# Walk the result pages, starting from the search URL and following each
# page's "next" link until scrape_page reports there is none.
try:
    next_page = url
    while next_page:
        driver.get(next_page)
        results, next_page = scrape_page(driver.page_source)
        search_results.extend(results)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window), and the finally clause makes sure the browser is
    # released even when a page load or parse fails part-way through.
    driver.quit()

In [5]:
# save_results
# One CSV per day; re-running on the same date overwrites that day's file.
filename = 'search_results_' + datetime.today().strftime('%Y%m%d') + '.csv'

if search_results:
    # Every result dict has the same keys, so the first one supplies the header.
    field_names = list(search_results[0].keys())

    with open(filename, mode='w', newline='', encoding='utf-8') as f:
        dictwriter = csv.DictWriter(f, fieldnames=field_names)
        dictwriter.writeheader()
        dictwriter.writerows(search_results)
else:
    # An empty scrape used to fail here with IndexError on search_results[0].
    print('No search results to save; skipping', filename)

In [6]:
# Show every field of the first scraped result as a quick sanity check.
first_result = search_results[0]
for field in first_result:
    print(field, '--', first_result[field])

item_desc -- Deco Gear 35" Curved Ultrawide LED Gaming Monitor 21:9 Aspect Ratio, Crisp 2560 x 1080 Resolution, 16.7 Million Colors, 75 HZ Refresh Rate, 2000:1 Contrast Ratio, (HDMI, DVI, and DP Connections)
rating -- 4.6 out of 5 stars
review_count -- 103
item_price -- 319.99
item_link -- https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A08509659VO79UAH3ZGM&url=%2FDeco-Gear-Curved-Ultrawide-Monitor%2Fdp%2FB07NPT78C6%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1598713294%26sr%3D8-1-spons%26psc%3D1&qualifier=1598713294&id=6766989571022690&widgetName=sp_atf
img_url -- https://m.media-amazon.com/images/I/61gBigSCEWL._AC_UY218_.jpg
