# Scraper to Monitor Amazon Prices

In [1]:
import csv
from datetime import datetime
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge, EdgeOptions

In [2]:
def _parse_review_count(text):
    """Convert a review-count label such as '1,234' to an int.

    Returns 0 when the text is not a number.
    """
    try:
        # counts are rendered with thousands separators, which int() rejects
        return int(text.replace(',', '').strip())
    except ValueError:
        # sometimes this picks up coupons... this will filter that out.
        return 0


def scrape_page(raw_html):
    """Scrape the product results and the next-page link from one search page.

    Args:
        raw_html: HTML source of an Amazon search results page.

    Returns:
        A tuple ``(results, next_page)``.

        ``results`` is a list of dicts with the keys ``item_desc``,
        ``rating``, ``review_count``, ``item_price``, ``item_link`` and
        ``img_url``. ``review_count`` is an int (0 when the matched text is
        not a number) or ``''`` when the element is missing. ``rating``,
        ``item_link`` and ``img_url`` are ``''`` when missing. Rows without a
        title, without a parseable price, or whose rating text mentions
        "sponsored" are skipped.

        ``next_page`` is the absolute URL of the next results page, or
        ``None`` when no such link is found.
    """
    base_url = 'https://www.amazon.com'
    results = []
    soup = BeautifulSoup(raw_html, 'lxml')

    # get the next navigation page
    try:
        next_page = base_url + soup.find('li', 'a-last').a.get('href')
    except (AttributeError, TypeError):
        # AttributeError: no pagination item, or no anchor inside it
        # TypeError: the anchor has no href attribute
        next_page = None

    for row in soup.find_all('div', 's-result-item'):
        # skip rows that have no title element
        try:
            item_desc = row.find('span', 'a-size-medium').text.strip()
        except AttributeError:
            continue

        try:
            rating = row.find('span', 'a-icon-alt').text
        except AttributeError:
            rating = ''
        # filter out sponsored ads
        if 'sponsored' in rating.lower():
            continue

        review_tag = row.find('span', {'class': 'a-size-base', 'dir': 'auto'})
        if review_tag is None:
            review_count = ''
        else:
            review_count = _parse_review_count(review_tag.text)

        # a row without a usable price is skipped
        try:
            price = row.find('span', 'a-price').span.text
            item_price = float(price.replace('$', '').replace(',', ''))
        except (AttributeError, ValueError):
            continue

        # a missing link or image must not abort the whole scrape
        link_tag = row.find('a', 'a-link-normal')
        href = link_tag.get('href') if link_tag is not None else None
        item_link = (base_url + href) if href is not None else ''

        img_tag = row.img
        img_url = img_tag.get('src', '') if img_tag is not None else ''

        results.append({
            'item_desc': item_desc,
            'rating': rating,
            'review_count': review_count,
            'item_price': item_price,
            'item_link': item_link,
            'img_url': img_url,
        })

    return results, next_page

In [3]:
# Launch Microsoft Edge (Chromium)
options = EdgeOptions()
options.use_chromium = True
options.add_argument('-headless')
options.add_argument('-disable-gpu')
driver = Edge(options = options)

# save search results
search_results = []

# start at the first results page; scrape_page hands back the URL of the
# following page, or None once there is no next-page link
next_page = 'https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_noss_1'

try:
    while next_page:
        driver.get(next_page)
        results, next_page = scrape_page(driver.page_source)
        search_results.extend(results)
finally:
    # quit() ends the whole browser session, not just the current window, and
    # running it in finally releases the headless browser even when a page
    # fails; results gathered so far stay in search_results
    driver.quit()

In [4]:
# save_results
# One CSV per day; the column order follows the keys of the first result.
filename = 'search_results_' + datetime.today().strftime('%Y%m%d') + '.csv'

# An empty scrape would otherwise fail on search_results[0] with IndexError.
field_names = list(search_results[0].keys()) if search_results else []

if search_results:
    with open(filename, mode='w', newline='', encoding='utf-8') as f:
        dictwriter = csv.DictWriter(f, fieldnames=field_names)
        dictwriter.writeheader()
        dictwriter.writerows(search_results)
else:
    print('No search results were scraped; nothing written to ' + filename)

In [8]:
# Show every field of the first scraped result, one "name -- value" per line.
first_result = search_results[0]
for field in first_result:
    print(field, '--', first_result[field])

item_desc -- AOC CQ34G2 Super Curved Frameless Gaming Monitor, UltraWide FHD 2560x1080, 1500R VA Panel, 1ms MPRT, 75Hz, FreeSync, Height Adjustable, 3-Yr Zero Dead Pixels
rating -- 4.6 out of 5 stars
review_count -- 0
item_price -- 309.99
item_link -- https://www.amazon.com/AOC-CQ34G2-Frameless-UltraWide-Adjustable/dp/B08632C4MS/ref=sr_1_1?dchild=1&keywords=ultrawide+monitor&qid=1598580150&sr=8-1
img_url -- https://m.media-amazon.com/images/I/41+CXFgQwjL._AC_UY218_.jpg
