In [1]:
import re
import time

import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
# Chrome launch options: pass the `--headless` flag so no browser window is shown.
options = Options()
options.add_argument('--headless')

# Single shared WebDriver instance; the page-loading code in this notebook
# reads it as a module-level global.
driver = webdriver.Chrome(options = options)

In [3]:
# Amazon search-results URLs to crawl. The entries differ only in the second
# `rh` node id and the trailing `ref` index -- presumably one URL per product
# category (TODO confirm which category each node id denotes).
categories = [
    'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A193870011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_1',
    'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292110011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_3',
    'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A3011391011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_4',
    'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292115011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_5'
]

In [4]:
# Only the first category URL is crawled by the cells that follow.
first_url = categories[0]

In [5]:
# Navigate the Selenium-driven browser to the category page.
driver.get(first_url)

In [6]:
# Read the rendered <body> markup out of the live browser DOM.
# `find_element_by_css_selector` was removed in Selenium 4.3; the generic
# `find_element(By.CSS_SELECTOR, ...)` form is available in Selenium 3 and 4.
body_el = driver.find_element(By.CSS_SELECTOR, 'body')
html_str = body_el.get_attribute('innerHTML')

In [7]:
# Wrap the markup in a requests_html `HTML` object so it can be queried
# (links, CSS selectors) without going back to the browser.
html_obj = HTML(html=html_str)

In [8]:
# Keep only site-relative hrefs, i.e. those that begin with '/'.
new_links = list(filter(lambda href: href.startswith('/'), html_obj.links))
# Prefix each relative href with the Amazon origin to make it absolute.
page_links = [f'https://www.amazon.com{x}' for x in new_links]

In [9]:
def scrape_product_page(url, title_lookup='#productTitle',
                        price_lookup='#priceblock_ourprice'):
    """Load a product page in the shared browser and extract its title and price.

    Args:
        url: Address of the product page; opened with the module-level `driver`.
        title_lookup: CSS selector of the element holding the product title.
        price_lookup: CSS selector of the element holding the product price.

    Returns:
        A `(product_title, product_price)` tuple with the text of the first
        element matching each selector. A field is `None` when its selector
        matches nothing on the page.
    """
    driver.get(url)
    # Fixed pause before reading the DOM -- presumably to let client-side
    # rendering finish (TODO confirm; an explicit wait would be more reliable).
    time.sleep(1.5)
    # `find_element_by_css_selector` was removed in Selenium 4.3; this form
    # is available in Selenium 3 and 4.
    body_el = driver.find_element(By.CSS_SELECTOR, 'body')
    html_str = body_el.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    # `find(..., first=True)` yields None when nothing matches, so guard the
    # `.text` access instead of failing with an opaque AttributeError.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [10]:
# URL shapes that carry an Amazon product id, tried by
# `extract_product_id_from_url`. Dots in the host name are escaped so they
# match a literal '.' only. Every pattern expects a '/' right after the id.
regex_options = [
    r'https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]

In [11]:
def extract_product_id_from_url(url, patterns=None):
    """Return the product id embedded in `url`, or `None` if there is none.

    Args:
        url: URL string to inspect. Patterns are applied with `re.match`, so
            each one must match from the very start of the string.
        patterns: Optional iterable of regex strings (or compiled patterns)
            to try. Defaults to the module-level `regex_options`.

    Returns:
        The `product_id` named group of the last pattern that matches `url`,
        or `None` when no pattern matches.
    """
    if patterns is None:
        patterns = regex_options
    product_id = None
    for pattern in patterns:
        # `re.match` uses the module's pattern cache, so nothing is
        # recompiled on every call.
        match = re.match(pattern, url)
        if match is None:
            continue
        # A pattern that defines no `product_id` group cannot supply an id;
        # check for the group explicitly rather than swallowing the lookup
        # error with a bare `except`.
        if 'product_id' in match.re.groupindex:
            product_id = match['product_id']
    return product_id

In [12]:
# page_links = [x for x in page_links if extract_product_id_from_url(x) != None]

def clean_page_links(page_links=None):
    """Keep only the links that carry a product id, tagged with that id.

    Args:
        page_links: Iterable of URL strings. `None` (the default) is treated
            as an empty collection.

    Returns:
        A list of `{'url': ..., 'product_id': ...}` dicts in input order, one
        for each URL where `extract_product_id_from_url` returned an id.
    """
    # `None` stands in for the former mutable `[]` default, which is shared
    # between calls; it also lets an explicit `None` behave like "no links".
    if page_links is None:
        page_links = ()
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({'url': url, 'product_id': product_id})
    return final_page_links

# Reduce the collected links to those that carry a product id.
cleaned_links = clean_page_links(page_links)

In [18]:
# Number of absolute links collected from the category page.
len(page_links)

181

In [23]:
# Number of links for which a product id was found.
len(cleaned_links)

38

In [28]:
# Inspect the url / product_id records produced by `clean_page_links`.
cleaned_links

[{'url': 'https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-16#customerReviews',
  'product_id': 'B073SBZ8YH'},
 {'url': 'https://www.amazon.com/Thermaltake-Certified-Continuous-cooling-PS-SPD-0500NPCWUS-W/dp/B014W3EM2W/ref=sr_1_14?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-14#customerReviews',
  'product_id': 'B014W3EM2W'},
 {'url': 'https://www.amazon.com/Passport-Portable-External-Drive-Black/dp/B07VTFN6HM/ref=sr_1_12?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-12#customerReviews',
  'product_id': 'B07VTFN6HM'},
 {'url': 'https://www.amazon.com/Samsung-970-EVO-1TB-MZ-V7E1T0BW/dp/B07BN217QG/ref=sr_1_5?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-5#customerReviews',
  'product_id': 'B07BN217QG'},
 {'url': 'https://www.amazon.com/Samsung-Inch-Internal-MZ-76E1T0B-AM/d

In [29]:
def perform_scrape(cleaned_items =[]):
    extracted_data = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except:
            pass
        if title != None and price !=None:
            print (link, title, price)
        product_data ={
            'url' : link,
            'product_id':product_id,
            'title':title,
            'price':price
        }
        extracted_data.append(product_data)
    
    return extracted_data

In [30]:
# Scrape every cleaned product link; `perform_scrape` prints each item for
# which both a title and a price were obtained.
extracted_data = perform_scrape(cleaned_items = cleaned_links)

https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-16#customerReviews WD Blue 3D NAND 500GB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS500G2B0A $64.99
https://www.amazon.com/Thermaltake-Certified-Continuous-cooling-PS-SPD-0500NPCWUS-W/dp/B014W3EM2W/ref=sr_1_14?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-14#customerReviews Thermaltake Smart 500W 80+ White Certified PSU, Continuous Power with 120mm Ultra Quiet Cooling Fan, ATX 12V V2.3/EPS 12V Active PFC Power Supply PS-SPD-0500NPCWUS-W $52.99
https://www.amazon.com/Passport-Portable-External-Drive-Black/dp/B07VTFN6HM/ref=sr_1_12?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-12#customerReviews WD 2TB My Passport Portable External Hard Drive, Black - WDBYVG0020BBK-WESN $59.99
https://www.amazon.com/Samsung-970-EVO-1TB-MZ-V7E1T0BW/dp/B07B

https://www.amazon.com/Toshiba-HDTB420XK3AA-Canvio-Portable-External/dp/B079D3D8NR/ref=sr_1_11?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-11#customerReviews Toshiba (HDTB420XK3AA) Canvio Basics 2TB Portable External Hard Drive USB 3.0, Black $59.99
https://www.amazon.com/Blue-NAND-1TB-SSD-WDS100T2B0A/dp/B073SBQMCX/ref=sr_1_13?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-13#customerReviews WD Blue 3D NAND 1TB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS100T2B0A $109.99
https://www.amazon.com/Seagate-Portable-External-Hard-Drive/dp/B07CRG94G3/ref=sr_1_1?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-1#customerReviews Seagate STGX2000400 Portable 2TB External Hard Drive Portable HDD - USB 3.0 for PC Laptop, Mac, Xbox & PS4 $62.99
https://www.amazon.com/AMD-Ryzen-3600X-12-Thread-Processor/dp/B07SQBFN2D/ref=sr_1_7?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007

In [31]:
# Inspect the records returned by `perform_scrape`.
extracted_data

[{'url': 'https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-16#customerReviews',
  'product_id': 'B073SBZ8YH',
  'title': 'WD Blue 3D NAND 500GB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS500G2B0A',
  'price': '$64.99'},
 {'url': 'https://www.amazon.com/Thermaltake-Certified-Continuous-cooling-PS-SPD-0500NPCWUS-W/dp/B014W3EM2W/ref=sr_1_14?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-14#customerReviews',
  'product_id': 'B014W3EM2W',
  'title': 'Thermaltake Smart 500W 80+ White Certified PSU, Continuous Power with 120mm Ultra Quiet Cooling Fan, ATX 12V V2.3/EPS 12V Active PFC Power Supply PS-SPD-0500NPCWUS-W',
  'price': '$52.99'},
 {'url': 'https://www.amazon.com/Passport-Portable-External-Drive-Black/dp/B07VTFN6HM/ref=sr_1_12?dchild=1&fst=as%3Aoff&qid=1589819517&rnid=16225007011&s=computers-intl-ship&sr=1-12#customerR