In [1]:
from pathlib import Path
import os
import re
import requests
import time
import datetime
import pandas as pd
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Project paths: everything this notebook reads or writes lives under
# ./data, relative to the directory the kernel was started from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
# mkdir(exist_ok=True) is already a no-op for an existing directory, so a
# separate exists() check is redundant (and racy). It also fails fast if
# 'data' exists but is a regular file instead of a directory.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV holding the product links collected from the category listing pages.
product_category_links_output = DATA_DIR / 'cat-products.csv'
# CSV holding the accumulated per-product scrape results.
product_output = DATA_DIR / 'products.csv'


In [3]:
# Run Chrome without a visible browser window.
options = Options()
options.add_argument('--headless')

# Single browser session shared through the module-level name `driver`;
# the scraping functions in this notebook call driver.get(...) on it.
# NOTE(review): no driver.quit() is visible in this notebook — confirm the
# browser process is shut down somewhere once scraping is finished.
driver = webdriver.Chrome(options=options)

In [4]:
# (name, listing URL) pairs for the category pages whose product links are
# scraped. Each pair becomes a {'name': ..., 'url': ...} dict below.
_category_urls = (
    (
        'computer-components',
        'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A193870011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_1',
    ),
    (
        'data-storage',
        'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292110011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_3',
    ),
    (
        'laptop-accessories',
        'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A3011391011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_4',
    ),
    (
        'monitors',
        'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A1292115011&dc&fst=as%3Aoff&qid=1589403115&rnid=16225007011&ref=lp_16225007011_nr_n_5',
    ),
)

categories = [
    {'name': cat_name, 'url': cat_url}
    for cat_name, cat_url in _category_urls
]

In [5]:
# URL shapes recognised as product pages. Every pattern exposes the product
# identifier as the named group `product_id` and requires a '/' right after
# it. The dots in the host name are escaped so they only match a literal '.'
# (an unescaped '.' would accept any character in that position).
regex_options = [
    r'https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]

In [6]:
def extract_product_id_from_url(url):
    """Return the product id embedded in ``url``, or ``None`` if none is found.

    ``url`` is tested against every pattern in the module-level
    ``regex_options`` list. Matching is anchored at the start of the string
    (``re.match``). If several patterns match, the id captured by the last
    matching pattern is returned.

    Parameters
    ----------
    url : str
        URL to inspect. Any non-string value (``None``, a NaN read back from
        a CSV, ...) is treated as "no product id" instead of raising.

    Returns
    -------
    str or None
        Value of the ``product_id`` named group, or ``None``.
    """
    if not isinstance(url, str):
        return None

    product_id = None
    for pattern in regex_options:
        # re caches compiled patterns, so no explicit re.compile is needed.
        match = re.match(pattern, url)
        if match is None:
            continue
        try:
            product_id = match['product_id']
        except IndexError:
            # Pattern defines no `product_id` group; keep any earlier result.
            pass

    return product_id

In [7]:
# page_links = [x for x in page_links if extract_product_id_from_url(x) != None]

def clean_page_links(page_links = [], category = None):
    """Keep only the links that carry a product id.

    Each URL in ``page_links`` is passed to ``extract_product_id_from_url``;
    URLs for which it returns ``None`` are dropped.

    Returns a list of dicts with the keys ``url``, ``product_id`` and
    ``category`` (``category`` is stored exactly as passed in), in the same
    order as the input.
    """
    # Lazily pair every link with its extracted id (one extraction per link).
    candidates = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {'url': link, 'product_id': pid, 'category': category}
        for link, pid in candidates
        if pid is not None
    ]

# cleaned_links = clean_page_links(page_links)

In [8]:
def scrape_cat_product_links(categories =[]):
    """Collect product links from every category listing page.

    For each entry of ``categories`` (dicts with a ``'url'`` key) the listing
    page is loaded in the shared Selenium ``driver``, every site-relative
    link (starting with ``'/'``) is turned into an absolute amazon.com URL,
    and the links are filtered through ``clean_page_links``.

    Returns
    -------
    list of dict
        The concatenated ``clean_page_links`` results for all categories.
        Note that the whole category dict (not just its name) is stored
        under the ``'category'`` key of every record.
    """
    all_product_links = []
    for cat in categories:
        # Pause before each page load so listings are not requested back to back.
        time.sleep(1.5)
        driver.get(cat.get('url'))
        # find_element(<locator strategy>, <value>) works on Selenium 3 and 4;
        # the find_element_by_* helpers were removed in Selenium 4.3.
        body_el = driver.find_element('css selector', 'body')
        html_obj = HTML(html=body_el.get_attribute('innerHTML'))
        page_links = [f'https://www.amazon.com{x}' for x in html_obj.links if x.startswith('/')]
        all_product_links.extend(clean_page_links(page_links=page_links, category=cat))
    return all_product_links

In [9]:
def extract_cat_and_save(categories =[]):
    """Scrape the product links of ``categories`` and save them as CSV.

    The records returned by ``scrape_cat_product_links`` are loaded into a
    DataFrame and written to ``product_category_links_output`` without the
    index column. Returns ``None``.
    """
    pd.DataFrame(scrape_cat_product_links(categories)).to_csv(
        product_category_links_output,
        index=False,
    )

In [10]:
# Crawl every configured category listing and write the product links CSV.
extract_cat_and_save(categories)

In [11]:
def scrape_product_page(url, title_lookup='#productTitle',
                        price_lookup='#priceblock_ourprice'):
    """Load a product page and return its ``(title, price)`` texts.

    Parameters
    ----------
    url : str
        Product page to open in the shared Selenium ``driver``.
    title_lookup, price_lookup : str
        CSS selectors used to locate the title and price elements.

    Returns
    -------
    tuple
        ``(product_title, product_price)``. Each item is the element's text,
        or ``None`` when the selector matches nothing, so a page without a
        price element still yields its title.
    """
    driver.get(url)
    # Fixed pause after navigation before the DOM is read.
    time.sleep(1.5)
    # find_element(<locator strategy>, <value>) works on Selenium 3 and 4;
    # the find_element_by_* helpers were removed in Selenium 4.3.
    body_el = driver.find_element('css selector', 'body')
    html_obj = HTML(html=body_el.get_attribute('innerHTML'))

    # find(..., first=True) yields None when nothing matches; guard each field
    # separately instead of letting `.text` raise AttributeError for both.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [12]:
def perform_scrape(cleaned_items =[]):
    """Scrape title and price for every item and return the collected records.

    Parameters
    ----------
    cleaned_items : iterable of dict
        Records with at least the keys ``'url'`` and ``'product_id'``.

    Returns
    -------
    list of dict
        One record per input item with the keys ``url``, ``product_id``,
        ``title`` and ``price``. ``title`` and ``price`` are ``None`` when
        scraping that item failed.
    """
    extracted_data = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best effort: a page that cannot be scraped is recorded with
            # empty fields. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop a long-running scrape.
            pass
        if title is not None and price is not None:
            print(link, title, price)
        extracted_data.append({
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
        })

    return extracted_data

In [13]:
def row_scraper(row, *args, **kwargs):
    """Scrape one row's product page and record the result on the row.

    Intended for ``DataFrame.apply(row_scraper, axis=1)``. Extra positional
    and keyword arguments are accepted and ignored.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``'url'``. An optional ``'scraped'`` value of ``1`` or
        ``"1"`` marks the row as already processed.

    Returns
    -------
    The same ``row`` object. Already-scraped rows are returned untouched;
    otherwise ``title``, ``price`` (both ``None`` if scraping failed),
    ``scraped`` (set to ``1``) and ``timestamp`` (``datetime.now().timestamp()``)
    are set on it.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # No 'scraped' entry yet: treat the row as not processed.
        scraped = 0
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row

    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except Exception:
        # Best effort: keep going with empty fields. Catching Exception (not
        # a bare except) lets KeyboardInterrupt stop a long apply() run.
        pass
    row['title'] = title
    row['price'] = price
    # The row is flagged as scraped even when the page could not be parsed.
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [14]:
# Reload the saved category links from disk, so the scraping cells that
# follow work from the CSV rather than from in-memory results.
df = pd.read_csv(product_category_links_output)
# Preview the first rows.
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/Ryzen-3600-Procesador-e...,B07STGGQ18,"{'name': 'computer-components', 'url': 'https:..."
1,https://www.amazon.com/Samsung-Inch-Internal-M...,B07L3D19MY,"{'name': 'computer-components', 'url': 'https:..."
2,https://www.amazon.com/Samsung-Inch-Internal-M...,B078DPCY3T,"{'name': 'computer-components', 'url': 'https:..."
3,https://www.amazon.com/Samsung-970-EVO-1TB-MZ-...,B07BN217QG,"{'name': 'computer-components', 'url': 'https:..."
4,https://www.amazon.com/Blue-NAND-1TB-SSD-WDS10...,B073SBQMCX,"{'name': 'computer-components', 'url': 'https:..."


Unnamed: 0,url,product_id,category
0,https://www.amazon.com/Ryzen-3600-Procesador-e...,B07STGGQ18,"{'name': 'computer-components', 'url': 'https:..."
1,https://www.amazon.com/Samsung-Inch-Internal-M...,B07L3D19MY,"{'name': 'computer-components', 'url': 'https:..."
2,https://www.amazon.com/Samsung-Inch-Internal-M...,B078DPCY3T,"{'name': 'computer-components', 'url': 'https:..."
3,https://www.amazon.com/Samsung-970-EVO-1TB-MZ-...,B07BN217QG,"{'name': 'computer-components', 'url': 'https:..."
4,https://www.amazon.com/Blue-NAND-1TB-SSD-WDS10...,B073SBQMCX,"{'name': 'computer-components', 'url': 'https:..."


In [15]:
# Scrape title/price for every link row. row_scraper returns rows already
# flagged as scraped unchanged. (The former `df_sub = df.copy()` was a dead
# store: its result was overwritten by this assignment without being used.)
df_sub = df.apply(row_scraper, axis=1)

https://www.amazon.com/Ryzen-3600-Procesador-escritorio-desbloqueado/dp/B07STGGQ18/ref=sr_1_3?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-3 AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked Desktop Processor with Wraith Stealth Cooler $172.39
https://www.amazon.com/Ryzen-3600-Procesador-escritorio-desbloqueado/dp/B07STGGQ18/ref=sr_1_3?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-3 AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked Desktop Processor with Wraith Stealth Cooler $172.39
https://www.amazon.com/Ryzen-3600-Procesador-escritorio-desbloqueado/dp/B07STGGQ18/ref=sr_1_3?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-3 AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked Desktop Processor with Wraith Stealth Cooler $172.39
https://www.amazon.com/Ryzen-3600-Procesador-escritorio-desbloqueado/dp/B07STGGQ18/ref=sr_1_3?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-3 A

https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-15 WD Blue 3D NAND 500GB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS500G2B0A $64.99
https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-15 WD Blue 3D NAND 500GB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS500G2B0A $64.99
https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-15#customerReviews WD Blue 3D NAND 500GB Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 560 MB/s - WDS500G2B0A $64.99
https://www.amazon.com/Blue-NAND-500GB-SSD-WDS500G2B0A/dp/B073SBZ8YH/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-15#customerReviews

https://www.amazon.com/Corsair-Vengeance-3200MHz-Desktop-Memory/dp/B0143UM4TC/ref=sr_1_9?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-9 Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3200MHz C16 Desktop Memory Kit - Black (CMK16GX4M2B3200C16),Vengeance LPX Black $74.99
https://www.amazon.com/Corsair-Vengeance-3200MHz-Desktop-Memory/dp/B0143UM4TC/ref=sr_1_9?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-9 Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3200MHz C16 Desktop Memory Kit - Black (CMK16GX4M2B3200C16),Vengeance LPX Black $74.99
https://www.amazon.com/Kingston-240GB-Solid-SA400S37-240G/dp/B01N5IB20Q/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589824327&rnid=16225007011&s=computers-intl-ship&sr=1-16 Kingston 240GB A400 SATA 3 2.5" Internal SSD SA400S37/240G - HDD Replacement for Increase Performance $39.99
https://www.amazon.com/Kingston-240GB-Solid-SA400S37-240G/dp/B01N5IB20Q/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=15898

https://www.amazon.com/Samsung-970-EVO-1TB-MZ-V7E1T0BW/dp/B07BN217QG/ref=sr_1_2?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-2 Samsung (MZ-V7E1T0BW) 970 EVO SSD 1TB - M.2 NVMe Interface Internal Solid State Drive with V-NAND Technology, Black/Red $179.99
https://www.amazon.com/Passport-Portable-External-Drive-Black/dp/B07VTFN6HM/ref=sr_1_6?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-6#customerReviews WD 2TB My Passport Portable External Hard Drive, Black - WDBYVG0020BBK-WESN $59.99
https://www.amazon.com/Passport-Portable-External-Drive-Black/dp/B07VTFN6HM/ref=sr_1_6?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-6#customerReviews WD 2TB My Passport Portable External Hard Drive, Black - WDBYVG0020BBK-WESN $59.99
https://www.amazon.com/Toshiba-HDTB420XK3AA-Canvio-Portable-External/dp/B079D3D8NR/ref=sr_1_5?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&s

https://www.amazon.com/Samsung-T5-Portable-SSD-MU-PA500B/dp/B073GZBT36/ref=sr_1_11?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-11#customerReviews None None
https://www.amazon.com/Samsung-Inch-Internal-MZ-76E1T0B-AM/dp/B078DPCY3T/ref=sr_1_3?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-3#customerReviews None None
https://www.amazon.com/Samsung-T5-Portable-SSD-MU-PA500R/dp/B07Q41P7B7/ref=sr_1_11?fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-11 None None
https://www.amazon.com/Passport-Portable-External-Drive-3-Pack/dp/B088P7LT41/ref=sr_1_6?fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-6 None None
https://www.amazon.com/WD_Black-Drive-External-Compatible-WDBA3A0050BBK-WESN/dp/B07VNTFHD5/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589824331&rnid=16225007011&s=computers-intl-ship&sr=1-16 None None
https://www.amazon.com/Samsung-Inch-Internal-MZ-76Q1T0B-AM/dp/B07L3D19MY/ref=

https://www.amazon.com/Nulaxy-Ergonomic-Aluminum-Detachable-Compatible/dp/B07SSXGM1Q/ref=sr_1_17?fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-17 None None
https://www.amazon.com/MOSISO-MacBook-Keyboard-Protector-Compatible/dp/B07M9XBXSH/ref=sr_1_13?fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-13 None None
https://www.amazon.com/Charger-Replacement-Magsafe-Adapter-13-Inch-Before/dp/B07P71G7B6/ref=sr_1_20?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-20#customerReviews None None
https://www.amazon.com/Dell-452-BCYT-D6000-Universal-Black/dp/B071YTQBXM/ref=sr_1_8?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-8 None None
https://www.amazon.com/Dell-Docking-Station-Delivery-DisplayPort/dp/B07S3XHMP1/ref=sr_1_10?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-10#customerReviews None None
https://www.amazon.com/Plugable-Universal-Docking-S

https://www.amazon.com/Dell-Screen-LED-Lit-Monitor-P2419H/dp/B07F8XZN69/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824338&rnid=16225007011&s=computers-intl-ship&sr=1-15#customerReviewshttps://www.amazon.com/CalDigit-TS3-Plus-Thunderbolt-Dock/dp/B07DYMVY99/ref=sr_1_5?fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-5 None None
https://www.amazon.com/Lenovo-ThinkPad-USB-C-Dock-40AS0090US/dp/B07R68XHZ9/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-16#customerReviews None None
https://www.amazon.com/Hiearcool-USB-Hub-11-Compatiable/dp/B07QNRM45T/ref=sr_1_23?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-23 None None
https://www.amazon.com/Charger-Replacement-Magsafe-Magnetic-Adapter/dp/B07RPD51XV/ref=sr_1_4?dchild=1&fst=as%3Aoff&qid=1589824335&rnid=16225007011&s=computers-intl-ship&sr=1-4 None None
https://www.amazon.com/Lenovo-ThinkPad-Thunderbolt-40AN0135US-Capability/dp/B07M6S81CM/ref=

https://www.amazon.com/Dell-Screen-LED-Lit-Monitor-P2419H/dp/B07F8XZN69/ref=sr_1_15?dchild=1&fst=as%3Aoff&qid=1589824338&rnid=16225007011&s=computers-intl-ship&sr=1-15#customerReviews None None
https://www.amazon.com/Dell-SE2419Hx-23-8-1920x1080-Monitor/dp/B07HKV5RLG/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589824338&rnid=16225007011&s=computers-intl-ship&sr=1-16 None None
 None None
https://www.amazon.com/Dell-SE2419Hx-23-8-1920x1080-Monitor/dp/B07HKV5RLG/ref=sr_1_16?dchild=1&fst=as%3Aoff&qid=1589824338&rnid=16225007011&s=computers-intl-ship&sr=1-16 None None


In [16]:
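# NOTE(review): left disabled. Presumably a one-time bootstrap that creates
# products.csv before it is first read — confirm before re-enabling.
#df.to_csv(product_output, index = False)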

In [17]:
# Load the products accumulated by earlier runs. On a first run the file
# does not exist yet, so start from an empty frame instead of raising
# FileNotFoundError.
if product_output.exists():
    products_df = pd.read_csv(product_output)
else:
    products_df = pd.DataFrame()
products_df.head()

Unnamed: 0,url,product_id
0,https://www.amazon.com/AMD-Ryzen-3900X-24-Thre...,B07SXMZLP9
1,https://www.amazon.com/Seagate-Portable-Extern...,B07CRG94G3
2,https://www.amazon.com/Ryzen-3600-Procesador-e...,B07STGGQ18
3,https://www.amazon.com/Kingston-240GB-Solid-SA...,B01N5IB20Q
4,https://www.amazon.com/Kingston-240GB-Solid-SA...,B01N5IB20Q


Unnamed: 0,url,product_id
0,https://www.amazon.com/AMD-Ryzen-3900X-24-Thre...,B07SXMZLP9
1,https://www.amazon.com/Seagate-Portable-Extern...,B07CRG94G3
2,https://www.amazon.com/Ryzen-3600-Procesador-e...,B07STGGQ18
3,https://www.amazon.com/Kingston-240GB-Solid-SA...,B01N5IB20Q
4,https://www.amazon.com/Kingston-240GB-Solid-SA...,B01N5IB20Q


In [18]:
# Stack the previously saved products on top of this run's scrape results
# and write the combined table back to products.csv (index not saved).
# NOTE(review): nothing here de-duplicates, so re-running appears to append
# the same products again — confirm whether that history is intended.
final_df = pd.concat(objs=[products_df, df_sub])
final_df.to_csv(product_output, index=False)