In [12]:
from pathlib import Path
import re
import time

import pandas as pd
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [13]:
# Project paths: every file this notebook writes lives under ./data, resolved
# against the directory the kernel was started in.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check is redundant (and would only add a check-then-create
# race). parents=True keeps this working if DATA_DIR is later nested deeper.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV that receives the product links collected from the category pages.
product_category_links_output = DATA_DIR / 'category-products.csv'
# CSV path for per-product data.
# NOTE(review): nothing in the visible cells writes to this path yet — confirm.
product_output = DATA_DIR / 'products.csv'


In [4]:
# Chrome is started with the --headless flag, i.e. without a visible window.
options = Options()
options.add_argument('--headless')

# Module-level browser session; the scraping functions in later cells read
# this global directly.
# NOTE(review): no driver.quit() appears in the visible cells — confirm the
# session is closed somewhere, otherwise the Chrome process may be left running.
driver = webdriver.Chrome(options=options)

In [5]:
# Best-seller listing pages to crawl: one dict per category, holding a short
# name and the URL of the listing page.
categories = [
    {
        'name': 'toys-and-games',
        'url': "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/",
    },
    {
        'name': 'electronics',
        'url': "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/",
    },
    {
        'name': 'fashion',
        'url': "https://www.amazon.com/Best-Sellers/zgbs/fashion/",
    },
]

In [6]:
# URL shapes that identify an Amazon product page. Every pattern exposes a
# `product_id` named group. The dots in the host name are escaped so they match
# a literal "." only (an unescaped "." would accept any character there).
regex_options = [
    r"https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]


def extract_product_id_from_url(url):
    """Return the product id embedded in an Amazon product URL.

    The URL is matched, anchored at its start, against every pattern in
    ``regex_options``. When more than one pattern matches, the id from the
    pattern listed last is returned.

    Args:
        url: absolute URL string. Anything that is not a ``str`` (for example
            ``None``) is treated as "not a product URL".

    Returns:
        The ``product_id`` group of the matching pattern, or ``None`` when no
        pattern matches.
    """
    if not isinstance(url, str):
        return None
    product_id = None
    for regex_str in regex_options:
        # re.match caches compiled patterns, so no explicit compile is needed.
        match = re.match(regex_str, url)
        if match is None:
            continue
        try:
            product_id = match['product_id']
        except IndexError:
            # Raised only for a pattern that lacks a `product_id` group;
            # ignore that pattern rather than hiding every possible error.
            continue
    return product_id

In [7]:
def clean_page_links(page_links=None):
    """Keep only the URLs that point at a product page.

    Args:
        page_links: iterable of URL strings. ``None`` (the default) is treated
            as an empty input; a ``None`` sentinel replaces the former mutable
            ``[]`` default.

    Returns:
        A new list of ``{"url": ..., "product_id": ...}`` dicts, one for each
        URL for which ``extract_product_id_from_url`` returned an id, in input
        order.
    """
    if page_links is None:
        return []
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({"url": url, "product_id": product_id})
    return final_page_links

In [9]:
def scrape_category_product_links(categories=None):
    """Collect product links from every category listing page.

    Each category URL is loaded in the module-level ``driver``, the rendered
    ``<body>`` HTML is parsed, and the page's site-relative links are turned
    into absolute amazon.com URLs and filtered through ``clean_page_links``.

    Args:
        categories: iterable of dicts with a ``'url'`` key. ``None`` (the
            default) is treated as empty; a ``None`` sentinel replaces the
            former mutable ``[]`` default.

    Returns:
        A list with the ``clean_page_links`` results of all categories,
        concatenated in category order.
    """
    all_product_links = []
    for category in categories or []:
        time.sleep(1.5)  # pause before every page load
        url = category.get('url')
        driver.get(url)
        # find_element(By..., ...) replaces find_element_by_css_selector, which
        # newer Selenium releases no longer provide.
        body_el = driver.find_element(By.CSS_SELECTOR, 'body')
        html_str = body_el.get_attribute('innerHTML')
        html_obj = HTML(html=html_str)
        # Only links starting with "/" are kept; they get the site prefix.
        page_links = [f'https://www.amazon.com{x}' for x in html_obj.links if x.startswith('/')]
        all_product_links.extend(clean_page_links(page_links=page_links))
    return all_product_links

# Run the category scrape over the configured `categories` list and keep the
# combined result for the cells that follow.
all_product_links = scrape_category_product_links(categories)

In [16]:
# Build a DataFrame from the collected link dicts and write it to the category
# CSV; index=False leaves the DataFrame index out of the file.
category_df = pd.DataFrame(all_product_links)
category_df.to_csv(product_category_links_output, index=False)

In [10]:
# Dump the collected links for inspection.
# NOTE(review): this prints the entire list; consider showing only a slice to
# keep the cell output short.
print(all_product_links)

[{'url': 'https://www.amazon.com/Crayola-Adult-Face-Mask-Reusable/dp/B08B2J4J46/ref=zg_bs_toys-and-games_3?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B08B2J4J46'}, {'url': 'https://www.amazon.com/Crayola-Shark-Coloring-Pages-Wonder/dp/B07PMLL5L7/ref=zg_bs_toys-and-games_38?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B07PMLL5L7'}, {'url': 'https://www.amazon.com/Cards-Against-Humanity-LLC-CAHUS/dp/B004S8F7QM/ref=zg_bs_toys-and-games_49?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B004S8F7QM'}, {'url': 'https://www.amazon.com/Elmers-Liquid-School-Washable-Ounces/dp/B072MHQZT1/ref=zg_bs_toys-and-games_31?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B072MHQZT1'}, {'url': 'https://www.amazon.com/Bobor-Trampoline-Sprinklers-Outdoor-Summer/dp/B07QZFS4CY/ref=zg_bs_toys-and-games_35?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B07QZFS4CY'}, {'url': 'https://www.amazon.com/Max-Liquidator-3

In [None]:
def scrape_product_page(url, title_lookup='#productTitle', price_lookup='#priceblock_ourprice'):
    """Load a product page and read its title and price.

    The page is opened in the module-level ``driver``; the rendered ``<body>``
    HTML is then parsed and queried with the two CSS selectors.

    Args:
        url: address of the product page to load.
        title_lookup: CSS selector of the element holding the product title.
        price_lookup: CSS selector of the element holding the price.

    Returns:
        A ``(title, price)`` tuple of element texts. A value is ``None`` when
        its selector matches nothing on the page, so a missing price no longer
        discards a title that was found.
    """
    driver.get(url)
    time.sleep(1.2)  # presumably lets the page finish rendering before the DOM is read
    # find_element(By..., ...) replaces find_element_by_css_selector, which
    # newer Selenium releases no longer provide.
    body_el = driver.find_element(By.CSS_SELECTOR, 'body')
    html_obj = HTML(html=body_el.get_attribute('innerHTML'))

    def _text_or_none(selector):
        # find(..., first=True) yields None when nothing matches the selector.
        element = html_obj.find(selector, first=True)
        return element.text if element is not None else None

    return _text_or_none(title_lookup), _text_or_none(price_lookup)


In [None]:
# `page_links` only exists as a local variable inside
# scrape_category_product_links, so on a fresh kernel it is undefined here.
# Rebuild it at module level as the flat list of collected product URLs.
page_links = [item['url'] for item in all_product_links]
# None instead of an IndexError when nothing was collected.
first_product_link = page_links[0] if page_links else None
first_product_link

In [None]:
# https://www.amazon.ca/TP-Link-Extender-Intelligent-Indicator-RE450/dp/B010S6SG3S/ref=zg_bs_electronics_11/135-2776524-4703011?_encoding=UTF8&psc=1&refRID=PZSK781X651YAS3CFZFS

# <base-url>/<slug>/dp/<product_id>/

In [None]:
# my_regex_pattern = r"https://www.amazon.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"
# my_url = 'https://www.amazon.com/Crayola-Washable-Watercolors-8-ea/dp/B000HHKAE2/'

In [None]:
# regex = re.compile(my_regex_pattern)

In [None]:
# my_match = regex.match(my_url)

In [None]:
# print(my_match)
# my_match['product_id']

In [None]:
# my_match['slug']

In [None]:
# final_page_links = [x for x in page_links if extract_product_id_from_url(x) != None]    ### list comprehension

# Filter the module-level `page_links` (expected to be a list of URL strings)
# down to product pages; each kept entry is a dict with 'url' and 'product_id'.
cleaned_links = clean_page_links(page_links)

In [None]:
# True when clean_page_links kept every entry of page_links, i.e. no URL was
# filtered out.
len(page_links) == len(cleaned_links)

In [None]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for every cleaned product link.

    Args:
        cleaned_items: iterable of dicts with ``'url'`` and ``'product_id'``
            keys. ``None`` (the default) is treated as empty; a ``None``
            sentinel replaces the former mutable ``[]`` default.

    Returns:
        A list with one dict per input item, holding ``url``, ``product_id``,
        ``title`` and ``price``. ``title`` and ``price`` stay ``None`` when
        scraping that page raised an error. An empty input yields ``[]``.
    """
    data_extracted = []
    for obj in cleaned_items or []:
        link = obj['url']
        product_id = obj['product_id']
        title, price = None, None
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Best effort: one bad page must not abort the run, but report the
            # failure instead of hiding it. Unlike a bare `except`, this lets
            # KeyboardInterrupt stop a long scrape.
            print(f'Could not scrape {link}: {exc!r}')
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
        })
    # Return only after the loop: the previous in-loop return stopped after the
    # first item.
    return data_extracted

In [None]:
# Number of entries in page_links, the list passed to clean_page_links.
len(page_links)

In [None]:
# Number of links that clean_page_links kept.
len(cleaned_links)

In [None]:
# Run the product-page scrape over the cleaned links and keep its result.
extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [None]:
# Show whatever perform_scrape returned.
# NOTE(review): `product_output` is defined near the top of the notebook, but
# no visible cell writes extracted_data to it — confirm whether a save step is
# missing.
print(extracted_data)