In [1]:
from pathlib import Path
import re
import requests
import time
import datetime

import pandas as pd
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Everything the notebook reads or writes lives under <current working dir>/data.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("data")

# Create the data directory only when nothing exists at that path yet.
if DATA_DIR.exists() is False:
    DATA_DIR.mkdir(exist_ok=True)

# CSV holding the product links collected from the category pages.
product_category_links_output = DATA_DIR.joinpath("category-products.csv")
# CSV holding the per-product scrape results.
product_output = DATA_DIR.joinpath("products.csv")

In [3]:
# Configure Chrome to run headless (no visible browser window).
options = Options()
options.add_argument("--headless")

# Single module-level browser session; the scraping functions in this notebook
# read this `driver` global rather than taking a driver argument.
driver = webdriver.Chrome(options=options)

In [4]:
# Category listing pages to crawl. Each entry pairs a short category name with
# the URL of that category's best-sellers page.
categories = [
    {
        "name": "toys-and-games",
        "url": "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/",
    },
    {
        "name": "electronics",
        "url": "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/",
    },
    {
        "name": "fashion",
        "url": "https://www.amazon.com/Best-Sellers/zgbs/fashion/",
    },
]

In [5]:
# Recognised product-page URL shapes. Every pattern captures the product ID in
# the `product_id` group. Dots in the host are escaped so that only the literal
# host matches, and the ID may be followed by "/", "?", "#" or the end of the
# URL (not only by a trailing slash).
regex_options = [
    r"https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)(?:[/?#]|$)",
    r"https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)(?:[/?#]|$)",
    r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)(?:[/?#]|$)",
]

def extract_product_id_from_url(url):
    """Return the product ID embedded in a product-page URL, or None.

    The URL is matched (anchored at its start) against every pattern in
    `regex_options`. If more than one pattern matches, the ID from the pattern
    listed last wins.

    Args:
        url: Absolute URL string. Any non-string value (e.g. None) yields None
            instead of raising.

    Returns:
        The captured `product_id` string, or None when no pattern matches.
    """
    if not isinstance(url, str):
        return None
    product_id = None
    for regex_str in regex_options:
        # re.match caches compiled patterns, so no explicit compile is needed.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict().get tolerates a pattern that lacks a product_id group.
        found = match.groupdict().get("product_id")
        if found is not None:
            product_id = found
    return product_id

In [6]:
def clean_page_links(page_links=[], category=None):
    """Keep only the links from which a product ID can be extracted.

    Each URL in `page_links` is handed to `extract_product_id_from_url`; URLs
    for which that returns None are dropped.

    Args:
        page_links: Iterable of URL strings.
        category: Value stored unchanged under the "category" key of every
            kept entry.

    Returns:
        A list of dicts with the keys "url", "product_id" and "category", in
        the iteration order of `page_links`.
    """
    product_links = []
    for link in page_links:
        found_id = extract_product_id_from_url(link)
        if found_id is None:
            continue
        product_links.append(dict(url=link, product_id=found_id, category=category))
    return product_links

In [7]:
def scrace_category_product_links(categories=[]):
    """Collect product links from every category listing page.

    For each category dict the page at its "url" is loaded in the module-level
    `driver`, the <body> markup is parsed with `HTML`, site-relative links
    (those starting with "/") are made absolute, and `clean_page_links` keeps
    the ones that carry a product ID.

    Args:
        categories: Iterable of dicts that each provide a "url" entry. The
            whole dict is passed on as the `category` of every link found on
            its page. Entries without a URL are skipped.

    Returns:
        A flat list of the dicts produced by `clean_page_links`, in category
        order.
    """
    all_product_links = []
    for category in categories:
        url = category.get("url")
        if not url:
            # Nothing to load for this entry.
            continue
        # Fixed pause before every page load.
        time.sleep(1.5)
        driver.get(url)
        # Locator passed as (strategy, value): the find_element_by_* helpers
        # were removed in Selenium 4.3, while this form works on 3.x and 4.x.
        body_el = driver.find_element("css selector", "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        all_product_links.extend(clean_page_links(page_links=page_links, category=category))
    return all_product_links

In [8]:
def extract_categories_and_save(categories=[]):
    """Scrape the product links for `categories` and write them to CSV.

    The records returned by `scrace_category_product_links` are loaded into a
    DataFrame and written, without the index column, to
    `product_category_links_output`. Returns None.
    """
    pd.DataFrame(scrace_category_product_links(categories)).to_csv(
        product_category_links_output,
        index=False,
    )

In [9]:
# Crawl every configured category page and write the product links to
# `product_category_links_output`.
extract_categories_and_save(categories=categories)

In [10]:
def scrape_product_page(url, title_lookup = "#productTitle", price_lookup = "#priceblock_ourprice"):
    """Load a product page and return its title and price text.

    The page is opened in the module-level `driver`; after a fixed pause the
    <body> markup is parsed with `HTML` and the first element matching each
    CSS selector is read.

    Args:
        url: Product page URL.
        title_lookup: CSS selector for the title element.
        price_lookup: CSS selector for the price element.

    Returns:
        A `(title, price)` tuple of strings. Each item is None when its
        selector matches nothing, so a page without the price element still
        yields its title.
    """
    driver.get(url)
    # Fixed pause after navigation before the markup is read.
    time.sleep(1.5)
    # Locator passed as (strategy, value): the find_element_by_* helpers were
    # removed in Selenium 4.3, while this form works on 3.x and 4.x.
    body_el = driver.find_element("css selector", "body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    # find(..., first=True) gives None when nothing matches; look each element
    # up independently so one missing field does not discard the other.
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [11]:
def perform_scrape(cleaned_items=[]):
    """Scrape title and price for every product link.

    Args:
        cleaned_items: Iterable of dicts with "url" and "product_id" keys.

    Returns:
        One dict per input item with the keys "url", "product_id", "title"
        and "price". "title" and "price" are None for items whose page could
        not be scraped.
    """
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Best-effort scrape: report the failure and keep going. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # stop a long run.
            print(f"failed to scrape {link}: {exc!r}")
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        })
    return data_extracted

In [12]:
# extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [13]:
# print(extracted_data)

In [14]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one row's product page and record the result on the row.

    Written to be used with `DataFrame.apply(..., axis=1)`. Rows whose
    "scraped" value is already 1 (or "1") are returned untouched, so a
    partially processed frame can be re-run.

    Args:
        row: Mapping-like row with a "url" entry and, optionally, "scraped".
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same row with "title", "price", "scraped" (set to 1) and
        "timestamp" (POSIX seconds at scrape time) filled in. "title" and
        "price" are None when the page could not be scraped; the row is still
        marked as scraped in that case.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # The row has no "scraped" entry yet: treat it as not scraped.
        scraped = 0
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except Exception:
        # Best-effort scrape: leave title/price as None. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the run.
        pass
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [15]:
# Load the product links previously written to `product_category_links_output`
# and preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/Melissa-Doug-Activity-C...,B00EJAEUBC,"{'name': 'toys-and-games', 'url': 'https://www..."
1,https://www.amazon.com/Munchkin-17040-Fishin-B...,B01N52DUNK,"{'name': 'toys-and-games', 'url': 'https://www..."
2,https://www.amazon.com/Play-Doh-Modeling-Compo...,B00JM5GZGW,"{'name': 'toys-and-games', 'url': 'https://www..."
3,https://www.amazon.com/Crayola-Washable-Specia...,B00PY47LHW,"{'name': 'toys-and-games', 'url': 'https://www..."
4,https://www.amazon.com/Bunch-Balloons-Pack-Ama...,B07GW2QQWN,"{'name': 'toys-and-games', 'url': 'https://www..."


In [16]:
# (rows, columns) of the loaded link table.
df.shape

(150, 3)

In [17]:
# Work on a copy so `df` keeps the link table as loaded; the trailing comment
# shows the alternative of taking only the first 10 rows.
df_sub = df.copy() # df.head(n=10)

In [20]:
# Scrape every row (one page load per row). `row_scrape_event` returns rows
# already marked scraped == 1 unchanged, so re-running this cell does not
# re-fetch them.
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.com/Melissa-Doug-Activity-Child-Safe-Scissors/dp/B00EJAEUBC/ref=zg_bs_toys-and-games_15/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 Melissa & Doug Scissor Skills Activity Pad $4.99
https://www.amazon.com/Melissa-Doug-Activity-Child-Safe-Scissors/dp/B00EJAEUBC/ref=zg_bs_toys-and-games_15/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 Melissa & Doug Scissor Skills Activity Pad $4.99
https://www.amazon.com/Munchkin-17040-Fishin-Bath-Toy/dp/B01N52DUNK/ref=zg_bs_toys-and-games_48/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 Munchkin Fishin' Bath Toy $7.73
https://www.amazon.com/Play-Doh-Modeling-Compound-Non-Toxic-Exclusive/dp/B00JM5GZGW/ref=zg_bs_toys-and-games_17/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 Play-Doh Modeling Compound 36-Pack Case of Colors, Non-Toxic, Assorted Colors, 3-Ounce Cans (Amazon Exclusive) $24.99
https://www.amazon.com/Crayola-Washable-Special-Sidewa

https://www.amazon.com/ThinkFun-Zingo-Winning-Pre-Readers-Readers/dp/B01DY818JG/ref=zg_bs_toys-and-games_46/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 ThinkFun Zingo Bingo Award Winning Preschool Game for Pre-Readers and Early Readers Age 4 and Up - One of the Most Popular Board Games for Boys and Girls and their Parents, Amazon Exclusive Version $19.99
https://www.amazon.com/Elmers-Liquid-Glitter-Washable-Assorted/dp/B008M56YZU/ref=zg_bs_toys-and-games_39/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 Elmer's Liquid Glitter Glue, Great For Making Slime, Washable, Assorted Colors, 6 Ounces Each, 3 Count $11.95
https://www.amazon.com/SunWorks-Construction-Paper-White-Sheets/dp/B0017OHG1O/ref=zg_bs_toys-and-games_35/132-3105173-9301645?_encoding=UTF8&psc=1&refRID=BZ10YZJJ1T0QC9YT1AR2 SunWorks Heavyweight Construction Paper, 9 x 12 Inches, White, 100 Sheets $3.69
https://www.amazon.com/Cards-Against-Humanity-LLC-CAHUS/dp/B004S8F7QM/ref=zg_bs

https://www.amazon.com/All-new-Kindle-Paperwhite-Waterproof-Storage/dp/B07CXG6C9W/ref=zg_bs_electronics_30?_encoding=UTF8&psc=1&refRID=GGNJR9AXHDW7SAG05ZFQ Kindle Paperwhite – Now Waterproof with 2x the Storage – Includes Special Offers $129.99
https://www.amazon.com/GE-Outlet-Protector-Extension-14092/dp/B00DOMYL24/ref=zg_bs_electronics_19?_encoding=UTF8&psc=1&refRID=GGNJR9AXHDW7SAG05ZFQ GE 6 Outlet Surge Protector, 10 Ft Extension Cord, Power Strip, 800 Joules, Flat Plug, Twist-to-Close Safety Covers, White, 14092 $12.02
https://www.amazon.com/all-new-fire-tv-cube-with-alexa-voice-remote/dp/B07KGVB6D6/ref=zg_bs_electronics_44?_encoding=UTF8&psc=1&refRID=GGNJR9AXHDW7SAG05ZFQ All-new Fire TV Cube, hands-free with Alexa built in, 4K Ultra HD, streaming media player, released 2019 $119.99
https://www.amazon.com/All-New-Tablet-Display-Canary-Yellow/dp/B07952VWF2/ref=zg_bs_electronics_13?_encoding=UTF8&psc=1&refRID=GGNJR9AXHDW7SAG05ZFQ Fire HD 8 Tablet (8" HD Display, 16 GB) - Yellow $49.9

https://www.amazon.com/Hanes-ComfortSoft-Cotton-T-Shirt-Black-Steel-S-2PK/dp/B018MS63B6/ref=zg_bs_fashion_47?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Rebirth-Shop-Anti-Haze-Breathable-Lightweight/dp/B08767JLB8/ref=zg_bs_fashion_36?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Fruit-Loom-womens-Cotton-Pullover/dp/B07K9C8NRT/ref=zg_bs_fashion_45?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Fruit-Loom-Womens-Built-Up-Sports/dp/B07KL1WLG6/ref=zg_bs_fashion_26?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Carhartt-Workwear-Pocket-Short-Sleeve-T-Shirt/dp/B0007XB2Y8/ref=zg_bs_fashion_1?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Bali-Designs-Womens-Comfort-Revolution/dp/B06X1BWH4S/ref=zg_bs_fashion_21?_encoding=UTF8&psc=1&refRID=M4P7EFK0DWPMB3G1P3S6 None None
https://www.amazon.com/Amazon-Essentials-2-Pack-L

In [None]:
# df.to_csv(product_output, index=False)

In [19]:
# Results saved by earlier runs. On a first run products.csv does not exist
# yet, so start from an empty frame instead of failing in read_csv.
products_df = pd.read_csv(product_output) if product_output.exists() else pd.DataFrame()

In [21]:
# Append this run's rows to the saved results. ignore_index=True renumbers the
# rows so the combined frame has no duplicate index labels; the CSV itself is
# written without the index column.
final_df = pd.concat([products_df, df_sub], ignore_index=True)
final_df.to_csv(product_output, index=False)

In [24]:
# Preview the first rows of the combined results.
final_df.head()

Unnamed: 0,url,product_id,title,price,timestamp,category,scraped
0,https://www.amazon.com/First-Years-Stack-Up-Cu...,B00005C5H4,,,,,
1,https://www.amazon.com/Crayola-Markers-Assorte...,B01C64BASI,"Crayola Fine Line Markers Adult Coloring Set, ...",$12.99,1587710000.0,"{'name': 'toys-and-games', 'url': 'https://www...",1.0
2,https://www.amazon.com/Crayola-Washable-Sidewa...,B00LH1WN4W,,,1587710000.0,"{'name': 'toys-and-games', 'url': 'https://www...",1.0
3,https://www.amazon.com/Crayola-Colored-Pre-sha...,B018HB2QFU,"Crayola Colored Pencils, Adult Coloring, Fun A...",$19.38,1587710000.0,"{'name': 'toys-and-games', 'url': 'https://www...",1.0
4,https://www.amazon.com/L-L-Surprise-Candylicio...,B07XSQ3BYJ,L.O.L. Surprise! O.M.G. Candylicious Fashion D...,$26.88,1587710000.0,"{'name': 'toys-and-games', 'url': 'https://www...",1.0
