In [27]:
from pathlib import Path
import re
import requests
import time
import datetime

import pandas as pd
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [13]:
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data' # os.path.join(BASE_DIR, 'data')
if not DATA_DIR.exists(): #os.path.exists(DATA_DIR)
    DATA_DIR.mkdir(exist_ok=True) #os.makedirs(DATA_DIR, exist_ok=True)

product_category_links_output = DATA_DIR / 'category-products.csv'
product_output = DATA_DIR / 'products.csv'


In [4]:
options = Options()
options.add_argument('--headless')

driver = webdriver.Chrome(options=options)

In [5]:
categories = [
    {'name': 'toys-and-games', 'url':"https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/"},
    {'name': 'electronics', 'url': "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/"},
    {'name': 'fashion', 'url':"https://www.amazon.com/Best-Sellers/zgbs/fashion/"}
]

In [6]:
regex_options = [
    r"https://www.amazon.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

def extract_product_id_from_url(url):
    product_id = None
    for regex_str in regex_options:
        regex = re.compile(regex_str)
        match = regex.match(url)
        if match != None:
            try:
                product_id = match['product_id']
            except:
                pass
    return product_id

In [29]:
def clean_page_links(page_links=[], category=None):
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id != None:
            final_page_links.append({"url": url, "product_id": product_id, "category": category})
    return final_page_links

In [30]:
def scrace_category_product_links(categories=[]):
    all_product_links = []
    for category in categories:
        time.sleep(1.5)
        url = category.get("url")
        driver.get(url)
        body_el = driver.find_element_by_css_selector("body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        cleaned_links = clean_page_links(page_links=page_links, category=category)
        all_product_links += cleaned_links
    return all_product_links


In [31]:
def extract_categories_and_save(categories=[]):
    all_product_links = scrape_category_product_links(categories)
    category_df = pd.DataFrame(all_product_links)
    category_df.to_csv(product_category_links_output, index=False)
    

In [32]:
extract_categories_and_save(categories=categories)

In [10]:
print(all_product_links)

[{'url': 'https://www.amazon.com/Crayola-Adult-Face-Mask-Reusable/dp/B08B2J4J46/ref=zg_bs_toys-and-games_3?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B08B2J4J46'}, {'url': 'https://www.amazon.com/Crayola-Shark-Coloring-Pages-Wonder/dp/B07PMLL5L7/ref=zg_bs_toys-and-games_38?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B07PMLL5L7'}, {'url': 'https://www.amazon.com/Cards-Against-Humanity-LLC-CAHUS/dp/B004S8F7QM/ref=zg_bs_toys-and-games_49?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B004S8F7QM'}, {'url': 'https://www.amazon.com/Elmers-Liquid-School-Washable-Ounces/dp/B072MHQZT1/ref=zg_bs_toys-and-games_31?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B072MHQZT1'}, {'url': 'https://www.amazon.com/Bobor-Trampoline-Sprinklers-Outdoor-Summer/dp/B07QZFS4CY/ref=zg_bs_toys-and-games_35?_encoding=UTF8&psc=1&refRID=HERZ0C3GGF1W43NQ77QA', 'product_id': 'B07QZFS4CY'}, {'url': 'https://www.amazon.com/Max-Liquidator-3

In [33]:
def scrape_product_page(url, title_lookup = '#productTitle', price_lookup = '#priceblock_ourprice'):
    driver.get(url)
    time.sleep(1.2)
    body_el = driver.find_element_by_css_selector('body')
    html_str = body_el.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    return product_title, product_price


In [None]:
def perform_scrape(cleaned_items=[]):
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except:
            pass
        if title != None and price != None:
            print(link, title, price)
        product_data = {
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        }
        data_extracted.append(product_data)
    return data_extracted

In [None]:
# extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [None]:
# print(extracted_data)

In [24]:
def row_scrape_event(row, *args, **kwargs):
    link = row['url']
    scraped = 0
    try:
        scraped = row['scraped']
    except:
        pass
    # print(link)
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    product_id = row['product_id']
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except:
        pass
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [23]:
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id
0,https://www.amazon.com/HITOP-Blaster-Children-...,B07R6GZQBV
1,https://www.amazon.com/Waterproof-Stickers-Lug...,B07QR3NWGK
2,https://www.amazon.com/Aqua-Monterey-Multi-Pur...,B01CETEZ78
3,https://www.amazon.com/Bobor-Trampoline-Sprink...,B07QZFS4CY
4,https://www.amazon.com/Intex-Lounge-Swimming-L...,B00AY2KDS8


In [34]:
df.shape

(149, 2)

In [35]:
df_sub = df.copy()

In [36]:
df_sub = df_sub.apply(row_scrape_event, axis=1)


https://www.amazon.com/HITOP-Blaster-Children-Swimming-Fighting/dp/B07R6GZQBV/ref=zg_bs_toys-and-games_17?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ None None
https://www.amazon.com/HITOP-Blaster-Children-Swimming-Fighting/dp/B07R6GZQBV/ref=zg_bs_toys-and-games_17?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ None None
https://www.amazon.com/Waterproof-Stickers-Luggage-Scrapbook-Postcard/dp/B07QR3NWGK/ref=zg_bs_toys-and-games_39?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ None None
https://www.amazon.com/Aqua-Monterey-Multi-Purpose-Inflatable-Portable/dp/B01CETEZ78/ref=zg_bs_toys-and-games_1?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ Aqua 4-in-1 Monterey Hammock Inflatable Pool Float, Multi-Purpose Pool Hammock (Saddle, Lounge Chair, Hammock, Drifter) Pool Chair, Portable Water Hammock, Light Blue/White Stripe $14.26
https://www.amazon.com/Bobor-Trampoline-Sprinklers-Outdoor-Summer/dp/B07QZFS4CY/ref=zg_bs_toys-and-games_35?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFE

https://www.amazon.com/Transformers-TRA-EARTHRISE-EARTHMODE-Pack/dp/B07ZPL7DXZ/ref=zg_bs_toys-and-games_26?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ Transformers Toys Generations War for Cybertron: Earthrise Deluxe WFC-E31 Autobot Alliance 2-Pack Action Figures - Kids Ages 8 and Up, 5.5-inch $39.99
https://www.amazon.com/Bunch-Balloons-Pack-Amazon-Exclusive/dp/B07GW2QQWN/ref=zg_bs_toys-and-games_2?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ None None
https://www.amazon.com/LEGO-Classic-Green-Baseplate-Supplement/dp/B00NHQF65S/ref=zg_bs_toys-and-games_47?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ None None
https://www.amazon.com/FindUWill-Swimming-Multi-Purpose-Inflatable-Lightblue/dp/B083Q1TTBJ/ref=zg_bs_toys-and-games_9?_encoding=UTF8&psc=1&refRID=1TQPTF23RKE0GFEFZZPJ FindUWill 2-Pack Premium Swimming Pool Float Hammock, Multi-Purpose Inflatable Hammock (Saddle, Lounge Chair, Hammock, Drifter), Water Hammock Lounge (Pink and Lightblue) $40.99
https://www.amazon.com/O

https://www.amazon.com/Insignia-NS-24DF311SE21-24-inch-Smart-HD/dp/B0874Y42B7/ref=zg_bs_electronics_32?_encoding=UTF8&psc=1&refRID=0N2NJ4X7R0F0RQXF4X6F Insignia NS-24DF311SE21 24-inch Smart HD TV - Fire TV Edition $109.99
https://www.amazon.com/AmazonBasics-6-Outlet-Protector-2-Pack-2-Foot/dp/B00TP1BWMK/ref=zg_bs_electronics_23?_encoding=UTF8&psc=1&refRID=0N2NJ4X7R0F0RQXF4X6F AmazonBasics 6-Outlet, 200 Joule Surge Protector Power Strip, 2 Foot, White - Pack of 2 $12.99
https://www.amazon.com/all-new-fire-tv-cube-with-alexa-voice-remote/dp/B07KGVB6D6/ref=zg_bs_electronics_40?_encoding=UTF8&psc=1&refRID=0N2NJ4X7R0F0RQXF4X6F None None
https://www.amazon.com/Apple-Watch-GPS-38mm-Silver-Aluminium/dp/B07K37HKT8/ref=zg_bs_electronics_18?_encoding=UTF8&psc=1&refRID=0N2NJ4X7R0F0RQXF4X6F None None
https://www.amazon.com/SanDisk-MicroSDXC-Nintendo-Switch-SDSQXAO-128G-GNCZN/dp/B07KXQX3S3/ref=zg_bs_electronics_42?_encoding=UTF8&psc=1&refRID=0N2NJ4X7R0F0RQXF4X6F SanDisk 128GB MicroSDXC UHS-I Memory 

https://www.amazon.com/YunJey-sleeve-triple-stripe-T-shirt/dp/B074T31XSC/ref=zg_bs_fashion_48?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE None None
https://www.amazon.com/Womens-Sleeve-Shirts-Summer-Blouses/dp/B08661QD1N/ref=zg_bs_fashion_8?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE NSQTBA Womens Rolled Short Sleeve Tops V Neck T Shirts Summer Blouses Knot Front Tees $17.99
https://www.amazon.com/Baleaf-Womens-Workout-Control-Pockets/dp/B07LF27WBQ/ref=zg_bs_fashion_1?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE BALEAF Women's 8" /5" /2" High Waist Workout Biker Yoga Running Compression Exercise Shorts Side Pockets (Regular/Plus Size) $22.99
https://www.amazon.com/Hanes-Womens-T-Shirt-Small-Black/dp/B00KRYLG7U/ref=zg_bs_fashion_46?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE Hanes Women's Nano T-Shirt $5.96
https://www.amazon.com/MIHOLL-Womens-Summer-Casual-Sleeve/dp/B07N4P8VM9/ref=zg_bs_fashion_33?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE MIHOLL Women's Short 

https://www.amazon.com/Under-Armour-Graphic-Shorts-Graphite/dp/B072J36KT1/ref=zg_bs_fashion_39?_encoding=UTF8&psc=1&refRID=1RGYHW4TCM2K0JGNHEWE Under Armour Men's Tech Graphic Shorts $19.99


In [None]:
# df.to_csv(product_output, index=False)

In [37]:
products_df = pd.read_csv(product_output)
products_df.head()

FileNotFoundError: [Errno 2] File C:\projects\30-days-of-python\amazon-price\data\products.csv does not exist: 'C:\\projects\\30-days-of-python\\amazon-price\\data\\products.csv'

In [None]:
final_df = pd.concat([products_df, df_sub])
final_df.to_csv(product_output, index=False)

In [None]:
final_df.head()