# Parsing URLs with Regular Expressions
The goal of this notebook is to parse urls from Amazon's Best Sellers - Electronics list and extract product ids from them. 

In [1]:
# importing important libraries
import requests
import re
import time
import datetime
import pandas as pd

from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from requests_html import HTML

In [2]:
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
if not DATA_DIR.exists():
    DATA_DIR.mkdir(exist_ok=True)

product_category_links_output = DATA_DIR / "category-products.csv"
product_output = DATA_DIR / "products.csv"

In [3]:
# this allows me to run selenium without opening a new chrome window 
options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(options=options)

In [4]:
# three links from Amazon's best sellers page
categories = [
    {"name": "electronics", "url": "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/"},
    {"name": "home-and-decor", "url": "https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/"},
    {"name": "toys-and-games", "url":"https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/"}
]

In [5]:
# different ways product page links can look
regex_options = [
    r"https://www.amazon.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

In [6]:
# from the url i want to extract the product id
# for future price tracking
def extract_product_id_from_url(url):
    product_id = None
    for regex_str in regex_options:
        regex = re.compile(regex_str)
        match = regex.match(url)
        if match != None:
            try:
                product_id = match['product_id']
            except:
                pass
    return product_id

In [7]:
def clean_page_links(page_links=[], category=None):
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id != None:
            final_page_links.append({"url": url, "product_id": product_id, "category": category})
    return final_page_links

In [8]:
def scrape_category_product_links(categories=[]):
    all_product_links = []
    for category in categories:
        time.sleep(1.5)
        url = category.get("url")
        driver.get(url)
        body_el = driver.find_element(By.CSS_SELECTOR, 'body')
        html_str = body_el.get_attribute('innerHTML')
        html_obj = HTML(html=html_str)
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        cleaned_links = clean_page_links(page_links=page_links, category=category)
        all_product_links += cleaned_links
    return all_product_links

In [9]:
def extract_categories_and_save(categories=[]):
    all_product_links = scrape_category_product_links(categories)
    category_df = pd.DataFrame(all_product_links)
    category_df.to_csv(product_category_links_output, index=False)

In [10]:
extract_categories_and_save(categories=categories)

In [11]:
# pass in a link and it scrapes the page looking for the
# product title and product proce
def scrape_product_page(url, title_lookup = "#productTitle",
price_lookup = "#corePrice_desktop > div > table > tbody > tr:nth-child(2) > td.a-span12 > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span.a-offscreen"):
    driver.get(url)
    time.sleep(1.5)
    body_el = driver.find_element(By.CSS_SELECTOR, 'body')
    html_str = body_el.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    return product_title, product_price

In [12]:
def perform_scrape(cleaned_items=[]):
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except:
            pass
        if title != None and price != None:
            print(link, title, price)
        product_data = {
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        }
        data_extracted.append(product_data)
    return data_extracted

In [13]:
def row_scrape_event(row, *args, **kwargs):
    link = row["url"]
    scraped = 0
    try:
        scraped = row["scraped"]
    except:
        pass
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    product_id = row['product_id']
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except:
        pass
    row['title'] = title
    row['price'] = title
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [14]:
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/All-new-Kindle-Paperwhi...,B08KTZ8249,"{'name': 'electronics', 'url': 'https://www.am..."
1,https://www.amazon.com/Blocking-Glasses-Comput...,B07W781XWF,"{'name': 'electronics', 'url': 'https://www.am..."
2,https://www.amazon.com/all-new-Echo-Dot/dp/B07...,B07XJ8C8F5,"{'name': 'electronics', 'url': 'https://www.am..."
3,https://www.amazon.com/Tcstei-Protector-Outlet...,B08PC872RL,"{'name': 'electronics', 'url': 'https://www.am..."
4,https://www.amazon.com/fire-tv-stick-lite-late...,B091G4YP57,"{'name': 'electronics', 'url': 'https://www.am..."


In [15]:
df.shape

(90, 3)

In [16]:
df_sub = df.copy()

In [17]:
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.com/All-new-Kindle-Paperwhite-adjustable-Ad-Supported/dp/B08KTZ8249/ref=zg_bs_electronics_24/131-6504244-7830130?pd_rd_i=B08KTZ8249&psc=1 None None
https://www.amazon.com/Blocking-Glasses-Computer-Reading-Eyestrain/dp/B07W781XWF/ref=zg_bs_electronics_17/131-6504244-7830130?pd_rd_i=B09N7LVXLH&psc=1 None None
https://www.amazon.com/all-new-Echo-Dot/dp/B07XJ8C8F5/ref=zg_bs_electronics_4/131-6504244-7830130?pd_rd_i=B07XJ8C8F5&psc=1 None None
https://www.amazon.com/Tcstei-Protector-Outlets-Extension-Essentials/dp/B08PC872RL/ref=zg_bs_electronics_25/131-6504244-7830130?pd_rd_i=B08PC872RL&psc=1 None None
https://www.amazon.com/fire-tv-stick-lite-latest-alexa-voice-remote-lite/dp/B091G4YP57/ref=zg_bs_electronics_20/131-6504244-7830130?pd_rd_i=B091G4YP57&psc=1 None None
https://www.amazon.com/Echo-Dot/dp/B07FZ8S74R/ref=zg_bs_electronics_3/131-6504244-7830130?pd_rd_i=B07FZ8S74R&psc=1 None None
https://www.amazon.com/Sabrent-4-Port-Individual-Switches-HB-UMLS/dp/B00BWF5U0M/ref=

In [18]:
df.to_csv(product_output, index=False)
products_df = pd.read_csv(product_output)

In [19]:
final_df = pd.concat([products_df, df_sub])
final_df.to_csv(product_output, index=False)

In [20]:
final_df.head()

Unnamed: 0,url,product_id,category,title,price,scraped,timestamp
0,https://www.amazon.com/All-new-Kindle-Paperwhi...,B08KTZ8249,"{'name': 'electronics', 'url': 'https://www.am...",,,,
1,https://www.amazon.com/Blocking-Glasses-Comput...,B07W781XWF,"{'name': 'electronics', 'url': 'https://www.am...",,,,
2,https://www.amazon.com/all-new-Echo-Dot/dp/B07...,B07XJ8C8F5,"{'name': 'electronics', 'url': 'https://www.am...",,,,
3,https://www.amazon.com/Tcstei-Protector-Outlet...,B08PC872RL,"{'name': 'electronics', 'url': 'https://www.am...",,,,
4,https://www.amazon.com/fire-tv-stick-lite-late...,B091G4YP57,"{'name': 'electronics', 'url': 'https://www.am...",,,,
