# Parsing URLs with Regular Expressions
The goal of this notebook is to parse urls from Amazon's Best Sellers - Electronics list and extract product ids from them. 

In [35]:
# importing important libraries
import requests
from requests_html import HTML
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import re

In [36]:
# Run Chrome in headless mode so Selenium does not open a visible browser window.
options = Options()
options.add_argument("--headless")

# Single shared WebDriver instance; later cells and functions use this
# module-level `driver` directly.
# NOTE(review): no `driver.quit()` appears in the visible cells — confirm the
# browser process is shut down somewhere.
driver = webdriver.Chrome(options=options)

In [37]:
# Three category pages from Amazon's Best Sellers section
# (Electronics, Home & Kitchen, Toys & Games).
# Only the first entry is used by the cells visible in this notebook.
categories = [
    "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/",
    "https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/",
    "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/"
]

In [38]:
# Take the Electronics link (first entry of `categories`) to use as the
# test page while developing the functions below, and load it in the
# headless browser.
first_url = categories[0]
driver.get(first_url)

In [39]:
# Grab the <body> element of the currently loaded page and read its
# inner HTML as a string.
# NOTE(review): unlike scrape_product_page, there is no wait between the
# page load and this read — confirm the content is fully rendered by now.
body_el = driver.find_element(By.CSS_SELECTOR, 'body')
html_str = body_el.get_attribute('innerHTML')

In [40]:
# Wrap the raw HTML string in a requests_html HTML object; the next cell
# reads its `.links` attribute to collect the links on the page.
html_obj = HTML(html=html_str)

In [41]:
# Create a list of links from the best sellers page, keeping only
# site-relative paths (those that start with "/").
new_links = [x for x in html_obj.links if x.startswith("/")]
# Drop product-review links;
# we are focusing on actual product pages.
new_links = [x for x in new_links if "product-reviews/" not in x]
# Prefix each relative path with the Amazon host to form a full URL.
page_links = [f"https://www.amazon.com{x}" for x in new_links]

In [42]:
# pass in a link and it scrapes the page looking for the
# product title and product price
def scrape_product_page(url, title_lookup = "#productTitle",
                        price_lookup = "#corePrice_desktop > div > table > tbody > tr:nth-child(2) > td.a-span12 > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span.a-offscreen"):
    """Load a product page in the shared driver and return its title and price text.

    Args:
        url: Address of the product page to open.
        title_lookup: CSS selector used to locate the product title element.
        price_lookup: CSS selector used to locate the product price element.

    Returns:
        A ``(product_title, product_price)`` tuple holding the ``.text`` of the
        first element matched by each selector.

    Note:
        No missing-element handling is done here: if a lookup yields an object
        without ``.text`` (presumably ``None`` when nothing matches — confirm
        against requests_html), the attribute access raises.
    """
    driver.get(url)
    # Fixed pause after navigation before the DOM is read.
    time.sleep(1.2)
    page_body = driver.find_element(By.CSS_SELECTOR, 'body')
    parsed_page = HTML(html=page_body.get_attribute('innerHTML'))
    # Look up the title first, then the price, exactly in that order.
    selectors = (title_lookup, price_lookup)
    product_title, product_price = (
        parsed_page.find(selector, first=True).text for selector in selectors
    )
    return product_title, product_price

In [43]:
# <base-url>/<slug>/dp/<product_id>/
# this is an example of a product url
# regular expressions help us more accurately define product
# page links
# The dots in the host are escaped so that "." matches only a literal dot
# rather than any character. The pattern captures two named groups:
# "slug" (the descriptive path segment) and "product_id" (the segment after /dp/).
my_regex_pattern = r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"
my_url = 'https://www.amazon.com/Charger-Protector-QINLIANF-Extender-Charging/dp/B08R6S1M1K/'

In [44]:
# creates a compiled regular expression object from the pattern string
regex = re.compile(my_regex_pattern)

In [45]:
# regex.match returns a match object when the pattern matches at the start
# of the URL, otherwise None; named groups are read by indexing the match
my_match = regex.match(my_url)
my_match['product_id']

'B08R6S1M1K'

In [46]:
my_match['slug']

'Charger-Protector-QINLIANF-Extender-Charging'

In [47]:
# Different shapes a product page link can take. Every pattern captures the
# product id in the named group "product_id". The dots in the host are escaped
# so that "." matches only a literal dot rather than any character.
regex_options = [
    r"https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

In [48]:
# from the url i want to extract the product id
# for future price tracking
def extract_product_id_from_url(url, patterns=None):
    """Return the product id embedded in a product URL, or None.

    Each pattern is tried in order against the start of ``url``; the first
    pattern that matches and captured a ``product_id`` named group wins.

    Args:
        url: URL string to inspect. Falsy values (``None``, ``""``) yield None.
        patterns: Optional iterable of regex strings, each expected to define
            a ``product_id`` named group. Defaults to the notebook-level
            ``regex_options`` list.

    Returns:
        The captured product id string, or None when no pattern matches.
    """
    if not url:
        return None
    if patterns is None:
        patterns = regex_options
    for regex_str in patterns:
        # Match against the pattern currently being iterated. (The previous
        # version compiled `my_regex_pattern` on every pass, so only the
        # slug form was ever recognised.)
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict() tolerates patterns that lack a "product_id" group,
        # which replaces the old bare try/except around match['product_id'].
        product_id = match.groupdict().get("product_id")
        if product_id is not None:
            return product_id
    return None

In [51]:

# page_links = [x for x in page_links if extract_product_id_from_url(x) != None]
def clean_page_links(page_links=None):
    """Keep only the URLs from which a product id can be extracted.

    Args:
        page_links: Iterable of URL strings. ``None`` (the default) is
            treated as an empty collection.

    Returns:
        A list of ``{"url": ..., "product_id": ...}`` dicts in input order,
        one for every URL where ``extract_product_id_from_url`` returned a
        value other than None.
    """
    # A None sentinel replaces the former mutable `[]` default argument.
    if page_links is None:
        page_links = ()
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({"url": url, "product_id": product_id})
    return final_page_links
cleaned_links = clean_page_links(page_links)

In [52]:
len(page_links) == len(cleaned_links)

False

In [54]:
len(page_links)

155

In [55]:
len(cleaned_links)

30

In [56]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for each cleaned link, best-effort.

    Args:
        cleaned_items: Iterable of dicts with ``"url"`` and ``"product_id"``
            keys (the shape produced by ``clean_page_links``). ``None``
            (the default) is treated as an empty collection.

    Returns:
        A list with one dict per input item, holding ``"url"``,
        ``"product_id"``, ``"title"`` and ``"price"``. When scraping an item
        fails, its title and price are both None.

    Side effects:
        Prints the link, title and price of every item that yielded both.
    """
    # A None sentinel replaces the former mutable `[]` default argument.
    if cleaned_items is None:
        cleaned_items = ()
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best-effort: a page that fails to scrape is recorded with
            # title/price of None. Catching Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit stop a long-running scrape.
            pass
        if title is not None and price is not None:
            print(link, title, price)
        product_data = {
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        }
        data_extracted.append(product_data)
    return data_extracted

In [57]:
extracted_data = perform_scrape(cleaned_items=cleaned_links)

https://www.amazon.com/Fujifilm-INSTAX-Instant-Twin-Pack/dp/B00EB4ADQW/ref=zg_bs_electronics_14/142-9870449-9536740?pd_rd_i=B00EB4ADQW&psc=1 Fujifilm Instax Mini Instant Film Twin Pack (White) $12.99
https://www.amazon.com/Ferilinso-Designed-Protector-Tempered-Definition/dp/B09BNBMT8Y/ref=zg_bs_electronics_19/142-9870449-9536740?pd_rd_i=B09BNBMT8Y&psc=1 Ferilinso Designed for iPhone 13 Pro Max Camera Lens Protector, Designed for iPhone 13 Pro Camera Lens Protector, 3 Pack 9H Tempered Glass, Night Shooting Mode, Case Friendly, High Definition $6.70
https://www.amazon.com/Tcstei-Protector-Outlets-Extension-Essentials/dp/B08PC872RL/ref=zg_bs_electronics_26/142-9870449-9536740?pd_rd_i=B08PC872RL&psc=1 Power Strip , Tcstei Surge Protector with 12 Outlets and 4 USB Ports, 6 Feet Extension Cord (1875W/15A) for for Home, Office, Dorm Essentials, 2700 Joules, ETL Listed, (Black) $17.80
https://www.amazon.com/Certified-Refurbished-Fire-TV-Stick-Light/dp/B085PL3BJ1/ref=zg_bs_electronics_22/142-98

In [None]:
&26 