In [1]:
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

In [2]:
options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(options=options)

In [3]:
categories = [
    "https://www.amazon.co.jp/gp/bestsellers/videogames/",
    "https://www.amazon.co.jp/gp/bestsellers/electronics/",
    "https://www.amazon.co.jp/gp/bestsellers/apparel/",
]

In [4]:
categories

['https://www.amazon.co.jp/gp/bestsellers/videogames/',
 'https://www.amazon.co.jp/gp/bestsellers/electronics/',
 'https://www.amazon.co.jp/gp/bestsellers/apparel/']

In [5]:
first_url = categories[0]

In [6]:
driver.get(first_url)

In [7]:
body_el = driver.find_element_by_css_selector("body")
html_str = body_el.get_attribute("innerHTML")

In [8]:
html_obj = HTML(html=html_str)

In [9]:
page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]

In [10]:
def scrape_product_page(url,title_lookup = "#productTitle",price_lookup = "#priceblock_ourprice"):
    driver.get(url)
    time.sleep(1.2)
    body_el = driver.find_element_by_css_selector("body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    return product_title,product_price

In [11]:
first_product_link = page_links[0]

In [12]:
for link in page_links:
    title,price = (None,None)
    try:
        title,price = scrape_product_page(link)
    except:
        pass
    if title != None and price != None:
        print(link,title,price)

In [13]:
#regex = re.compile(my_regex_pattern)

In [14]:
#my_match = regex.match(my_url)
#print(my_match)
#my_match['product_id']

In [15]:
regex_options = [
    r"https://www.amazon.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

def  extract_product_id_from_url(url):
    product_id = None
    for regex_str in regex_options:
        regex = re.compile(regex_str)
        match = regex.match(url)
        if match != None:
            try:
                product_id = match['product_id']
            except:
                pass
    return product_id

In [17]:
#page_links = [f"https://www.amazon.co.jp{x}" for x in new_links]
# page_links = [x for x in page_links if extract_product_id_from_url(x) != None]
def clean_page_links(page_links=[]):
    final_page_links = []
    for url in page_links:
            product_id = extract_product_id_from_url(url)
            if product_id != None:
                final_page_links.append({"url":url,"product_id":product_id})
    return final_page_links

cleaned_links = clean_page_links(page_links)

In [19]:
#len(page_links) == len(final_page_links)

In [36]:
def perform_scrape(cleaned_items=[]):
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title,price = (None,None)
        try:
            title,price = scrape_product_page(link)
        except:
            pass
        if title != None and price != None:
            print(link,title,price)
        product_data = {
            "url":link,
            "product_id":product_id,
            "title":title,
            "price":price
        }
        data_extracted.append(product_data)
    return data_extracted

In [37]:
cleaned_links

[{'url': 'https://www.amazon.com/dp/B07XV8VSZT/ref=zg_bs_videogames_4/357-9619310-5233550?_encoding=UTF8&psc=1&refRID=FKENPT4ZQYF4MKMR25S5',
  'product_id': 'B07XV8VSZT'}]

In [38]:
extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [39]:
print(extracted_data)

[{'url': 'https://www.amazon.com/dp/B07XV8VSZT/ref=zg_bs_videogames_4/357-9619310-5233550?_encoding=UTF8&psc=1&refRID=FKENPT4ZQYF4MKMR25S5', 'product_id': 'B07XV8VSZT', 'title': None, 'price': None}]
