In [2]:
import requests
import time
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [3]:
# Run Firefox without a visible window.
options = Options()
options.add_argument('--headless')

# The geckodriver binary is looked up relative to the current working directory.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object -- confirm the installed Selenium version before upgrading.
driver = webdriver.Firefox(options=options, executable_path="./geckodriver")

In [4]:
# Amazon India best-seller listing pages that serve as crawl entry points.
categories = [
    "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0",
    "https://www.amazon.in/gp/bestsellers/computers/ref=zg_bs_nav_0",
    "https://www.amazon.in/gp/bestsellers/apparel/ref=zg_bs_nav_0",
]

In [5]:
# Echo the list to confirm the configured entry points.
categories

['https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0',
 'https://www.amazon.in/gp/bestsellers/computers/ref=zg_bs_nav_0',
 'https://www.amazon.in/gp/bestsellers/apparel/ref=zg_bs_nav_0']

In [6]:
# Load the first best-seller listing in the browser and capture the markup
# of its <body> element.
first_url = categories[0]
driver.get(first_url)
# Use the generic locator API: the `find_element_by_*` helpers were removed in
# Selenium 4.3, whereas `find_element(by, value)` exists in both 3.x and 4.x.
# "css selector" is the string value of `By.CSS_SELECTOR`.
body = driver.find_element("css selector", "body")
html_str = body.get_attribute("innerHTML")

In [7]:
# Wrap the captured markup in a requests_html document so that its links and
# CSS selectors can be queried from the string.
html_obj = HTML(html=html_str)

In [8]:
# Keep only site-relative links that are not review pages, then preview a few.
new_links = [
    href
    for href in html_obj.links
    if href.startswith("/") and "/product-reviews" not in href
]
new_links[:5]

['/Samsung-Galaxy-Prime-Ocean-Storage/dp/B085J3GN6M/ref=zg_bs_electronics_12/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 '/Samsung-Galaxy-Storage-Additional-Exchange/dp/B086KCCMCP/ref=zg_bs_electronics_45/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 '/gp/prime/ref=nav_prime_ajax_err/262-3070226-5117809',
 '/Oppo-Fantasy-Storage-Additional-Exchange/dp/B08444SXZ6/ref=zg_bs_electronics_40/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 '/Samsung-Galaxy-Storage-Additional-Exchange/dp/B086KCDGTQ/ref=zg_bs_electronics_26/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN']

In [9]:
# Number of candidate links that survived the filters.
len(new_links)

100

In [10]:
def scrape_product_page(url, title_lookup="#productTitle", price_lookup="#priceblock_ourprice"):
    """Open a product page in the shared browser and extract its title and price.

    Args:
        url: Absolute URL of the product page to load.
        title_lookup: CSS selector of the element that holds the product title.
        price_lookup: CSS selector of the element that holds the price.

    Returns:
        A ``(title, price)`` tuple of strings. An entry is ``None`` when its
        selector matches nothing on the page, so a page without a price block
        no longer fails with ``AttributeError`` on ``None.text``.

    Raises:
        Whatever the module-level ``driver`` raises while loading or querying
        the page.
    """
    driver.get(url)
    # Fixed pause after navigation -- presumably to let the page finish
    # rendering and to throttle requests; TODO confirm the intended purpose.
    time.sleep(1.2)
    # Generic locator call: works on Selenium 3.x and 4.x, unlike the
    # `find_element_by_*` helpers that were removed in Selenium 4.3.
    body_el = driver.find_element("css selector", "body")
    page = HTML(html=body_el.get_attribute("innerHTML"))

    def _text_or_none(selector):
        # `find(..., first=True)` yields None when the selector matches nothing.
        element = page.find(selector, first=True)
        return element.text if element is not None else None

    return _text_or_none(title_lookup), _text_or_none(price_lookup)

In [11]:
# 'https://amazon.in/OnePlus-Buds-Z-White/dp/B07XY541GH/ref=zg_bs_electronics_10/259-0783252-4776544?_encoding=UTF8&psc=1&refRID=ZKDVTAN6T53MJ4QJRMX2'

# Product-page URL pattern:
# <base_url>/<slug>/dp/<product-id>/...


In [12]:
# Captures the slug and the product id from a "<slug>/dp/<product-id>/" URL.
# The dot in the host name is escaped so that it matches a literal "." instead
# of any character.
my_regex_pattern = r"https://amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"
# Sample product link used to try the pattern out.
my_url = "https://amazon.in/OnePlus-Buds-Z-White/dp/B07XY541GH/ref=zg_bs_electronics_10/259-0783252-4776544?_encoding=UTF8&psc=1&refRID=ZKDVTAN6T53MJ4QJRMX2"


In [13]:
# NOTE(review): imports conventionally live in the notebook's first cell.
import re
# Compile the pattern and try it on the sample URL; `match` only succeeds when
# the pattern matches at the very start of the string.
regex = re.compile(my_regex_pattern)
match = regex.match(my_url)
# Positional view of the captured groups: (slug, product_id).
match.groups()

('OnePlus-Buds-Z-White', 'B07XY541GH')

In [14]:
# Same captures, keyed by group name.
match.groupdict()

{'slug': 'OnePlus-Buds-Z-White', 'product_id': 'B07XY541GH'}

In [15]:
# URL shapes that carry a product id. Each pattern is matched from the start of
# the URL and exposes the id as the named group `product_id`. The dot in the
# host name is escaped so that it matches a literal "." instead of any character.
regex_options = [
    r"https://amazon\.in/gp/product/(?P<product_id>[\w-]+)/",
    r"https://amazon\.in/dp/(?P<product_id>[\w-]+)/",
    r"https://amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"
]

In [16]:
def extract_productid_from_url(url, patterns=None):
    """Return the product id embedded in ``url``, or ``None`` if there is none.

    Args:
        url: URL string to inspect.
        patterns: Optional iterable of regex strings, each matched against the
            start of ``url``. Defaults to the module-level ``regex_options``.

    Returns:
        The ``product_id`` group of the last pattern that both matches and
        defines that group; ``None`` when no pattern qualifies.
    """
    if patterns is None:
        patterns = regex_options
    product_id = None
    for regex_str in patterns:
        # `re.match` caches compiled patterns, so no explicit compile is needed.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # A pattern without a `product_id` group is skipped explicitly rather
        # than by swallowing every exception with a bare `except`.
        if "product_id" in match.re.groupindex:
            product_id = match["product_id"]
    return product_id

In [17]:
# Turn the relative links into absolute URLs and keep only those from which a
# product id can be extracted; then preview the first few.
page_links = [
    absolute_url
    for absolute_url in (f"https://amazon.in{path}" for path in new_links)
    if extract_productid_from_url(absolute_url) is not None
]
page_links[:5]

['https://amazon.in/Samsung-Galaxy-Prime-Ocean-Storage/dp/B085J3GN6M/ref=zg_bs_electronics_12/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 'https://amazon.in/Samsung-Galaxy-Storage-Additional-Exchange/dp/B086KCCMCP/ref=zg_bs_electronics_45/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 'https://amazon.in/Oppo-Fantasy-Storage-Additional-Exchange/dp/B08444SXZ6/ref=zg_bs_electronics_40/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 'https://amazon.in/Samsung-Galaxy-Storage-Additional-Exchange/dp/B086KCDGTQ/ref=zg_bs_electronics_26/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN',
 'https://amazon.in/Tangent-Lite-Magnetic-Bluetooth-Headphones/dp/B085W8CFLH/ref=zg_bs_electronics_28/262-3070226-5117809?_encoding=UTF8&psc=1&refRID=BXV8MV216D57NBTRF4PN']

In [18]:
def perform_scrape(page_links):
    """Scrape every link and print ``title-price`` for each page that yields both.

    Args:
        page_links: Iterable of product page URLs passed to
            ``scrape_product_page`` one at a time.

    Returns:
        None. Results are written to stdout; pages that fail to scrape or
        lack a title or price are skipped silently.
    """
    for link in page_links:
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best-effort crawl: one bad page must not abort the run. Catching
            # `Exception` instead of using a bare `except` keeps
            # KeyboardInterrupt working, so the kernel can still be interrupted.
            continue
        if title is not None and price is not None:
            print(title, price, sep="-")
            

In [19]:
# Scrape every filtered product link; one "title-price" line is printed per
# page for which both values were found.
perform_scrape(page_links)

Samsung Galaxy M01 (Black, 3GB RAM, 32GB Storage) with No Cost EMI/Additional Exchange Offers-₹ 7,499.00
Oppo A31 (Fantasy White, 6GB RAM, 128GB Storage) with No Cost EMI/Additional Exchange Offers-₹ 11,990.00
Samsung Galaxy M01 (Blue, 3GB RAM, 32GB Storage) with No Cost EMI/Additional Exchange Offers-₹ 7,499.00
pTron Tangent Lite Bluetooth 5.0 Wireless Headphones with Hi-Fi Stereo Sound, 6Hrs Playtime, Lightweight Ergonomic Neckband, Sweat-Resistant Magnetic Earbuds, Voice Assistant & Mic - (Black)-₹ 599.00
boAt Bassheads 100 in Ear Wired Earphones with Mic(Black)-₹ 375.00
Redmi 9A (Sea Blue, 3GB Ram, 32GB Storage)-₹ 7,499.00
boAt Rockerz 450 Bluetooth On-Ear Headphone with Mic(Luscious Black)-₹ 1,399.00
Tygot Adjustable Aluminium Alloy Tripod Stand Holder for Mobile Phones & Camera, 360 mm -1050 mm, 1/4 inch Screw + Mobile Holder Bracket-₹ 399.00
OnePlus Bullets Wireless Z Bass Edition (Reverb Red)-₹ 1,999.00
ELV Desktop Cell Phone Stand Tablet Stand, Aluminum Stand Holder for Mobile

In [21]:
def perform_scrape(page_links, scrape_fn=None):
    """Scrape every link and collect one record per successfully scraped page.

    Args:
        page_links: Iterable of product page URLs.
        scrape_fn: Optional callable ``url -> (title, price)``. Defaults to the
            module-level ``scrape_product_page``; pass another callable to use
            different selectors or a stub.

    Returns:
        A list of dicts with the keys ``"link"``, ``"product"`` and
        ``"price"``, in input order. Links whose scrape raises, or that lack a
        title or price, contribute no record.
    """
    scrape = scrape_product_page if scrape_fn is None else scrape_fn
    extracted_data = []
    for link in page_links:
        try:
            title, price = scrape(link)
        except Exception:
            # Best-effort crawl: skip the failing page. `Exception` (not a bare
            # `except`) leaves KeyboardInterrupt free to stop the loop.
            continue
        if title is None or price is None:
            continue
        # Append only inside the success path: appending unconditionally would
        # raise on a first-link failure (no record built yet) and would repeat
        # the previous record for every later failure.
        extracted_data.append({
            "link": link,
            "product": title,
            "price": price,
        })
    return extracted_data