JP's code

In [None]:
import httpx
from selectolax.parser import HTMLParser
import pandas as pd
from datetime import datetime
import os

# Disable debugger file validation by setting the pydevd environment flag to "1".
# NOTE(review): presumably this silences the debugger warning printed by the
# notebook kernel — confirm it is still needed.
os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"

def load_cookies_from_json(json_file):
    """Load cookies from a JSON file.

    Args:
        json_file: Path to the JSON file. ``html_get_with_cookies`` expects
            the decoded value to be a list of dicts carrying ``name``,
            ``value`` and ``domain`` keys.

    Returns:
        The decoded JSON content, unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    import json

    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # locale default (for example cp1252 on Windows) and can mis-decode or fail
    # on non-ASCII cookie values.
    with open(json_file, 'r', encoding='utf-8') as f:
        return json.load(f)

def html_get_with_cookies(url, cookies):
    """Download ``url`` with a browser-like User-Agent and the given cookies.

    Args:
        url: Address of the page to fetch.
        cookies: Iterable of dicts, each providing ``name``, ``value`` and
            ``domain`` keys.

    Returns:
        The response body as text.

    Raises:
        httpx.HTTPStatusError: Raised by ``raise_for_status()`` for an error
            response.
        httpx.RequestError: If the request itself fails.
        KeyError: If a cookie dict lacks one of the expected keys.
    """
    # The context manager closes the client (and its connection pool) even
    # when the request raises; previously the client was never closed.
    with httpx.Client() as client:
        client.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
            }
        )

        # Copy every saved cookie into the client's cookie jar.
        for cookie in cookies:
            client.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'])

        resp = client.get(url)
        resp.raise_for_status()
        if resp.status_code == 200:
            print(resp.status_code)
        else:
            # Only reachable for a status that raise_for_status() accepted
            # but that is not exactly 200.
            print(f"Failed to fetch {url}: Status code {resp.status_code}")
        return resp.text

def parse_detail_page(html):
    """Pull title, price and seller out of a product page.

    Args:
        html: Markup of the product detail page.

    Returns:
        A dict with the keys ``title``, ``price``, ``seller`` and
        ``timestamp``. A field whose element is not matched holds a
        "... not found" placeholder string instead. ``timestamp`` is the local
        time of parsing formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    dom = HTMLParser(html)

    # Title
    title_node = dom.css_first("span#productTitle")
    title = title_node.text(strip=True) if title_node else "Product title not found"

    # Price; when it is missing, dump the surrounding markup to help debugging.
    price_node = dom.css_first("span.a-price.a-text-price.a-size-medium span.a-offscreen")
    if price_node:
        price = price_node.text(strip=True)
    else:
        price = "Price not found"
        price_wrapper = dom.css_first("span.a-price.a-text-price.a-size-medium")
        print("Price not found. HTML segment:\n", price_wrapper.html if price_wrapper else "Element not found")
        print("-" * 50)

    # Seller: prefer the linked seller name, then fall back to the first span
    # inside the merchant-info block.
    seller_node = dom.css_first("div#merchant-info a.a-link-normal span")
    if not seller_node:
        seller_node = dom.css_first("div#merchant-info span")
    seller = seller_node.text(strip=True) if seller_node else "Seller not found"

    return {
        'title': title,
        'price': price,
        'seller': seller,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

def main(cookies_file=None, output_file="Amazon_abab_4_07.xlsx"):
    """Scrape title, price and seller for each product link in a workbook.

    Prompts for the path of an Excel file with a ``links`` column, downloads
    each page using the saved cookies, prints what was extracted and writes
    all successfully scraped rows to ``output_file``.

    Args:
        cookies_file: Path of the cookies JSON file. When omitted, the
            ``AMAZON_COOKIES_JSON`` environment variable is consulted and the
            previously hard-coded location is used as the last resort.
        output_file: Name of the Excel file that receives the results.

    Raises:
        KeyError: If the workbook has no ``links`` column.
    """
    # Prompt user for Excel file path
    excel_file = input("Enter the path to the Excel file containing product links: ").strip()

    # Read product links from Excel
    df = pd.read_excel(excel_file)
    link_column = 'links'
    if link_column not in df.columns:
        raise KeyError(f"Column '{link_column}' not found in the Excel file.")
    # Blank spreadsheet rows are read as NaN; they are not URLs, so skip them.
    product_links = df[link_column].dropna().tolist()

    # Resolve the cookie file: explicit argument, then environment, then the
    # legacy default so existing setups keep working.
    if cookies_file is None:
        cookies_file = os.environ.get(
            "AMAZON_COOKIES_JSON", r"C:\Users\jayapriyaa.rg.lv\BABA.json"
        )
    cookies = load_cookies_from_json(cookies_file)

    # Initialize list to store scraped product details
    scraped_products = []

    # Process each product link
    for link in product_links:
        try:
            product_html = html_get_with_cookies(link, cookies)

            product_details = parse_detail_page(product_html)
            product_details['link'] = link
            scraped_products.append(product_details)

            print(f"Scraped Product Title: {product_details['title']}")
            print(f"Scraped Product Price: {product_details['price']}")
            print(f"Scraped Seller Info: {product_details['seller']}")
            print(f"Timestamp: {product_details['timestamp']}")
            print(f"Link: {link}")
            print("-" * 50)

        except Exception as e:
            # Best effort: report the failing link and continue with the rest.
            print(f"Error scraping product at {link}: {e}")

    # Create a DataFrame from scraped products and write it out
    scraped_df = pd.DataFrame(scraped_products)
    scraped_df.to_excel(output_file, index=False)
    print(f"Scraped product details saved to {output_file}")

# Run the scraper when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Tried with BeautifulSoup

In [22]:
import requests
from bs4 import BeautifulSoup

def get_breadcrumbs(url, timeout=30):
    """Return the breadcrumb category names shown on a product page.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with the text of every ``a.a-link-normal`` link inside the
        ``wayfinding-breadcrumbs_container`` div, or ``None`` when the response
        status is not 200 or the container is missing.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Send a request to the URL
    response = requests.get(url, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with id 'wayfinding-breadcrumbs_container'
    breadcrumbs_div = soup.find('div', id='wayfinding-breadcrumbs_container')

    if not breadcrumbs_div:
        print("Breadcrumbs container not found")
        return None

    # Collect the category name of each breadcrumb link (names only, no URLs)
    breadcrumb_links = breadcrumbs_div.find_all('a', class_='a-link-normal')
    return [link.get_text(strip=True) for link in breadcrumb_links]

# Example usage
url = 'https://www.amazon.com/dp/B001VNCJNQ?th=1'  # Replace with the actual product URL
breadcrumbs = get_breadcrumbs(url)

# get_breadcrumbs returns None on failure, so print only a non-empty result.
if breadcrumbs:
    print(breadcrumbs)


['Patio, Lawn & Garden', 'Patio Furniture & Accessories', 'Patio Seating', 'Chairs']


Tried with BeautifulSoup and httpx

In [16]:
!pip install httpx

Defaulting to user installation because normal site-packages is not writeable
Collecting httpx
  Using cached httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx)
  Using cached httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Using cached httpx-0.27.0-py3-none-any.whl (75 kB)
Using cached httpcore-1.0.5-py3-none-any.whl (77 kB)
Installing collected packages: httpcore, httpx
Successfully installed httpcore-1.0.5 httpx-0.27.0




In [28]:
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def html_get(url):
    """Download ``url`` with a browser-like User-Agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or ``None`` when the request fails or
        ``raise_for_status()`` rejects the response (the error is printed).
    """
    # The context manager closes the client (and its connection pool) on every
    # exit path; previously the client was never closed.
    with httpx.Client() as client:
        client.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
            }
        )
        try:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.text
        except httpx.RequestError as e:
            # Transport-level failure (connection, timeout, ...)
            print(f"Request error for {url}: {e}")
            return None
        except httpx.HTTPStatusError as e:
            # The server answered, but with an error status
            print(f"HTTP error for {url}: {e}")
            return None

def parse_category(html):
    """Extract the breadcrumb categories from a product page.

    Looks for the ``ul`` whose class attribute is
    ``a-unordered-list a-horizontal a-size-small`` and walks its ``li``
    children. Linked entries contribute their link text (and are echoed as
    they are found); unlinked entries contribute their own text unless it is
    the '›' separator.

    Args:
        html: Page markup; a falsy value yields an empty list.

    Returns:
        The list of category names in document order (possibly empty).
    """
    if not html:
        return []

    found = []
    breadcrumb_list = BeautifulSoup(html, 'lxml').find(
        'ul', class_='a-unordered-list a-horizontal a-size-small'
    )
    entries = breadcrumb_list.find_all('li') if breadcrumb_list else []

    for entry in entries:
        wrapper = entry.find('span', class_='a-list-item')
        if not wrapper:
            continue

        anchor = wrapper.find('a', class_='a-link-normal')
        if anchor:
            label = anchor.get_text(strip=True)
            if label:
                print(label)
                found.append(label)
            continue

        # No link: keep the plain text unless it is just the separator glyph.
        label = wrapper.get_text(strip=True)
        if label and label != '›':
            found.append(label)

    print(found)
    print("-------------")
    return found

def main():
    """Scrape breadcrumb categories for every link in a workbook.

    Asks for the path of an Excel file, reads its ``links`` column, fetches
    and parses each page, and writes one row per successfully fetched link
    (``links`` plus ``Category_1`` .. ``Category_n``) to a timestamped
    ``Amazon_product_categories_*.xlsx`` file.

    Raises:
        KeyError: If the workbook has no ``links`` column.
    """
    workbook_path = input("Enter the path to the Excel file containing product links: ").strip()
    links_frame = pd.read_excel(workbook_path)

    # Adjust this if your column name is different
    link_column = 'links'
    if link_column not in links_frame.columns:
        raise KeyError(f"Column '{link_column}' not found in the Excel file.")

    rows = []
    for link in links_frame[link_column].tolist():
        try:
            page_source = html_get(link)

            # html_get reports failures by returning None; skip those links.
            if not page_source:
                continue

            crumbs = parse_category(page_source)
            print(crumbs)

            # One column per breadcrumb level, numbered from 1.
            row = {'links': link}
            row.update(
                (f'Category_{position}', crumb)
                for position, crumb in enumerate(crumbs, start=1)
            )
            rows.append(row)

        except Exception as e:
            # Report the failing link and carry on with the remaining ones.
            print(f"Error scraping product at {link}: {e}")

    result_frame = pd.DataFrame(rows)

    # Timestamped name so repeated runs do not overwrite each other.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"Amazon_product_categories_{stamp}.xlsx"

    try:
        result_frame.to_excel(output_file, index=False)
        print(f"Scraped product details saved to {output_file}")
    except PermissionError as e:
        print(f"Permission error: {e}")
        print("Please ensure the file is not open in another application and you have write permissions.")

# Run the category scraper when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Enter the path to the Excel file containing product links:  dummy amazon link.xlsx


Patio, Lawn & Garden
Patio Furniture & Accessories
Patio Seating
Chairs
['Patio, Lawn & Garden', 'Patio Furniture & Accessories', 'Patio Seating', 'Chairs']
-------------
['Patio, Lawn & Garden', 'Patio Furniture & Accessories', 'Patio Seating', 'Chairs']
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
Home & Kitchen
Furniture
Living Room Furniture
TV & Media Furniture
Television Stands & Entertainment Centers
['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'TV & Media Furniture', 'Television Stands & Entertainment Centers']
-------------
['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'TV & Media Furniture', 'Television Stands & Entertainment Centers']
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
[]
-------------
[]
Tools & Home Improvement
Kitchen & Bath Fixtures
Bathroom Fixtures
Bathtubs
Freestanding Bathtubs
['Tools & Home Improvement', 'Kitchen & Bath Fixtures', 'Bathroom Fixt

KeyboardInterrupt: 

In [6]:
!pip install undetected-chromedriver
!pip install webdriver_manager


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install webdriver_manager


Defaulting to user installation because normal site-packages is not writeable


In [18]:
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime

def get_breadcrumbs(url, driver):
    """Collect the breadcrumb category names of a product page via Selenium.

    Args:
        url: Address of the product page to open.
        driver: WebDriver instance used to load and query the page.

    Returns:
        The category names in page order. Any error while waiting for or
        reading the breadcrumb elements is printed and an empty list is
        returned instead.
    """
    print(f"Fetching breadcrumbs for URL: {url}")
    driver.get(url)

    wrapper_locator = (
        By.XPATH,
        "//div[@cel_widget_id='showing-breadcrumbs_csm_instrumentation_wrapper']",
    )

    try:
        # Up to 15 seconds for the outer breadcrumb wrapper to appear.
        wrapper = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(wrapper_locator)
        )
        print("Breadcrumbs div found")

        container = wrapper.find_element(By.ID, 'wayfinding-breadcrumbs_container')
        print("Wayfinding breadcrumbs container found")

        # Up to 10 seconds for the list inside the container.
        crumb_list = WebDriverWait(container, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'ul'))
        )
        entries = crumb_list.find_elements(By.TAG_NAME, 'li')
        print(f"Found {len(entries)} breadcrumb items")

        trail = []
        for entry in entries:
            try:
                label = entry.find_element(By.TAG_NAME, 'a').text.strip()
                if label:
                    trail.append(label)
                    print(f"Found category: {label}")
            except Exception:
                # Entry without a link (separator or plain text): use its own
                # text, ignoring the '›' separator glyph.
                label = entry.text.strip()
                if label and label != '›':
                    trail.append(label)
                    print(f"Found category: {label}")

        print(f"Breadcrumbs for URL {url}: {trail}")
        return trail
    except Exception as e:
        print(f"An error occurred for URL {url}: {e}")
        return []

def main(chrome_version=125, chunk_size=20):
    """Scrape breadcrumb categories for every link in a workbook with Chrome.

    Asks for the path of an Excel file, reads its ``links`` column, loads each
    page in an undetected-chromedriver session and writes one row per link
    (``links`` plus ``Category_1`` .. ``Category_n``) to a timestamped
    ``Amazon_product_categories_*.xlsx`` file.

    Args:
        chrome_version: Major Chrome version passed to ``uc.Chrome`` as
            ``version_main``.
        chunk_size: Number of links per progress group; it only affects the
            "Processing links ..." messages.

    Raises:
        KeyError: If the workbook has no ``links`` column.
    """
    # Prompt user for Excel file path
    excel_file = input("Enter the path to the Excel file containing product links: ").strip()

    # Read product links from Excel
    df = pd.read_excel(excel_file)

    # Verify the column name containing the product links
    link_column = 'links'  # Adjust this if your column name is different
    if link_column not in df.columns:
        raise KeyError(f"Column '{link_column}' not found in the Excel file.")

    product_links = df[link_column].tolist()

    # Initialize the undetected Chrome driver
    options = uc.ChromeOptions()
    driver = uc.Chrome(options=options, version_main=chrome_version)

    # Initialize list to store scraped product details
    scraped_products = []

    try:
        # Process product links in chunks
        for start in range(0, len(product_links), chunk_size):
            chunk_links = product_links[start:start + chunk_size]
            print(f"Processing links {start+1} to {start+len(chunk_links)}")

            for link in chunk_links:
                try:
                    categories = get_breadcrumbs(link, driver)

                    # One column per breadcrumb level, numbered from 1. A
                    # distinct index name avoids clobbering the chunk offset.
                    product_data = {'links': link}
                    for position, category in enumerate(categories, start=1):
                        product_data[f'Category_{position}'] = category

                    scraped_products.append(product_data)
                except Exception as e:
                    print(f"Error scraping product at {link}: {e}")
    finally:
        # Always shut the browser down, including when the run is interrupted
        # (e.g. KeyboardInterrupt); otherwise the Chrome process is left behind.
        driver.quit()

    # Create a DataFrame from scraped products
    scraped_df = pd.DataFrame(scraped_products)

    # Generate a unique output file name with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"Amazon_product_categories_{timestamp}.xlsx"

    # Output scraped data to Excel
    try:
        scraped_df.to_excel(output_file, index=False)
        print(f"Scraped product details saved to {output_file}")
    except PermissionError as e:
        print(f"Permission error: {e}")
        print("Please ensure the file is not open in another application and you have write permissions.")

# Run the Selenium-based scraper when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Enter the path to the Excel file containing product links:  set8_of_11k_sku.xlsx


Processing links 1 to 20
Fetching breadcrumbs for URL: https://www.amazon.com/dp/B092RNYFSC
Breadcrumbs div found
Wayfinding breadcrumbs container found
Found 7 breadcrumb items
Found category: Home & Kitchen
Found category: Furniture
Found category: Accent Furniture
Found category: Storage Cabinets
Breadcrumbs for URL https://www.amazon.com/dp/B092RNYFSC: ['Home & Kitchen', 'Furniture', 'Accent Furniture', 'Storage Cabinets']
Fetching breadcrumbs for URL: https://www.amazon.com/dp/B093K8T114
Breadcrumbs div found
Wayfinding breadcrumbs container found
Found 7 breadcrumb items
Found category: Home & Kitchen
Found category: Furniture
Found category: Dining Room Furniture
Found category: Table & Chair Sets
Breadcrumbs for URL https://www.amazon.com/dp/B093K8T114: ['Home & Kitchen', 'Furniture', 'Dining Room Furniture', 'Table & Chair Sets']
Fetching breadcrumbs for URL: https://www.amazon.com/dp/B093KD4M2G
Breadcrumbs div found
Wayfinding breadcrumbs container found
Found 7 breadcrumb it

Trying concurrency

Thread pooling