In [3]:
pip install playwright

Collecting playwright
  Downloading playwright-1.58.0-py3-none-win_amd64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Using cached pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet<4.0.0,>=3.1.1 (from playwright)
  Downloading greenlet-3.3.1-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Downloading playwright-1.58.0-py3-none-win_amd64.whl (36.8 MB)
   ---------------------------------------- 0.0/36.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/36.8 MB 660.6 kB/s eta 0:00:56
   ---------------------------------------- 0.0/36.8 MB 660.6 kB/s eta 0:00:56
   ---------------------------------------- 0.1/36.8 MB 491.5 kB/s eta 0:01:15
   ---------------------------------------- 0.1/36.8 MB 722.1 kB/s eta 0:00:51
   ---------------------------------------- 0.2/36.8 MB 841.6 kB/s eta 0:00:44
   ---------------------------------------- 0.2/36.8 MB 901.1 kB/s eta 0:00:41
   ---------------------------------------- 0.3/36.8 MB 883.3 kB/s et

In [1]:
# from fastapi import FastAPI
from pydantic import BaseModel
from playwright.async_api import async_playwright
import random
import time
import re

In [5]:
# app = FastAPI()

In [6]:
MIN_DELAY = 2
MAX_DELAY = 9  # Reduced from 9 for better user experience

In [7]:
def human_delay():
    """Add a random delay to mimic human behavior"""
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))

In [8]:
def clean_amazon_image(url):
    """Remove Amazon image size parameters to get higher quality image"""
    if url:
        # Remove the ._AC_.*?. pattern for cleaner URLs
        return re.sub(r'\._AC_[^.]*\.', '.', url)
    return url

In [16]:
class SearchInput(BaseModel):
    upc: str

In [10]:
# @app.get("/")
def home():
    """Health check endpoint"""
    return {"status": "Amazon scraper API running", "version": "1.0"}

In [None]:
def search_product_function(data):
    upc = data.strip()

    if not upc:
        return {"error": "UPC cannot be empty", "SKU": upc}

    browser = None
    
    try:
        with async_playwright() as p:
            # Launch browser with additional options for better stability
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )
            
            # Set user agent to avoid detection
            context = browser.new_context(
                locale="en-US",
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            
            page = context.new_page()

            # Navigate to Amazon search
            print(f"Searching for UPC: {upc}")
            page.goto(
                f"https://www.amazon.com/s?k={upc}",
                wait_until="domcontentloaded",
                timeout=60000
            )

            # Wait for search results to load
            try:
                page.wait_for_selector(
                    "div[data-component-type='s-search-result']",
                    timeout=10000
                )
            except Exception as wait_error:
                print(f"Wait error: {wait_error}")
                # Take a screenshot for debugging
                # page.screenshot(path="/home/claude/debug_screenshot.png")
                browser.close()
                return {
                    "error": "No search results found or page took too long to load",
                    "SKU": upc
                }

            human_delay()

            # Get the first search result
            item = page.query_selector(
                "div[data-component-type='s-search-result']"
            )

            if not item:
                browser.close()
                return {"error": "Product not found", "SKU": upc}

            # Extract ASIN
            asin = item.get_attribute("data-asin")
            if not asin:
                asin = ""

            # Extract title
            title_el = item.query_selector("h2 span")
            title = title_el.inner_text().strip() if title_el else ""

            # Extract image
            img_el = item.query_selector("img.s-image")
            image = ""
            if img_el:
                # Try src first, then data-image-latency-src
                image = img_el.get_attribute("src")
                if not image or "data:image" in image:
                    image = img_el.get_attribute("data-image-latency-src") or ""
                image = clean_amazon_image(image)

            # Extract product link
            link_el = item.query_selector("h2 a")
            link = ""
            if link_el:
                href = link_el.get_attribute("href")
                if href:
                    # Handle both relative and absolute URLs
                    if href.startswith("http"):
                        link = href
                    else:
                        link = "https://www.amazon.com" + href

            browser.close()

            return {
                "SKU": upc,
                "ASIN": asin,
                "Title": title,
                "Image": image,
                "AmazonURL": link
            }

    except Exception as e:
        if browser:
            try:
                browser.close()
            except:
                pass
        
        print(f"Error occurred: {str(e)}")
        return {
            "error": f"An error occurred: {str(e)}",
            "SKU": upc
        }

In [3]:
search_product_function("0888072301252")

Error occurred: 'PlaywrightContextManager' object does not support the context manager protocol


{'error': "An error occurred: 'PlaywrightContextManager' object does not support the context manager protocol",
 'SKU': '0888072301252'}

In [26]:
def search_product(data: SearchInput):
    """
    Search for a product on Amazon by UPC
    Returns product details including ASIN, title, image, and URL
    """
    upc = data.upc.strip()
    
    if not upc:
        return {"error": "UPC cannot be empty", "SKU": upc}

    browser = None
    
    try:
        with async_playwright() as p:
            # Launch browser with additional options for better stability
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )
            
            # Set user agent to avoid detection
            context = browser.new_context(
                locale="en-US",
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            
            page = context.new_page()

            # Navigate to Amazon search
            print(f"Searching for UPC: {upc}")
            page.goto(
                f"https://www.amazon.com/s?k={upc}",
                wait_until="domcontentloaded",
                timeout=60000
            )

            # Wait for search results to load
            try:
                page.wait_for_selector(
                    "div[data-component-type='s-search-result']",
                    timeout=10000
                )
            except Exception as wait_error:
                print(f"Wait error: {wait_error}")
                # Take a screenshot for debugging
                page.screenshot(path="/home/claude/debug_screenshot.png")
                browser.close()
                return {
                    "error": "No search results found or page took too long to load",
                    "SKU": upc
                }

            human_delay()

            # Get the first search result
            item = page.query_selector(
                "div[data-component-type='s-search-result']"
            )

            if not item:
                browser.close()
                return {"error": "Product not found", "SKU": upc}

            # Extract ASIN
            asin = item.get_attribute("data-asin")
            if not asin:
                asin = ""

            # Extract title
            title_el = item.query_selector("h2 span")
            title = title_el.inner_text().strip() if title_el else ""

            # Extract image
            img_el = item.query_selector("img.s-image")
            image = ""
            if img_el:
                # Try src first, then data-image-latency-src
                image = img_el.get_attribute("src")
                if not image or "data:image" in image:
                    image = img_el.get_attribute("data-image-latency-src") or ""
                image = clean_amazon_image(image)

            # Extract product link
            link_el = item.query_selector("h2 a")
            link = ""
            if link_el:
                href = link_el.get_attribute("href")
                if href:
                    # Handle both relative and absolute URLs
                    if href.startswith("http"):
                        link = href
                    else:
                        link = "https://www.amazon.com" + href

            browser.close()

            return {
                "SKU": upc,
                "ASIN": asin,
                "Title": title,
                "Image": image,
                "AmazonURL": link
            }

    except Exception as e:
        if browser:
            try:
                browser.close()
            except:
                pass
        
        print(f"Error occurred: {str(e)}")
        return {
            "error": f"An error occurred: {str(e)}",
            "SKU": upc
        }

In [27]:
search_product(SearchInput(upc="0888072301252"))

Error occurred: 'PlaywrightContextManager' object does not support the context manager protocol


{'error': "An error occurred: 'PlaywrightContextManager' object does not support the context manager protocol",
 'SKU': '0888072301252'}