In [None]:
# Shell escape: runs the `playwright codegen` CLI command (Playwright's
# code generator tool) from inside the notebook.
!playwright codegen

In [None]:
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright

# Open the browser and start scraping
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()

# Go to the first page
await page.goto("https://jp.mercari.com/search?category_id=72")

all_data = []

while True:
    # Wait for the items to load on the page
    await page.wait_for_selector("li[data-testid='item-cell']")
    
    # Get the page content and parse it with BeautifulSoup
    html = await page.content()
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find all product listings
    items = soup.find_all('li', {'data-testid': 'item-cell'})
    
    for item in items:
        try:
            # Safely extract the product name, product link, image URL, and price
            product_name_div = item.find('div', role='img')
            product_name = product_name_div['aria-label'] if product_name_div else 'N/A'

            product_link_tag = item.find('a')
            product_link = "https://jp.mercari.com" + product_link_tag['href'] if product_link_tag else 'N/A'

            image_tag = item.find('img')
            image_url = image_tag['src'] if image_tag else 'N/A'

            price_span = item.find('span', class_='number__6b270ca7')
            price = price_span.text if price_span else 'N/A'

            # Add this product's data to our list
            all_data.append({
                'Product Name': product_name,
                'Product URL': product_link,
                'Image URL': image_url,
                'Price': price
            })
        except Exception as e:
            print(f"Error processing item: {e}")

    # Try to click the "Next Page" button
    try:
        await page.locator("a:has-text('次へ')").click(timeout=10000)
    except Exception as e:
        print("No more pages or error occurred:", e)
        break  # Exit the loop if there's no next page or error occurs

# Close the browser
await context.close()
await browser.close()



In [None]:
# Convert the collected data into a pandas DataFrame. The columns are listed
# explicitly so the frame keeps the same schema even when nothing was scraped
# and `all_data` is empty.
df = pd.DataFrame(
    all_data,
    columns=['Product Name', 'Product URL', 'Image URL', 'Price'],
)

# Display the first few rows of the dataframe
df.head()

In [None]:
# (number of rows, number of columns) of the scraped table
df.shape