In [None]:
# Shell escape: runs the Playwright CLI `codegen` command from the notebook.
# NOTE(review): presumably used to record interactions and discover selectors for the
# scrapers below; none of the later cells read anything from this cell — confirm it is
# still needed, since a `!` command holds the cell until the process exits.
!playwright codegen

In [None]:
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd

# Placeholder for all data
all_data = []

# Start Playwright and browser
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()
await page.goto("https://jp.mercari.com/search?category_id=79")

while True:
    # Scrape the current page
    html = await page.content()
    soup = BeautifulSoup(html, 'html.parser')
    
    # Locate all item listings by the `data-testid` attribute
    items = soup.find_all('li', {'data-testid': 'item-cell'})
    
    for item in items:
        # Extract relevant data fields
        url_tag = item.find('a', {'data-testid': 'thumbnail-link'})
        price_tag = item.find('span', {'class': 'number__6b270ca7'})
        img_tag = item.find('img')
        name_tag = item.find('div', {'class': 'imageContainer__f8ddf3a2'})
        
        data = {
            'url': url_tag['href'] if url_tag else None,
            'price': price_tag.text if price_tag else None,
            'thumbnail': img_tag['src'] if img_tag else None,
            'name': name_tag['aria-label'] if name_tag else None
        }
        
        # Add the extracted data to the list
        all_data.append(data)
    
    # Try to click the "Next" button
    try:
        await page.locator("a:has-text('次へ')").click(timeout=10000)
    except:
        # If the button is not found, end the loop
        break

# Close browser
await context.close()
await browser.close()

# Convert collected data to a pandas DataFrame and display
df = pd.DataFrame(all_data)
print(df.head())  # This will display the first few rows of the DataFrame


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Shared accumulator: scrape_page() appends one dict per product listing here
all_data = list()

async def scrape_page(page):
    """Scrape all product listings from the page currently loaded in ``page``.

    Waits up to 10 seconds for an ``li[data-testid='item-cell']`` element, parses
    the page HTML with BeautifulSoup, and appends one dict per listing to the
    module-level ``all_data`` list with the keys ``'Product URL'``,
    ``'Product Name'``, ``'Price'`` and ``'Image URL'``. Any element or attribute
    that is missing from a listing is recorded as ``None``.

    Args:
        page: Playwright page object positioned on a search-results page.

    Returns:
        None. Results are accumulated in ``all_data``.

    Raises:
        Whatever ``page.wait_for_selector`` raises when no listing cell shows up
        within the timeout.
    """
    # Function-scope import keeps the cell self-contained
    from urllib.parse import urljoin

    # Wait for the page content to load
    await page.wait_for_selector("li[data-testid='item-cell']", timeout=10000)

    # Parse a snapshot of the rendered HTML
    soup = BeautifulSoup(await page.content(), 'html.parser')

    for item in soup.find_all("li", {"data-testid": "item-cell"}):
        # Look each element up once; find() returns None when it is absent
        link_tag = item.find('a')
        name_tag = item.find('div', {'role': 'img'})
        price_tag = item.find('span', {'class': 'number__6b270ca7'})
        img_tag = item.find('img')

        # .get() returns None for a missing attribute, where ['attr'] would raise
        # KeyError (which the previous `except AttributeError` never caught)
        href = link_tag.get('href') if link_tag else None

        all_data.append({
            # urljoin resolves relative hrefs against the site root and leaves
            # already-absolute hrefs untouched; empty/None hrefs pass through as-is
            'Product URL': urljoin("https://jp.mercari.com", href) if href else href,
            'Product Name': name_tag.get('aria-label') if name_tag else None,
            'Price': price_tag.text if price_tag else None,
            'Image URL': img_tag.get('src') if img_tag else None
        })

# Start Playwright and run the scraping process
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()

# Go to the initial page
await page.goto("https://jp.mercari.com/search?category_id=79", timeout=10000)

# Scrape data from all pages
while True:
    await scrape_page(page)
    
    # Try clicking the "Next Page" button
    try:
        await page.get_by_role("link", name="次へ").click(timeout=10000)
    except:
        # If the button is not found or there's an error, break the loop
        break

await context.close()
await browser.close()



In [None]:
# Turn the accumulated listing records into a DataFrame (one row per dict)
df = pd.DataFrame(data=all_data)

# Preview the top rows as a quick sanity check
df.head(n=5)


In [None]:
# Write the DataFrame to mercari.csv without the index column
df.to_csv(path_or_buf="mercari.csv", index=False)

In [None]:
import pandas as pd
from playwright.async_api import async_playwright, expect

async def run(playwright):
    """Scrape SUUMO rental listings page by page and return them as a DataFrame.

    Launches a headed Chromium browser, opens the fixed SUUMO search URL, and for
    each ``.property`` element on every results page records its title text, the
    title link's ``href``, and the price text. Pagination follows the "次へ" link
    until it is no longer present.

    Args:
        playwright: A started Playwright instance exposing ``chromium.launch``.

    Returns:
        pandas.DataFrame with the columns ``title``, ``link`` and ``price``.
        A listing without a title element gets ``"N/A"`` as title and ``None`` as
        link; a listing without a price element gets ``"N/A"`` as price.

    Raises:
        Any error from launching, navigating, or waiting for ``.property``
        elements propagates to the caller; the browser is closed first.
    """
    browser = await playwright.chromium.launch(headless=False)
    try:
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://suumo.jp/jj/chintai/ichiran/FR301FC005/?sc=13101&ar=030&bs=040&sd=1&ta=13&ts=1")

        all_data = []

        while True:
            await page.wait_for_selector(".property")  # Wait for property elements to load

            # Get all property elements on the current page
            cards = await page.query_selector_all(".property")

            for card in cards:
                title = await card.query_selector("h2.property_inner-title a")
                price = await card.query_selector(".detailbox-property-point")

                # Guard both lookups: query_selector returns None when nothing matches
                all_data.append({
                    "title": await title.inner_text() if title else "N/A",
                    "link": await title.get_attribute("href") if title else None,
                    "price": await price.inner_text() if price else "N/A",
                    # Add more fields here
                })

            # Try to click the "Next" button
            try:
                next_button = await page.query_selector("p.pagination-parts a:has-text('次へ')")
                if next_button is None:
                    break  # Exit if no next button found
                await next_button.click()
            except Exception as exc:
                # Best effort: keep what was scraped so far, but say why we stopped.
                # `Exception` (not a bare except) lets kernel interrupts through.
                print(f"Stopping pagination: {exc!r}")
                break

        # Hand the data back to the caller; previously it was built and discarded
        return pd.DataFrame(all_data)
    finally:
        # Closing the browser also closes the context and page created from it
        await browser.close()

async def main():
    async with async_playwright() as playwright:
        await run(playwright)

# Run the main function
await main()


In [None]:
import pandas as pd

# Load the SUUMO properties CSV from the cleaning-worksheets folder
df = pd.read_csv(filepath_or_buffer="cleaning-worksheets/suumo-properties.csv")
# Preview the top rows
df.head(n=5)