In [2]:
# Import the necessary libraries
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
import time

# Module-level accumulator: scrape_page() appends one dict per product to this
# list and scrape_all_pages() converts it to a DataFrame. Re-running this cell
# resets it to empty.
product_data = []

# Helper + function to scrape a single page
def _text_or_none(parent, tag, css_class):
    """Return the stripped text of the first ``<tag class=css_class>`` under ``parent``, or None if absent."""
    element = parent.find(tag, class_=css_class)
    return element.get_text(strip=True) if element else None


async def scrape_page(page):
    """Parse the products on the currently loaded page and append them to ``product_data``.

    Args:
        page: A Playwright page object; only ``await page.content()`` is used.

    Returns:
        None. One dict per ``div.product_item`` is appended to the module-level
        ``product_data`` list with keys ``name``, ``price``, ``tax``,
        ``image_url`` and ``tags``. A field whose element is missing is None.
    """
    # Local import keeps this block self-contained without touching the import cell.
    from urllib.parse import urljoin

    html = await page.content()
    soup = BeautifulSoup(html, "html.parser")

    for product in soup.find_all("div", class_="product_item"):
        # .get("src") instead of ["src"]: an <img> without a src attribute
        # must yield None rather than raise KeyError and abort the page.
        img = product.find("img")
        image = img.get("src") if img else None

        product_data.append({
            "name": _text_or_none(product, "dt", "item_name"),
            "price": _text_or_none(product, "span", "price01_default"),
            "tax": _text_or_none(product, "p", "normal_price"),
            # urljoin resolves relative paths against the site root and leaves
            # already-absolute URLs untouched (plain concatenation would corrupt them).
            "image_url": urljoin("https://www.national-azabu.net", image) if image else None,
            "tags": _text_or_none(product, "ul", "item_icon"),
        })

# Define the function to handle pagination
async def scrape_all_pages():
    """Scrape every page of the category listing and save the result as CSV.

    Opens the category page in Chromium, scrapes it with ``scrape_page``, then
    keeps clicking the "Next" link and scraping until the link can no longer be
    clicked within 10 seconds. Finally builds a DataFrame from the module-level
    ``product_data`` list, prints its head and writes ``products_output.csv``.

    Returns:
        None.

    Raises:
        Any exception other than a Playwright timeout on the "Next" click is
        propagated (after the browser has been closed), so real scraping bugs
        are no longer reported as "No more pages to scrape.".
    """
    # Local import keeps this block self-contained without touching the import cell.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # Visit and scrape the first page
            await page.goto("https://www.national-azabu.net/products/list?category_id=52")
            await scrape_page(page)

            # Handle pagination
            while True:
                try:
                    await page.locator("a:has-text('Next')").first.click(timeout=10000)
                except PlaywrightTimeoutError:
                    # No clickable "Next" link within the timeout: last page reached.
                    print("No more pages to scrape.")
                    break

                # Non-blocking pause for the next page to load; time.sleep()
                # would stall the event loop that Playwright itself runs on.
                await page.wait_for_timeout(3000)
                await scrape_page(page)
        finally:
            # Close the browser even when scraping fails part-way through.
            await browser.close()

    # Convert the product data into a DataFrame
    df = pd.DataFrame(product_data)
    print(df.head())  # Display the first few rows of data
    # Save to CSV
    df.to_csv("products_output.csv", index=False)

# Run the scraper. Top-level `await` is accepted here because the notebook
# kernel (IPython) supports it; a plain Python script would need
# asyncio.run(scrape_all_pages()) instead.
await scrape_all_pages()


ERROR IS Timeout 10000ms exceeded.
Call log:
waiting for locator("a:has-text('Next')").first

No more pages to scrape.
             name price          tax  \
0            CORN   298  ¥298（8%tax）   
1     YUZU CITRON   358  ¥358（8%tax）   
2        BROCCOLI   398  ¥398（8%tax）   
3    CUCUMBER 1PC   128  ¥128（8%tax）   
4  CUCUMBERS 4PCS   498  ¥498（8%tax）   

                                           image_url           tags  
0  https://www.national-azabu.net/upload/save_ima...  PERISHABLE 冷蔵  
1  https://www.national-azabu.net/upload/save_ima...  PERISHABLE 冷蔵  
2  https://www.national-azabu.net/upload/save_ima...  PERISHABLE 冷蔵  
3  https://www.national-azabu.net/upload/save_ima...  PERISHABLE 冷蔵  
4  https://www.national-azabu.net/upload/save_ima...  PERISHABLE 冷蔵  


In [3]:
# Reload the CSV written by scrape_all_pages() to check what was saved.
pd.read_csv("products_output.csv")

Unnamed: 0,name,price,tax,image_url,tags
0,CORN,298,¥298（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
1,YUZU CITRON,358,¥358（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
2,BROCCOLI,398,¥398（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
3,CUCUMBER 1PC,128,¥128（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
4,CUCUMBERS 4PCS,498,¥498（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
...,...,...,...,...,...
110,NAGAIMO,324,¥324（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
111,YAMATOIMO,632,¥632（8%tax）,https://www.national-azabu.net/upload/save_ima...,PERISHABLE 冷蔵
112,SATOIMO,590,¥590（8%tax）,https://www.national-azabu.net/upload/save_ima...,
113,BABY POTATOES,298,¥298（8%tax）,https://www.national-azabu.net/upload/save_ima...,


In [6]:
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

async with async_playwright() as playwright:
    browser = await playwright.chromium.launch(headless=False)
    context = await browser.new_context()
    page = await context.new_page()

    base_url = "https://www.national-azabu.net"
    category_url = f"{base_url}/products/list?category_id=52"
    current_page = 1
    all_products = []

    while True:
        # Visit the current page
        await page.goto(f"{category_url}&pageno={current_page}")
        await page.wait_for_selector("div.product_item")  # Wait until product items are loaded
        content = await page.content()  # Get the page content
        soup = BeautifulSoup(content, "html.parser")  # Parse the content with BeautifulSoup

        # Extracting product details
        product_items = soup.select("div.product_item")
        
        for product in product_items:
            name = product.select_one("dt.item_name").get_text(strip=True)

            # Check if the price exists to avoid 'NoneType' errors
            price_element = product.select_one("p.price01_default")
            price = price_element.get_text(strip=True) if price_element else "N/A"

            img_element = product.select_one("div.item_photo img")
            img_url = img_element["src"] if img_element else "N/A"
            img_url = "https://www.national-azabu.net" + img_url if img_url != "N/A" else "N/A"

            # Add product details to list
            all_products.append({
                "name": name,
                "price": price,
                "img_url": img_url
            })

        # Check if there's a "Next" button for pagination
        next_button = await page.query_selector('li.pagenation__item-next a')
        if not next_button:
            break  # No more pages, exit the loop

        current_page += 1  # Increment the page number

    await context.close()
    await browser.close()

    # Create a pandas dataframe from the scraped data
    df = pd.DataFrame(all_products)
    df.head()  # Display the dataframe


In [7]:
# Preview the first rows of the DataFrame built by the pageno-based scraper cell.
df.head()

Unnamed: 0,name,price,img_url
0,CORN,,https://www.national-azabu.net/upload/save_ima...
1,YUZU CITRON,,https://www.national-azabu.net/upload/save_ima...
2,BROCCOLI,,https://www.national-azabu.net/upload/save_ima...
3,CUCUMBER 1PC,,https://www.national-azabu.net/upload/save_ima...
4,CUCUMBERS 4PCS,,https://www.national-azabu.net/upload/save_ima...
