📦 Import Libraries

In [1]:
# --- Standard Libraries ---
import datetime
import os
import random
import re
import time
from urllib.parse import urljoin

# --- Web Scraping ---
import requests
from bs4 import BeautifulSoup

# --- Data Handling ---
import pandas as pd

# --- Selenium ---
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

🌐 Configuration

In [3]:
# --- User-Agent Rotation ---
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
]

# --- 🛒 Product Categories ---
product_categories = [
    {"product": "Washing Machines", "category_url": "https://www.idealo.de/preisvergleich/ProductCategory/1941.html"},
    {"product": "Refrigerators", "category_url": "https://www.idealo.de/preisvergleich/ProductCategory/2800.html"},
    {"product": "Freezers", "category_url": "https://www.idealo.de/preisvergleich/ProductCategory/2620.html"},
    {"product": "Dishwashers", "category_url": "https://www.idealo.de/preisvergleich/ProductCategory/2160.html"},
    {"product": "Loudspeakers", "category_url": "https://www.idealo.de/preisvergleich/ProductCategory/2021.html"},
    # Add more categories if needed
]

In [None]:
# --- Create driver for Selenium ---

def create_driver(headless=True):
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--lang=en-US")
    options.add_argument(f"user-agent={random.choice(user_agents)}")

    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

📥 Load Idealo Category Pages

In [5]:
def load_category_page_selenium(category_url, driver, wait_time=3):
    driver.get(category_url)
    time.sleep(wait_time)
    page_source = driver.page_source
    return BeautifulSoup(page_source, "html.parser")



🔗 Extract Product Links (with 5+ Offers)

In [6]:
def extract_product_links(soup, min_offers=5):
    products = []

    # Each product block
    blocks = soup.find_all("div", class_="sr-resultList__item_m6xdA")

    for block in blocks:
        # Extract title
        title_div = block.find("div", class_="sr-productSummary__title_f5flP")
        title = title_div.get_text(strip=True) if title_div else None

        # Extract link
        link_tag = block.find("a", class_="sr-resultItemTile__link_Q8V4n")
        link = link_tag["href"] if link_tag and "href" in link_tag.attrs else None

        # Extract number of offers
        offer_div = block.find("div", class_="sr-detailedPriceInfo__offerCount_PJByo")
        offer_text = offer_div.get_text(strip=True) if offer_div else ""
        offers = int(re.search(r"\d+", offer_text).group()) if re.search(r"\d+", offer_text) else 0

        # Filter only if valid and meets offer threshold
        if title and link and offers >= min_offers:
            products.append({
                "title": title,
                "link": link if link.startswith("http") else f"https://www.idealo.de{link}",
                "offers": offers
            })

    return products


✅ Category Check: Test Page Access and Product Detection

In [7]:
def test_category_pages(driver, min_offers=5):
    summary = []

    for entry in product_categories:
        product_name = entry["product"]
        category_url = entry["category_url"]

        print(f"\n🔍 Checking category: {product_name}")

        soup = load_category_page_selenium(category_url, driver)
        if not soup:
            print("  ❌ Failed to load page.")
            summary.append({"Category": product_name, "Products Found": "❌ Failed"})
            continue

        products = extract_product_links(soup, min_offers=min_offers)
        print(f"  ✅ Found {len(products)} products with {min_offers}+ offers.")
        summary.append({"Category": product_name, "Products Found": len(products)})

    return pd.DataFrame(summary)


In [None]:
driver = create_driver(headless=True)

df_summary = test_category_pages(driver, min_offers=5)

print("\n📊 Summary of Categories:")
display(df_summary)

driver.quit()


🔍 Checking category: Washing Machines
  ✅ Found 30 products with 5+ offers.

🔍 Checking category: Refrigerators
  ✅ Found 32 products with 5+ offers.

🔍 Checking category: Freezers
  ✅ Found 35 products with 5+ offers.

🔍 Checking category: Dishwashers
  ✅ Found 33 products with 5+ offers.

🔍 Checking category: Loudspeakers
  ✅ Found 35 products with 5+ offers.

📊 Summary of Categories:


Unnamed: 0,Category,Products Found
0,Washing Machines,30
1,Refrigerators,32
2,Freezers,35
3,Dishwashers,33
4,Loudspeakers,35


💶 Extract Retailer Offers from Product Page

In [11]:
def extract_offers_from_product_page(product_url, driver, wait_time=3):
    import datetime
    from bs4 import BeautifulSoup
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    offers = []

    try:
        driver.get(product_url)
        time.sleep(wait_time)

        # --- Load all offers by clicking the "Load More" button repeatedly ---
        while True:
            try:
                load_more = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button.productOffers-listLoadMore"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
                load_more.click()
                time.sleep(2)
            except:
                break

        # --- Parse page content ---
        soup = BeautifulSoup(driver.page_source, "html.parser")

        offer_blocks = soup.select("li.productOffers-listItem")
        for block in offer_blocks:
            # --- Extract price ---
            price_tag = block.select_one("a.productOffers-listItemOfferPrice")
            if not price_tag:
                continue
            price_text = price_tag.get_text(strip=True)
            try:
                price = float(price_text.replace(".", "").replace(",", ".").replace("€", "").strip())
            except:
                price = None

            # --- Extract retailer ---
            retailer_img = block.select_one("img.productOffers-listItemOfferShopV2LogoImage")
            retailer = retailer_img["alt"].strip() if retailer_img and "alt" in retailer_img.attrs else None

            if price is not None and retailer:
                offers.append({
                    "Retailer": retailer,
                    "Price [EUR]": price,
                    "Timepoint": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "Product URL": product_url
                })

    except Exception as e:
        print(f"❌ Error scraping {product_url}: {e}")

    return offers



🧾 Batch Scraping and CSV Export

In [12]:
import os
import pandas as pd
import datetime

def scrape_all_categories_and_save(driver, base_folder="scraped_data", min_offers=5):
    # --- Create timestamped output folder ---
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    output_dir = os.path.join(os.getcwd(), base_folder, timestamp)
    os.makedirs(output_dir, exist_ok=True)

    # --- Loop through categories ---
    for entry in product_categories:
        category_name = entry["product"].lower().replace(" ", "_")
        category_url = entry["category_url"]
        print(f"\n🛒 Scraping category: {entry['product']}")

        # Load and extract product links
        soup = load_category_page_selenium(category_url, driver)
        product_links = extract_product_links(soup, min_offers=min_offers)
        print(f"  🔗 Found {len(product_links)} products with {min_offers}+ offers.")

        all_offers = []
        for product in product_links:
            print(f"    📦 {product['title']} ({product['offers']} offers)")
            offers = extract_offers_from_product_page(product['link'], driver)
            for offer in offers:
                all_offers.append({
                    "product": product["title"],
                    "retailer": offer["Retailer"],
                    "price": offer["Price [EUR]"],
                    "timestamp": offer["Timepoint"]
                })

        # Save to CSV
        df = pd.DataFrame(all_offers)
        output_path = os.path.join(output_dir, f"{category_name}_{timestamp}.csv")
        df.to_csv(output_path, index=False)
        print(f"  💾 Saved {len(df)} offers to {output_path}")

    print(f"\n✅ All categories completed. Data saved in: {output_dir}")




In [13]:
driver = create_driver(headless=False)
scrape_all_categories_and_save(driver)
driver.quit()



🛒 Scraping category: Washing Machines
  🔗 Found 32 products with 5+ offers.
    📦 Bosch WUU28T70 (45 offers)
    📦 Siemens WU14UT70 (39 offers)
    📦 Bosch WAN28127 (39 offers)
    📦 AEG L6FBG51470 (27 offers)
    📦 Siemens WG44G2Z40 (28 offers)
    📦 Gorenje WNEI74SAPS (12 offers)
    📦 Exquisit WA6110-020A (8 offers)
    📦 Bosch WGB2560X1 (38 offers)
    📦 Bosch WGB244040 (49 offers)
    📦 AEG LTR6A60370 (12 offers)
    📦 Siemens WM14N127 (34 offers)
    📦 Bauknecht B6R 88E SILENCE DE (13 offers)
    📦 Miele WWE 460 WPS (8 offers)
    📦 Bosch WGB244A40 (46 offers)
    📦 Bauknecht BW 719 A (13 offers)
    📦 Samsung WW90T554AAE/S2 (5 offers)
    📦 Bosch WUU28T42 (7 offers)
    📦 Gorenje WPNEI74A1TS (15 offers)
    📦 Hisense WF3S8043BW3 (7 offers)
    📦 Siemens WG44B2A40 (53 offers)
    📦 PKM WA6-ES1510 (18 offers)
    📦 LG F4WR701Y (5 offers)
    📦 Miele WQ 1000 WPS (8 offers)
    📦 AEG LTR7A71370 (8 offers)
    📦 Haier HW50-BP12307-S (11 offers)
    📦 Samsung WW90T554AAW/S2 (7 offers