📦 Import Libraries

In [1]:
# --- Standard Libraries ---
import datetime
import random
import re
import time
from urllib.parse import urljoin

# --- Web Scraping ---
import requests
from bs4 import BeautifulSoup

# --- Data Handling ---
import pandas as pd

🌐 Configuration

In [2]:
# --- User-Agent Rotation ---
# Pool of desktop browser identities; the scraping functions pick one at
# random for every HTTP request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188",
]

# --- Product Categories ---
# Each record pairs a human-readable product name with the category code
# that is passed to Geizhals as the `cat` query parameter.
product_categories = [
    {"product": display_name, "category": category_code}
    for display_name, category_code in (
        ("Washing Machines", "hwaschf"),
        ("Refrigerators", "hkuehlsch"),
        ("Freezers", "hgefr"),
        ("Dishwashers", "hgeschirr60"),
        ("Loudspeakers", "hifibox"),
        # Add more categories if needed
    )
]

📥 Load Geizhals Category Pages

In [3]:
def load_category_page(category_name, timeout=15):
    """
    Load the HTML content of a Geizhals category page.

    Args:
        category_name (str): The category code from Geizhals (e.g., 'hwaschf')
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        BeautifulSoup or None: Parsed HTML object of the category page, or
        None when the request fails or the status code is not 200.
    """
    url = f"https://geizhals.at/?cat={category_name}"
    headers = {"User-Agent": random.choice(user_agents)}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes: callers
        # already treat a None result as "page could not be loaded".
        print(f"❌ Failed to load category page: {category_name} ({exc})")
        return None
    finally:
        # Throttle after every attempt, successful or not, so repeated
        # failures do not turn into a burst of back-to-back requests.
        time.sleep(random.uniform(1.0, 2.0))

    if response.status_code != 200:
        print(f"❌ Failed to load category page: {category_name} (Status {response.status_code})")
        return None

    # Pass raw bytes so BeautifulSoup performs its own encoding detection.
    return BeautifulSoup(response.content, "html.parser")


🔗 Extract Product Links (with 5+ Offers)

In [4]:
def extract_product_links(soup, min_offers=5):
    """
    Extracts product titles and links from a Geizhals category page,
    filtering to include only products with at least `min_offers`.

    Anchors without an `href`, and anchors that are not inside a product-name
    cell, are skipped. An offer count that is missing or not a plain integer
    is treated as 0.

    Args:
        soup (BeautifulSoup): Parsed HTML of the category page
        min_offers (int): Minimum number of offers a product must have to be included

    Returns:
        List[Dict]: List of products with 'title', 'link', and 'offers'
    """
    base_url = "https://geizhals.at/"
    product_data = []

    if soup is None:
        # A failed page load yields None; report "no products" instead of crashing.
        return product_data

    for link in soup.find_all("a", class_="productlist__link"):
        relative_url = link.get("href")
        if not relative_url:
            # An anchor without a target cannot be scraped later on.
            continue

        parent = link.find_parent("div", class_="cell productlist__item productlist__name")
        if not parent:
            continue

        # NOTE(review): find_next scans forward through the whole document, so
        # this presumably relies on the count cell directly following the name
        # cell of the same product - confirm against the page markup.
        offers_div = parent.find_next("div", class_="cell productlist__offerscount--standard")

        try:
            offers = int(offers_div.get_text(strip=True))
        except (AttributeError, ValueError):
            # No count cell (None) or non-numeric text.
            offers = 0

        if offers < min_offers:
            continue

        product_data.append({
            "title": link.get_text(strip=True),
            # Drop the query string, then resolve against the site root.
            # urljoin copes with './x', '/x', 'x' and absolute hrefs alike,
            # where plain concatenation would yield '//' or '/./' paths.
            "link": urljoin(base_url, relative_url.split("?")[0]),
            "offers": offers,
        })

    return product_data


✅ Category Check: Test Page Access and Product Detection

In [5]:
def test_category_pages(min_offers=5):
    """
    Smoke-test every configured category.

    For each entry in `product_categories` the category page is loaded and
    the products with at least `min_offers` offers are counted. Progress is
    printed as the check runs.

    Args:
        min_offers (int): Offer threshold forwarded to `extract_product_links`.

    Returns:
        pd.DataFrame: One row per category with the columns 'Category' and
        'Products Found' (the product count, or "❌ Failed" when the page
        could not be loaded).
    """
    rows = []

    for category in product_categories:
        label, code = category["product"], category["category"]

        print(f"\n🔍 Checking category: {label} ({code})")

        page = load_category_page(code)
        if page:
            matches = extract_product_links(page, min_offers=min_offers)
            print(f"  ✅ Found {len(matches)} products with {min_offers}+ offers.")
            outcome = len(matches)
        else:
            print("  ❌ Failed to load page.")
            outcome = "❌ Failed"

        rows.append({"Category": label, "Products Found": outcome})

    return pd.DataFrame(rows)


In [6]:
# Smoke-test every configured category; the returned summary DataFrame is
# rendered as this cell's output.
test_category_pages()



🔍 Checking category: Washing Machines (hwaschf)
  ✅ Found 29 products with 5+ offers.

🔍 Checking category: Refrigerators (hkuehlsch)
  ✅ Found 26 products with 5+ offers.

🔍 Checking category: Freezers (hgefr)
  ✅ Found 25 products with 5+ offers.

🔍 Checking category: Dishwashers (hgeschirr60)
  ✅ Found 29 products with 5+ offers.

🔍 Checking category: Loudspeakers (hifibox)
  ✅ Found 14 products with 5+ offers.


Unnamed: 0,Category,Products Found
0,Washing Machines,29
1,Refrigerators,26
2,Freezers,25
3,Dishwashers,29
4,Loudspeakers,14


💶 Extract Retailer Offers from Product Page

In [7]:
def extract_offers_from_product_page(soup, product_title):
    """
    Extracts price offers from a single product page.

    Offers without a retailer name or without a parseable price are skipped.
    All offers returned by one call carry the same timestamp.

    Args:
        soup (BeautifulSoup): Parsed HTML of the product page.
        product_title (str): Name of the product for labeling each entry.

    Returns:
        List[Dict]: List of offers with 'product', 'retailer', 'price', 'timestamp'
    """
    # Taken once per page: every offer scraped together shares one timestamp,
    # even if the loop happens to run across a minute boundary.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    offers = []

    for block in soup.find_all("div", class_="offer"):
        # Retailer name; a missing or blank caption means there is nothing
        # to attribute the price to.
        retailer_div = block.find("div", class_="merchant__logo-caption")
        retailer = retailer_div.get_text(strip=True) if retailer_div else ""
        if not retailer or retailer == "N/A":
            continue

        # Attempt to fix encoding issues (e.g., KÃ¶ck → Köck): text that was
        # UTF-8 but got decoded as Latin-1 round-trips back to the intended
        # characters; correctly decoded text fails one of the two steps and
        # is left untouched.
        try:
            retailer = retailer.encode("latin1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass

        # Price extraction
        price_span = block.find("span", class_="gh_price")
        if price_span is None:
            continue

        # German number format: '.' groups thousands, ',' marks decimals.
        price_text = (
            price_span.get_text(strip=True)
            .replace("€", "")
            .replace(".", "")
            .replace(",", ".")
        )
        try:
            price = float(price_text)
        except ValueError:
            continue

        offers.append({
            "product": product_title,
            "retailer": retailer,
            "price": price,
            "timestamp": timestamp,
        })

    return offers


🧾 Batch Scraping and CSV Export

In [8]:
import os

def scrape_all_categories(output_base="scraped_data", min_offers=5, timeout=15):
    """
    Scrapes all product categories and saves each result as a timestamped CSV
    inside a dedicated timestamped folder.

    For every entry in `product_categories` the category page is loaded, each
    product with at least `min_offers` offers is visited, and the collected
    offers are written to `<output_base>/<run timestamp>/<product>_<run timestamp>.csv`.
    A failure on one product or category is printed and the run continues
    with the next one.

    Args:
        output_base (str): Directory under which the timestamped run folder is created.
        min_offers (int): Minimum number of offers a product must have to be scraped.
        timeout (float): Seconds to wait for each product page before giving up.

    Returns:
        None
    """
    # Create timestamped folder
    run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    output_folder = os.path.join(output_base, run_timestamp)
    os.makedirs(output_folder, exist_ok=True)

    for entry in product_categories:
        product_name = entry["product"]
        category_code = entry["category"]
        print(f"\n🔍 Scraping category: {product_name} ({category_code})")

        try:
            # Load category page and extract product links
            soup = load_category_page(category_code)
            if not soup:
                print("  ❌ Failed to load category page.")
                continue

            products = extract_product_links(soup, min_offers=min_offers)
            print(f"  ✅ Found {len(products)} products with {min_offers}+ offers.")

            all_offers = []

            for position, product in enumerate(products, start=1):
                print(f"    [{position}/{len(products)}] {product['title']}")

                try:
                    headers = {"User-Agent": random.choice(user_agents)}
                    # A timeout keeps one stalled connection from blocking the whole run.
                    response = requests.get(product["link"], headers=headers, timeout=timeout)

                    if response.status_code == 200:
                        product_soup = BeautifulSoup(response.content, "html.parser")
                        all_offers.extend(
                            extract_offers_from_product_page(product_soup, product["title"])
                        )
                    else:
                        print(f"    ❌ Failed to load product page ({response.status_code})")

                except Exception as e:
                    # Deliberately best-effort: one broken product page must
                    # not abort the rest of the category.
                    print(f"    ❌ Error scraping product: {e}")

                finally:
                    # Throttle after every attempt, including failed ones, so
                    # a run of errors does not become a burst of requests.
                    time.sleep(random.uniform(1.0, 2.0))

            # Save category data to CSV
            df = pd.DataFrame(all_offers)
            if not df.empty:
                df = df.drop_duplicates(subset=["product", "retailer", "price", "timestamp"])
                timestamped_filename = f"{product_name.replace(' ', '_').lower()}_{run_timestamp}.csv"
                filepath = os.path.join(output_folder, timestamped_filename)
                # utf-8-sig writes a BOM at the start of the file.
                df.to_csv(filepath, index=False, encoding="utf-8-sig")
                print(f"  💾 Saved {len(df)} offers to {filepath}")
            else:
                print("  ⚠️ No offers found for this category.")

        except Exception as e:
            # Best-effort at category level as well: log and move on.
            print(f"❌ Error scraping category {product_name}: {e}")


In [9]:
# Run the full scrape with the default output_base; one CSV per category is
# written under scraped_data/<run timestamp>/.
scrape_all_categories()


🔍 Scraping category: Washing Machines (hwaschf)
  ✅ Found 29 products with 5+ offers.
    [1/29] Miele WCA032 WCS Active Frontlader
    [2/29] Bosch Serie 6 WUU28T70 Frontlader
    [3/29] Elektra Bregenz WAF 71429 Frontlader
    [4/29] Bosch Serie 8 WGB244040 Frontlader
    [5/29] Gorenje WNHPI64SAPS/AT Frontlader
    [6/29] Gorenje WNHEI74SAPS/AT Frontlader
    [7/29] Siemens iQ700 WG44B2A40 Frontlader
    [8/29] Miele WWA120 WCS 8kg Active Frontlader lotosweiß
    [9/29] AEG Electrolux L6FBG51470 Frontlader
    [10/29] Siemens iQ500 WU14UT70 Frontlader
    [11/29] Bosch Serie 4 WAN28127 Frontlader
    [12/29] Bosch Serie 8 WGB244A40 Frontlader
    [13/29] Siemens iQ300 WM14N127 Frontlader
    [14/29] Gorenje WNHA74SAPS/AT Frontlader
    [15/29] AEG Electrolux LTR6A60370 Toplader
    [16/29] Beko WTV 7717 PT Frontlader
    [17/29] AEG Electrolux L6FBA51480 Frontlader
    [18/29] Siemens iQ700 WG44B2040 Frontlader
    [19/29] Bosch Serie 6 WGG256Z40 Frontlader
    [20/29] Siemens iQ70