In [None]:
import requests

url = "https://www.prospektangebote.de/anzeigen/angebote/lidl-prospekt-2384175#page=1"
headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

requests.get(url, headers=headers)
for line in r.text.split('\n'):
    print(line)

In [None]:
import time
import bs4 as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


driver = webdriver.Firefox()
driver.get(url)

wait = WebDriverWait(driver, 10)

html = driver.execute_script('return document.documentElement.outerHTML')
soup = bs.BeautifulSoup(html, 'html.parser')
print(soup.prettify())

with open("output1.html", "w+", encoding="utf-8") as file:
    file.write(str(soup))

In [None]:
!pip install selenium beautifulsoup4 requests

In [4]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Set the output directory for downloaded images
output_dir = "downloaded_images"
os.makedirs(output_dir, exist_ok=True)

# Base URL for supermarkets
base_url = "https://www.prospektangebote.de/neuesten-prospekte"

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run headless for efficiency
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(options=chrome_options)

# Define minimum image size (width, height) to keep
MIN_WIDTH = 800
MIN_HEIGHT = 1200
FORMATS = (".webp")

def sanitize_filename(url):
    """Sanitize URL to create a unique filename."""
    return url.replace("https://", "").replace("http://", "").replace("/", "_").replace("?", "_").replace("=", "_")

def get_image_urls(page_html, formats:tuple=(".jpg", ".jpeg", ".png", ".gif", ".webp")):
    """Extract image URLs from page HTML."""
    soup = BeautifulSoup(page_html, "html.parser")
    img_urls = set()

    # Find all image tags with src attributes ending in common image extensions
    for img_tag in soup.find_all("img", src=True):
        src = img_tag["src"]
        if src.endswith(formats):
            img_urls.add(src)

    return img_urls


def get_supermarket_name_from_url(url):
    """Extract supermarket name from the URL path."""
    # Split the URL to find the section after "/geschaefte/"
    try:
        # Extract the part after "/geschaefte/"
        parts = url.split("/geschaefte/")[1].split("/")
        # Get the supermarket name, capitalize the first letter
        supermarket_name = parts[0].capitalize()
        return supermarket_name
    except IndexError:
        return "UnknownSupermarket"


def get_unique_filename(file_path):
    """Extend the filename if it already exists by appending a numerical suffix."""
    base, extension = os.path.splitext(file_path)
    counter = 1

    # Check if the file exists and create a new filename with a counter
    while os.path.exists(file_path):
        file_path = f"{base}-{counter}{extension}"
        counter += 1

    return file_path

def download_image(url, folder):
    """Download an image from a URL to a specified folder, convert to JPG, and filter by size."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an error for bad responses

        # Open image and convert to RGB to ensure compatibility with JPEG format
        img = Image.open(BytesIO(response.content)).convert("RGB")
        width, height = img.size

        # Check image size before saving
        if width < MIN_WIDTH or height < MIN_HEIGHT:
            # print(f"Skipping {url} due to insufficient size ({width}x{height}).")
            return

        # Sanitize URL to create a unique filename and save as JPG
        filename = sanitize_filename(url)
        file_path = os.path.join(folder, f"{filename}.jpg")
        # unique_file_path = get_unique_filename(file_path)
        if os.path.exists(file_path):
            # print(f"Skipping {url} as {file_path} already exists.")
            return

        # Save the image as JPEG
        img.save(file_path, "JPEG", quality=100)  # Adjust quality if needed
        print(f"Downloaded and saved: {file_path}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
    except Exception as e:
        print(f"Error processing image {url}: {e}")


def extract_prospekt_urls(supermarkt_url, supermarket_name):
    """Extract all Prospekt URLs from a given supermarket page."""
    driver.get(supermarkt_url)
    time.sleep(3)  # Wait for the page to fully load
    soup = BeautifulSoup(driver.page_source, "html.parser")

    prospekt_urls = []
    for link in soup.find_all("a", class_="store-flyer__image mobile-view-flyer-button js-open-download-mobile-app-popup"):
        href = link.get("href")
        if href and "/anzeigen/angebote/" in href:
            prospekt_urls.append("https://www.prospektangebote.de" + href)

    return prospekt_urls

def process_prospekt(prospekt_url, supermarket_folder):
    """Process each Prospekt by iterating over its pages and downloading images."""
    try:
        driver.get(prospekt_url)
        time.sleep(3)  # Wait for the page to fully load

        # Loop to navigate through all pages by clicking the "Next" button
        page_count = 1
        while True:

            print(f'URL with query parameter: {prospekt_url}#page={page_count}')
            page_html = driver.page_source
            image_urls = get_image_urls(page_html, formats=FORMATS)


            # If no images are found on the current page, stop processing further pages
            if image_urls:
                print(f"Found {len(image_urls)} images on page {page_count}.")
                for img_url in image_urls:
                    download_image(img_url, supermarket_folder)

            # Download each image found on the current page

            try:
                # Locate the "Next" button using its ID and click it
                next_button = WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.ID, "nextbutton"))
                )
                # if any child has id=nextflyerbutton, break
                if next_button.find_element(By.ID, "nextflyerbutton") and next_button.find_element(By.ID, "nextflyerbutton").is_displayed():
                    print(f"Found end of flyer at page {page_count}. Ending the pagination. ({next_button.find_elements(By.ID, 'nextflyerbutton')}")
                    break
                else:
                    driver.execute_script("arguments[0].click();", next_button)
                    page_count += 1
                    time.sleep(3)  # Wait for the next page to load

            except (NoSuchElementException, TimeoutException):
                print("No 'Next' button found or it's not clickable. Ending the pagination.")
                break

    except Exception as e:
        import traceback
        print(f"Error processing Prospekt {prospekt_url}: {e}")
        print(traceback.format_exc())

def main():
    try:
        # Load the base page with all supermarket flyers
        print(f"ROOT: {base_url}")
        driver.get(base_url)
        time.sleep(2)  # Wait for the page to fully load

        # Extract supermarket URLs
        soup = BeautifulSoup(driver.page_source, "html.parser")
        supermarket_links = []

        # Extract all supermarket links
        for link in soup.find_all("a", class_="js-flyer-link-item"):
            href = link.get("href")
            if href:
                supermarket_links.append("https://www.prospektangebote.de" + href)

        # Iterate over each supermarket link

        for supermarkt_url in supermarket_links:
            supermarket_name = get_supermarket_name_from_url(supermarkt_url)
            print(f"Supermarket: {supermarkt_url} as {supermarket_name}")

            # Create supermarket-specific download folder
            supermarket_folder = os.path.join("downloaded_images", supermarket_name)
            os.makedirs(supermarket_folder, exist_ok=True)

            # Extract and process each Prospekt URL within the supermarket
            prospekt_urls = extract_prospekt_urls(supermarkt_url, supermarket_name)
            for prospekt_url in prospekt_urls:
                print(f"Prospekt: {prospekt_url}")
                process_prospekt(prospekt_url, supermarket_folder)

            break  # For testing, remove to process all supermarkets

    finally:
        # Close the WebDriver
        driver.quit()

# if __name__ == "__main__":
# main()


In [5]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures


def process_supermarket(supermarkt_url):
    supermarket_name = get_supermarket_name_from_url(supermarkt_url)
    print(f"Supermarket: {supermarkt_url} as {supermarket_name}")

    # Create supermarket-specific download folder
    supermarket_folder = os.path.join("downloaded_images", supermarket_name)
    os.makedirs(supermarket_folder, exist_ok=True)

    # Extract and process each Prospekt URL within the supermarket
    prospekt_urls = extract_prospekt_urls(supermarkt_url, supermarket_name)
    for prospekt_url in prospekt_urls:
        print(f"Prospekt: {prospekt_url}")
        process_prospekt(prospekt_url, supermarket_folder)

def main():
    try:
        # Load the base page with all supermarket flyers
        print(f"ROOT: {base_url}")
        driver.get(base_url)
        time.sleep(2)  # Wait for the page to fully load

        # Extract supermarket URLs
        soup = BeautifulSoup(driver.page_source, "html.parser")
        supermarket_links = []

        # Extract all supermarket links
        for link in soup.find_all("a", class_="js-flyer-link-item"):
            href = link.get("href")
            if href:
                supermarket_links.append("https://www.prospektangebote.de" + href)

        # Use ThreadPoolExecutor for parallel execution
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_supermarket, supermarkt_url) for supermarkt_url in supermarket_links]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()  # Get the result to propagate any exceptions
                except Exception as exc:
                    print(f"Supermarket processing failed: {exc}")

    finally:
        # Close the WebDriver
        driver.quit()

if __name__ == "__main__":
    main()


ROOT: https://www.prospektangebote.de/neuesten-prospekte
Supermarket: https://www.prospektangebote.de/geschaefte/lidl/prospekte-angebote as Lidl
Supermarket: https://www.prospektangebote.de/geschaefte/kaufland/prospekte-angebote as Kaufland
Supermarket: https://www.prospektangebote.de/geschaefte/edeka/prospekte-angebote as Edeka
Supermarket: https://www.prospektangebote.de/geschaefte/alldrink/prospekte-angebote as Alldrink
Supermarket: https://www.prospektangebote.de/geschaefte/aktiv-irma/prospekte-angebote as Aktiv-irma
Supermarket: https://www.prospektangebote.de/geschaefte/wasgau/prospekte-angebote as Wasgau
Supermarket: https://www.prospektangebote.de/geschaefte/zimmermann/prospekte-angebote as Zimmermann
Supermarket: https://www.prospektangebote.de/geschaefte/sonderpreis-baumarkt/prospekte-angebote as Sonderpreis-baumarkt
Supermarket: https://www.prospektangebote.de/geschaefte/netto-marken-discount/prospekte-angebote as Netto-marken-discount
Supermarket: https://www.prospektangebo