In [46]:
import ast
import concurrent.futures
import io
import json
import os
import re
from collections import defaultdict

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from loguru import logger as logging
from PIL import Image
from tqdm import tqdm

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with an actionable message instead of the opaque TypeError that
# os.path.normpath(None) raises when PROJECT_DIR is not defined.
_project_dir_env = os.getenv("PROJECT_DIR")
if _project_dir_env is None:
    raise RuntimeError(
        "Environment variable PROJECT_DIR is not set; define it in the environment or in a .env file."
    )
PROJECT_DIR = os.path.normpath(_project_dir_env)

# All crawl artefacts live below <PROJECT_DIR>/crawled_leaflets.
TARGET_DIR = os.path.join(PROJECT_DIR, "crawled_leaflets")
METADATA_PATH = os.path.join(TARGET_DIR, "metadata.csv")
METADATA_COLUMNS = ["supermarket_name", "leaflet_id", "num_pages", "downloaded_pages", "crawl_date"]

# Single timestamp (local time, second resolution) shared by every record of this run.
CRAWL_DATE = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")

In [47]:
# Supermarket slugs as they appear in leaflet URLs, mapped to the name that
# process_market records instead; slugs that are not listed are kept as-is.
unique_names = dict(
    [
        ("aldi-sy-d", "aldi-sud"),
        ("aldi-sy-d-wein", "aldi-sud"),
        ("denns-biomarkt", "denn-s-biomarkt"),
        ("getry-nkeland", "getraenkeland"),
    ]
)

# Template for an absolute leaflet URL; `leaflet_href` is the site-relative link.
leaflet_url = "https://www.prospektangebote.de{leaflet_href}"

# One shared HTTP session so every request sends the same browser-like User-Agent.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)

In [48]:
from datetime import datetime


def mask_name(name, to_mask):
    """
    Replace every occurrence of the given substrings with the token ``<MASK>``.

    The substrings are processed in iteration order, each one on the result of
    the previous replacement.

    :param name: String to mask
    :param to_mask: Iterable of substrings to replace
    :return: The masked string
    """
    masked = name
    for fragment in to_mask:
        masked = masked.replace(fragment, "<MASK>")
    return masked


def parse_name(name):
    """
    Normalise a name into a lowercase, separator-free, accent-free key.

    The name is lowercased and stripped, hyphens and spaces are removed,
    accented letters are folded to their base letter ("ß" becomes "ss"), and
    finally the digraphs "ae", "oe" and "ue" are collapsed to "a", "o" and "u".

    :param name: Name to normalise
    :return: The normalised name
    """
    # Applied in order. Uppercase variants are not listed because the string is
    # lowercased before any replacement takes place.
    replacements = (
        ("-", ""), (" ", ""),
        ("ä", "a"), ("ö", "o"), ("ü", "u"), ("ß", "ss"),
        ("é", "e"), ("è", "e"), ("ê", "e"),
        ("à", "a"), ("â", "a"), ("ç", "c"),
        ("í", "i"), ("ì", "i"), ("î", "i"),
        ("ó", "o"), ("ò", "o"), ("ô", "o"),
        ("ú", "u"), ("ù", "u"), ("û", "u"),
        ("ñ", "n"), ("ý", "y"), ("ÿ", "y"),
        ("ă", "a"), ("ș", "s"), ("ț", "t"),
        # Digraph spellings go last so they also catch pairs produced by the folds above.
        ("ae", "a"), ("oe", "o"), ("ue", "u"),
    )
    folded = name.lower().strip()
    for old, new in replacements:
        folded = folded.replace(old, new)
    return folded

# German month names as used in the validity label, keyed by their first three letters.
_GERMAN_MONTHS = {
    "Jan": 1, "Feb": 2, "Mär": 3, "Apr": 4, "Mai": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Okt": 10, "Nov": 11, "Dez": 12,
}


def extract_valid_date(div, now=None):
    """
    Extract the validity period of a leaflet from its listing element.

    The text of the ``<small class="d-block text-muted mb-1">`` child is read,
    dots are removed and the text is split on whitespace. Tokens 2 and 3 are
    taken as start day and month, tokens 5 and 6 as end day and month
    (presumably a label such as "Gültig von 27. Dez. bis 01. Jan." - confirm
    against the live site).

    The label carries no year, so the year is inferred from ``now``: both dates
    get the current year unless the start month is later than the end month,
    i.e. the period crosses a year boundary. In that case the end date moves to
    next year if the current month has already reached the start month;
    otherwise the start date moves to the previous year.

    NOTE(review): a period that lies entirely in another year than ``now``
    (e.g. a January leaflet seen in December) is still assigned the current year.

    :param div: Element exposing ``find(...)`` whose result has a ``text`` attribute
    :param now: Optional reference time (anything ``pd.Timestamp`` accepts);
        defaults to the current local time
    :return: Tuple ``(from_date, to_date)`` of ``YYYY-MM-DD`` strings
    :raises AttributeError: If the ``<small>`` element is missing
    :raises IndexError: If the label has fewer tokens than expected
    :raises KeyError: If a month token is not a known German month
    :raises ValueError: If a day token is not an integer or the date is invalid
    """
    label = div.find("small", {"class": "d-block text-muted mb-1"}).text
    # split() without arguments also trims and collapses repeated whitespace.
    parts = label.replace(".", "").split()

    from_day = int(parts[2])
    # Only the first three letters are significant, so "Juni" or "Sept" resolve too.
    from_month = _GERMAN_MONTHS[parts[3][:3]]
    to_day = int(parts[5])
    to_month = _GERMAN_MONTHS[parts[6][:3]]

    # Read the clock once so year and month can never disagree.
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    from_year = to_year = now.year

    # A start month after the end month means the period wraps around New Year.
    if from_month > to_month:
        if now.month >= from_month:
            to_year += 1
        else:
            from_year -= 1

    from_date = pd.Timestamp(year=from_year, month=from_month, day=from_day).strftime("%Y-%m-%d")
    to_date = pd.Timestamp(year=to_year, month=to_month, day=to_day).strftime("%Y-%m-%d")

    return from_date, to_date


In [49]:
def _parse_flyer_pages(raw_pages):
    """Decode the ``flyerPages`` JavaScript literal without executing it."""
    try:
        # json understands the escaped slashes ("\/") natively.
        return json.loads(raw_pages)
    except ValueError:
        # Not strict JSON: fall back to a literal-only parser, which still
        # cannot run code found in the downloaded page.
        return ast.literal_eval(raw_pages.replace(r"\/", "/"))


def process_market(market, timeout=30):
    """
    Process a single market URL to extract leaflet information.

    The market overview page is downloaded and every ``store-flyer`` element
    whose flyer name contains the normalised market name is followed to its
    leaflet page. The page count is the length of the ``flyerPages`` literal
    embedded in that page; leaflets without such a literal are skipped.

    :param market: Market URL to process
    :param timeout: Seconds to wait for each HTTP request before giving up
    :return: List of dictionaries containing leaflet information
    :raises AttributeError: If ``market`` or a leaflet URL does not match the expected pattern
    :raises requests.RequestException: On HTTP errors or timeouts
    """
    leaflets = []
    supermarket_name = (
        re.search(
            r"https://www.prospektangebote.de/geschaefte/(.*)/prospekte-angebote",
            market,
        )
        .group(1)
        .lower()
    )
    # Without a timeout a stalled connection would block the crawl indefinitely.
    response = session.get(market, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    normalised_market = parse_name(supermarket_name)
    for div in soup.find_all("div", {"class": "store-flyer mb-3"}):
        # "hitbbq" is masked before and after normalisation so that it cannot
        # satisfy the substring check below.
        normalised_flyer = mask_name(
            parse_name(mask_name(div["data-flyer-name"], ["hitbbq"])),
            ["hitbbq"],
        )
        if normalised_market not in normalised_flyer:
            continue

        leaflet_id = div["data-flyer-id"]
        leaflet_href = div.find("a", {"class": "btn"})["href"]
        this_leaflet_url = leaflet_url.format(leaflet_href=leaflet_href)

        # The slug in front of "-prospekt-" can differ from the market slug
        # (e.g. special-offer leaflets), so it is recorded separately.
        hidden_supermarket_name = re.search(
            r"https://www.prospektangebote.de/anzeigen/angebote/(.*)",
            this_leaflet_url,
        ).group(1)
        hidden_supermarket_name = hidden_supermarket_name.split("-prospekt-")[0]
        hidden_supermarket_name = unique_names.get(
            hidden_supermarket_name, hidden_supermarket_name
        )

        leaflet_response = session.get(this_leaflet_url, timeout=timeout)
        leaflet_response.raise_for_status()

        pages_match = re.search(r"let flyerPages = (.+);", leaflet_response.text)
        if not pages_match:
            continue

        # SECURITY: this text comes from a remote page and was previously passed
        # to eval(); it is now decoded as data only.
        leaflet_pages = _parse_flyer_pages(pages_match.group(1))
        num_pages = len(leaflet_pages)

        valid_from_date, valid_to_date = extract_valid_date(div)

        leaflets.append(
            {
                "supermarket_name": supermarket_name,
                "hidden_supermarket_name": hidden_supermarket_name,
                "leaflet_id": leaflet_id,
                "leaflet_href": leaflet_href,
                "leaflet_url": this_leaflet_url,
                "num_pages": num_pages,
                "crawl_date": CRAWL_DATE,
                "valid_from_date": valid_from_date,
                "valid_to_date": valid_to_date,
            }
        )

    return leaflets

In [50]:
# Smoke test: crawl the Kaufland overview page (live HTTP requests) and display
# the leaflet records that process_market returns.
process_market("https://www.prospektangebote.de/geschaefte/kaufland/prospekte-angebote")

[{'supermarket_name': 'kaufland',
  'hidden_supermarket_name': 'kaufland',
  'leaflet_id': '2623106',
  'leaflet_href': '/anzeigen/angebote/kaufland-prospekt-2623106',
  'leaflet_url': 'https://www.prospektangebote.de/anzeigen/angebote/kaufland-prospekt-2623106',
  'num_pages': 32,
  'crawl_date': '2024-12-27 17:56:08',
  'valid_from_date': '2024-12-27',
  'valid_to_date': '2025-01-01'},
 {'supermarket_name': 'kaufland',
  'hidden_supermarket_name': 'kaufland-feuerwerk',
  'leaflet_id': '2609067',
  'leaflet_href': '/anzeigen/angebote/kaufland-feuerwerk-prospekt-2609067',
  'leaflet_url': 'https://www.prospektangebote.de/anzeigen/angebote/kaufland-feuerwerk-prospekt-2609067',
  'num_pages': 8,
  'crawl_date': '2024-12-27 17:56:08',
  'valid_from_date': '2024-12-28',
  'valid_to_date': '2024-12-31'}]