# Hamburg Branchenbuch – Clean Pipeline

This notebook contains only the essential steps:

1) Imports and setup
2) Scraper class definition (`EnhancedHamburgBranchenbuchScraper`)
3) Large A–Z mapping runner (collect category → detail URLs with pagination, checkpoints)
4) Extract ALL company details from the mapping (fast, threaded, progress)

Run cells in order. The mapping runner writes mapping CSVs; the extraction cell uses the newest mapping and outputs a consolidated details CSV.

# Hamburg Branchenbuch Scraper

This notebook will scrape the Hamburg business directory (Branchenbuch) to extract all companies and their websites from all alphabetical sections and categories.

## Overview
- Target URL: https://www.hamburg.de/branchenbuch/hamburg/a-z/
- Structure: Alphabetical sections → Business categories → Company listings
- Goal: Extract company names, addresses, phone numbers, websites, and other contact details

# Clean Scraper Section

This section contains the consolidated, production-ready scraper with robust pagination (single-page and multi-page), fast link collection, and detail extraction. Older exploratory cells above can be ignored.

In [1]:
# 1) Imports and setup (keep)
import os, re, json, time, random, glob, threading, collections, concurrent.futures as cf
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote

import pandas as pd
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm

BASE_DIR = os.getcwd()
print("✅ Imports ready")

✅ Imports ready


In [2]:
# 2) EnhancedHamburgBranchenbuchScraper (restored)
import time, random, re, json
from typing import Optional, Dict, List, Tuple
from urllib.parse import urljoin, urlparse, parse_qs, unquote

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

try:
    import requests_cache  # type: ignore
except Exception:
    requests_cache = None

class EnhancedHamburgBranchenbuchScraper:
    def __init__(self,
                 min_delay: float = 0.15,
                 max_delay: float = 0.35,
                 retry_attempts: int = 3,
                 enable_caching: bool = True,
                 cache_name: str = "hh_cache",
                 cache_expire_seconds: int = 24 * 3600,
                 timeout: int = 25):
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.retry_attempts = retry_attempts
        self.timeout = timeout
        self.user_agents = [
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        ]
        self.headers = {"User-Agent": random.choice(self.user_agents)}

        if enable_caching and requests_cache is not None:
            self.session = requests_cache.CachedSession(
                cache_name=cache_name,
                backend='sqlite',
                expire_after=cache_expire_seconds,
            )
        else:
            self.session = requests.Session()

        retry = Retry(
            total=retry_attempts,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        adapter = HTTPAdapter(max_retries=retry, pool_connections=100, pool_maxsize=100)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

    def _sleep(self):
        time.sleep(random.uniform(self.min_delay, self.max_delay))

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        last_err = None
        for _ in range(max(1, self.retry_attempts)):
            try:
                self.headers["User-Agent"] = random.choice(self.user_agents)
                r = self.session.get(url, headers=self.headers, timeout=self.timeout)
                if 300 <= r.status_code < 400 and 'Location' in r.headers:
                    loc = r.headers.get('Location')
                    if loc:
                        url = urljoin(url, loc)
                r.raise_for_status()
                return BeautifulSoup(r.text, 'html.parser')
            except Exception as e:
                last_err = e
                time.sleep(0.3)
        return None

    def get_letters(self) -> List[str]:
        base = "https://www.hamburg.de/branchenbuch/hamburg/a-z/"
        return [urljoin(base, f"{ch}/") for ch in list("abcdefghijklmnopqrstuvwxyz")]

    def get_category_links(self, letters_soup: BeautifulSoup) -> List[Dict[str, str]]:
        out = []
        if not letters_soup:
            return out
        for a in letters_soup.find_all('a', href=True):
            href = a['href']
            if not isinstance(href, str):
                continue
            if '/branchenbuch/hamburg/' in href and re.search(r'/\d+/(n\d+/)?$', href):
                name = a.get_text(' ', strip=True)
                url = href if href.startswith('http') else urljoin('https://www.hamburg.de', href)
                out.append({"name": name, "url": url})
        # de-duplicate preserving order by url
        seen = set()
        uniq = []
        for it in out:
            if it['url'] not in seen:
                seen.add(it['url'])
                uniq.append(it)
        return uniq

    def _page_url(self, category_url: str, offset: int) -> str:
        # Normalize and construct .../n{offset}/ URLs
        if re.search(r'/n\d+/?$', category_url):
            return re.sub(r'/n\d+/?$', f"/n{offset}/", category_url)
        if not category_url.endswith('/'):
            category_url += '/'
        return urljoin(category_url, f"n{offset}/")

    def _page_signature(self, soup: Optional[BeautifulSoup]) -> str:
        if not soup:
            return ""
        # signature by first few company hrefs/texts
        sig_parts = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/branchenbuch/hamburg/eintrag/' in href:
                sig_parts.append(href)
                if len(sig_parts) >= 8:
                    break
        if not sig_parts:
            # fallback: first headings/text blocks
            heads = [h.get_text(' ', strip=True) for h in soup.find_all(['h2','h3'])][:8]
            sig_parts = heads
        return '|'.join(sig_parts)

    def get_category_total_count(self, soup: Optional[BeautifulSoup]) -> Optional[int]:
        if not soup:
            return None
        txt = soup.get_text(" ", strip=True)
        m = re.search(r'(\d{1,6})\s*(Treffer|Ergebnisse|Eintr[aä]ge)', txt, flags=re.I)
        if m:
            try:
                return int(m.group(1))
            except Exception:
                return None
        return None

    def get_category_page_urls(self, category_url: str, first_page_soup: Optional[BeautifulSoup] = None,
                               page_size: int = 20, probe_extra_pages: int = 3) -> List[str]:
        if not first_page_soup:
            first_page_soup = self.get_page(category_url)
        if not first_page_soup:
            return [category_url]
        total = self.get_category_total_count(first_page_soup)
        pages = []
        seen_sigs = set()
        # Start with n0 and advertised pages
        max_pages = 1
        if total is not None and total > 0:
            max_pages = max(1, (total + page_size - 1) // page_size)
        # Build initial set
        for i in range(max_pages):
            offset = i * page_size
            url_i = self._page_url(category_url, offset)
            soup_i = first_page_soup if i == 0 else self.get_page(url_i)
            sig = self._page_signature(soup_i)
            if not sig or sig in seen_sigs:
                break
            seen_sigs.add(sig)
            pages.append(url_i)
            self._sleep()
        # Probe a few more in case advertised undercounts
        i = len(pages)
        while i < max_pages + probe_extra_pages:
            offset = i * page_size
            url_i = self._page_url(category_url, offset)
            soup_i = self.get_page(url_i)
            sig = self._page_signature(soup_i)
            if not sig or sig in seen_sigs:
                break
            seen_sigs.add(sig)
            pages.append(url_i)
            i += 1
            self._sleep()
        return pages

    def get_all_company_detail_links(self, category_url: str, cap_to_advertised: bool = False) -> List[str]:
        first = self.get_page(category_url)
        if not first:
            return []
        advertised = self.get_category_total_count(first)
        page_urls = self.get_category_page_urls(category_url, first)
        links: List[str] = []
        seen = set()
        for pu in page_urls:
            soup = first if pu == page_urls[0] else self.get_page(pu)
            if not soup:
                continue
            for a in soup.find_all('a', href=True):
                href = a['href']
                if '/branchenbuch/hamburg/eintrag/' in href:
                    absu = href if href.startswith('http') else urljoin('https://www.hamburg.de', href)
                    if absu not in seen:
                        seen.add(absu)
                        links.append(absu)
            if cap_to_advertised and advertised and len(links) >= advertised:
                links = links[:advertised]
                break
            self._sleep()
        return links

    # Detail extraction (JSON-LD preferred, refined website + address fallbacks)
    def extract_company_details(self, company_url: str, category_name: Optional[str] = None, letter: Optional[str] = None) -> Dict[str,str]:
        soup = self.get_page(company_url)
        try:
            from datetime import datetime as _dt
            ts = _dt.utcnow().isoformat()
        except Exception:
            ts = ""
        data = {
            "source_url": company_url,
            "name": "",
            "website": "",
            "phone": "",
            "email": "",
            "address": "",
            "latitude": "",
            "longitude": "",
            "image": "",
            "category": category_name or "",
            "letter_section": letter or "",
            "scraped_at": ts,
        }
        if not soup:
            return data

        # Try JSON-LD first
        jsonld = None
        for tag in soup.find_all('script', type='application/ld+json'):
            txt = tag.string or tag.text or ''
            try:
                obj = json.loads(txt)
            except Exception:
                continue
            if isinstance(obj, list):
                for o in obj:
                    if isinstance(o, dict) and o.get('@type'):
                        jsonld = o
                        break
            elif isinstance(obj, dict) and obj.get('@type'):
                jsonld = obj
            if jsonld:
                break
        if isinstance(jsonld, dict):
            data["name"] = (jsonld.get('name') or data["name"]).strip()
            url_val = jsonld.get('url') or jsonld.get('@id')
            if isinstance(url_val, str):
                data["website"] = url_val
            elif isinstance(url_val, list):
                data["website"] = next((u for u in url_val if isinstance(u, str) and u.startswith('http')), data["website"]) or ""
            same = jsonld.get('sameAs')
            if not data["website"] and isinstance(same, list):
                data["website"] = next((u for u in same if isinstance(u, str) and u.startswith('http')), "")
            adr = jsonld.get('address')
            if isinstance(adr, dict):
                parts = [adr.get('streetAddress',''), adr.get('postalCode',''), adr.get('addressLocality','')]
                data["address"] = ' '.join([p for p in parts if p]).strip()
            geo = jsonld.get('geo')
            if isinstance(geo, dict):
                data["latitude"] = str(geo.get('latitude') or '')
                data["longitude"] = str(geo.get('longitude') or '')

        # Name fallback from headers if still empty
        if not data["name"]:
            h = soup.find(['h1','h2'])
            if h:
                data["name"] = h.get_text(' ', strip=True)

        # Phone / Email via anchors
        a_tel = soup.find('a', href=re.compile(r'^tel:', re.I))
        if a_tel and a_tel.get('href'):
            data["phone"] = re.sub(r'^tel:', '', a_tel['href']).strip()
        a_mail = soup.find('a', href=re.compile(r'^mailto:', re.I))
        if a_mail and a_mail.get('href'):
            data["email"] = re.sub(r'^mailto:', '', a_mail['href']).strip()

        # Address fallbacks
        if not data["address"]:
            addr = soup.find('address')
            if addr:
                data["address"] = ' '.join(addr.get_text(' ', strip=True).split())
        if not data["address"]:
            # Look for microdata-style spans
            street = soup.find(attrs={"itemprop": "streetAddress"})
            plz = soup.find(attrs={"itemprop": "postalCode"})
            city = soup.find(attrs={"itemprop": "addressLocality"})
            parts = []
            for el in (street, plz, city):
                if el:
                    parts.append(el.get_text(' ', strip=True))
            if parts:
                data["address"] = ' '.join(parts)
        if not data["address"]:
            # Generic class/id based fallback (address/Adresse/Anschrift)
            cand_nodes = soup.select('[class*="address" i], [id*="address" i], .adresse, [class*="anschrift" i]')
            for node in cand_nodes:
                txt = ' '.join(node.get_text(' ', strip=True).split())
                if re.search(r'\b\d{5}\b', txt) or any(ch.isdigit() for ch in txt):
                    data["address"] = txt
                    break

        # Image
        og = soup.find('meta', property='og:image')
        if og and og.get('content'):
            data["image"] = og['content'].strip()

        # Refined website picking: avoid hamburg.de, berlin.de and redirect wrappers; prefer image-wrapped/link text cues
        def _resolve_redirect(href: str) -> str:
            try:
                if not href or not href.startswith('http'):
                    return ''
                u = urlparse(href)
                qs = parse_qs(u.query)
                for k in ['url','dest','destination','to','link','u']:
                    if k in qs and qs[k]:
                        target = unquote(qs[k][0])
                        if target.startswith('http'):
                            return target
                return href
            except Exception:
                return href
        def _hostname(u: str) -> str:
            try:
                return urlparse(u).hostname or ''
            except Exception:
                return ''
        def _is_portal(host: str) -> bool:
            h = (host or '').lower()
            return ('hamburg.de' in h) or ('berlin.de' in h)
        BLOCKLIST = {'branchenbuch.hamburg.de','google.','maps.google','hvv.de','geofox.de','booking.com','yelp.','tripadvisor.','facebook.com','instagram.com','x.com','twitter.com','berlin.de'}

        # If json-ld gave a portal URL, drop it so we can try anchors
        if data["website"] and _is_portal(_hostname(data["website"])):
            data["website"] = ''

        if not data["website"]:
            # 1) label hints + data-* attributes
            candidates: List[Tuple[str,str,bool]] = []
            for a in soup.find_all('a'):
                href = (a.get('href') or '').strip()
                durl = a.get('data-url') or a.get('data-href') or a.get('data-website')
                href = _resolve_redirect(durl.strip()) if durl else _resolve_redirect(href)
                if not href.startswith('http'):
                    continue
                if any(b in href for b in BLOCKLIST):
                    continue
                label = (a.get_text(' ', strip=True) or a.get('aria-label') or a.get('title') or '').lower()
                has_img = a.find('img') is not None
                candidates.append((href, label, has_img))
            website_keywords = ('website','webseite','zur website','homepage','zur homepage','internetseite','zur internetseite')
            for href, label, _ in candidates:
                if any(p in label for p in website_keywords) and not _is_portal(_hostname(href)):
                    data["website"] = href
                    break
            # 2) image-wrapped
            if not data["website"]:
                for href, _, has_img in candidates:
                    if has_img and not _is_portal(_hostname(href)):
                        data["website"] = href
                        break
            # 3) any external
            if not data["website"]:
                for href, _, _ in candidates:
                    if not _is_portal(_hostname(href)):
                        data["website"] = href
                        break

        return data

print("✅ EnhancedHamburgBranchenbuchScraper restored. Run this cell before mapping/extraction.")

✅ EnhancedHamburgBranchenbuchScraper restored. Run this cell before mapping/extraction.


# How to run

- Run the Imports cell and then the Scraper class cell to load the implementation.
- Run the A–Z Mapping Runner cell to scrape all letters A–Z. It will:
  - Respect category pagination using n0, n20, n40… with wrap detection.
  - Cap each category’s links to the advertised count to avoid overshoot.
  - Save partial mapping CSVs in `data/hh_branchenbuch_checkpoints/mapping/` and consolidate to `hamburg_branchenbuch_companies_category_map_<timestamp>.csv` in the workspace root.
- Run the Details Extraction cell to backfill all company details from the latest mapping CSV (threaded, resumable):
  - It auto-picks the newest `hamburg_branchenbuch_companies_category_map_*.csv` if a specific path isn’t configured.
  - It saves partials under `data/hh_branchenbuch_checkpoints/from_map/` and consolidates to `hamburg_branchenbuch_companies_details_from_map_<timestamp>.csv`.

Notes:
- Requests are cached for speed and politeness; you can rerun to resume faster.
- You can resume the mapping run; processed categories are tracked in `data/hh_branchenbuch_checkpoints/mapping/processed_categories.txt`.
- Press Ctrl+C once to stop gracefully after the current item.

In [None]:
# 3) Large A–Z mapping runner (collect category → detail URLs with pagination, checkpoints)
# - Crawls all letter sections (A–Z)
# - For each category under a letter, enumerates all paginated listing pages
# - Collects company detail URLs and writes mapping CSV rows: (url, category, letter_section, category_url)
# - Saves partial CSVs regularly and consolidates to a final mapping CSV at the end

import os, glob, json, time
from datetime import datetime
from urllib.parse import urlparse

import pandas as pd
from tqdm import tqdm

# --- Config ---
LETTERS_OVERRIDE = None  # e.g., set to ['a','b'] to limit for testing; None → all a–z
OUT_PART_DIR = os.path.join(os.getcwd(), "data", "hh_branchenbuch_checkpoints", "mapping")
os.makedirs(OUT_PART_DIR, exist_ok=True)
SAVE_EVERY_N_MAP = 5000
CAP_TO_ADVERTISED = True  # be conservative; we probe extra in page URL builder already
PROCESSED_CATS_PATH = os.path.join(OUT_PART_DIR, "processed_categories.txt")  # persistent resume

# --- Helper: load/save processed categories for resume ---
def _load_processed_categories(path: str) -> set:
    s = set()
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    u = line.strip()
                    if u:
                        s.add(u)
        except Exception:
            pass
    return s

def _append_processed_category(path: str, cat_url: str):
    try:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(cat_url.strip() + "\n")
    except Exception:
        pass

# --- Instantiate scraper ---
scraper = EnhancedHamburgBranchenbuchScraper(min_delay=0.05, max_delay=0.20, retry_attempts=3, enable_caching=True)
letters_urls = scraper.get_letters()
if LETTERS_OVERRIDE:
    letters_urls = [u for u in letters_urls if any(u.rstrip('/').endswith(f"/{ch}") for ch in LETTERS_OVERRIDE)]

# --- Mapping collection ---
rows_map = []
ts_map = datetime.now().strftime("%Y%m%d_%H%M%S")
part_idx_map = 1
processed_cats = _load_processed_categories(PROCESSED_CATS_PATH)

def _save_partial_map(force: bool=False):
    global rows_map, part_idx_map
    if not rows_map and not force:
        return
    dfp = pd.DataFrame(rows_map)
    cols = ["url", "category", "letter_section", "category_url"]
    for c in cols:
        if c not in dfp.columns:
            dfp[c] = ""
    dfp = dfp[cols]
    outp = os.path.join(OUT_PART_DIR, f"category_map_{ts_map}_chunk{part_idx_map:03d}.csv")
    dfp.to_csv(outp, index=False)
    print(f"Saved mapping partial chunk {part_idx_map:03d}: {len(dfp)} rows → {outp}")
    part_idx_map += 1
    rows_map = []

print(f"Starting A–Z mapping run across {len(letters_urls)} letter sections…")
start_map = time.time()

for lidx, letter_url in enumerate(letters_urls, start=1):
    letter = letter_url.rstrip('/').split('/')[-1]
    soup_letter = scraper.get_page(letter_url)
    if not soup_letter:
        print(f"[warn] Could not load letter page: {letter_url}")
        continue
    categories = scraper.get_category_links(soup_letter)
    print(f"[{lidx}/{len(letters_urls)}] Letter '{letter.upper()}': {len(categories)} categories")

    for cidx, cat in enumerate(categories, start=1):
        cat_name = cat.get('name', '').strip()
        cat_url = cat.get('url', '').strip()
        if not cat_url:
            continue
        if cat_url in processed_cats:
            # already processed in a previous/resumed run
            continue

        # Enumerate all detail links for this category
        try:
            detail_links = scraper.get_all_company_detail_links(cat_url, cap_to_advertised=CAP_TO_ADVERTISED)
        except Exception:
            detail_links = []

        # De-dup only within this category to avoid page repetition; preserve cross-category associations
        seen_in_cat = set()
        added = 0
        for du in detail_links:
            if du in seen_in_cat:
                continue
            seen_in_cat.add(du)
            rows_map.append({
                "url": du,
                "category": cat_name,
                "letter_section": letter,
                "category_url": cat_url,
            })
            added += 1
        _append_processed_category(PROCESSED_CATS_PATH, cat_url)
        processed_cats.add(cat_url)

        if added == 0:
            # Even if empty, keep a heartbeat print to show progress
            print(f"  - [{cidx}/{len(categories)}] {cat_name} → 0 links")
        else:
            if len(rows_map) >= SAVE_EVERY_N_MAP:
                _save_partial_map()

# Save any remaining mapping rows and consolidate
_save_partial_map(force=True)
partials = sorted(glob.glob(os.path.join(OUT_PART_DIR, f"category_map_{ts_map}_chunk*.csv")))
all_map_df = []
for p in partials:
    try:
        if os.path.getsize(p) > 0:
            all_map_df.append(pd.read_csv(p))
    except Exception:
        pass

if all_map_df:
    map_df = pd.concat(all_map_df, ignore_index=True)
    # Ensure columns order
    map_df = map_df[["url","category","letter_section","category_url"]]
    out_final_map = f"hamburg_branchenbuch_companies_category_map_{ts_map}.csv"
    map_df.to_csv(out_final_map, index=False)
    took = time.time() - start_map
    print(f"✅ Mapping done: {len(map_df):,} rows → {out_final_map} in {took/60:.1f} min.")
    display(map_df.head(5))
else:
    print("No mapping partials to consolidate (no rows?).")

In [3]:
# 4) Extract ALL company metadata from mapping (fast, threaded, with progress)
# - Reads the latest/target mapping CSV
# - Aggregates ALL categories/letter sections per URL into JSON arrays
# - Uses EnhancedHamburgBranchenbuchScraper.extract_company_details if available
# - Fallbacks to a minimal extractor if the class isn't defined
# - Post-processes website to avoid hamburg.de/berlin.de and handle image-wrapped/redirected anchors
# - Saves partials and a consolidated CSV

import os, time, random, glob, threading, collections, concurrent.futures as cf, json, re
from datetime import datetime
import pandas as pd
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs, unquote

# --- Config ---
# If this path doesn't exist, we'll auto-discover the newest mapping CSV in the workspace root.
MAPPING_PATH = "/home/thiesen/Documents/AI-Innoscence_Ecosystem/hamburg_branchenbuch_companies_category_map_20250929_050648.csv"
OUT_PART_DIR = os.path.join(os.getcwd(), "data", "hh_branchenbuch_checkpoints", "from_map")
os.makedirs(OUT_PART_DIR, exist_ok=True)
MAX_WORKERS_DETAILS = 32
RATE_LIMIT_PER_SEC = 8
SAVE_EVERY_N = 2000
SMALL_JITTER = (0.01, 0.05)

# --- Availability of full scraper ---
try:
    _ = EnhancedHamburgBranchenbuchScraper  # type: ignore
    _HAS_FULL = True
except Exception:
    _HAS_FULL = False

# --- Load mapping and build full context (all categories/letters per URL) ---
if not os.path.exists(MAPPING_PATH):
    # Auto-pick newest mapping CSV from the workspace root
    cand = sorted(glob.glob(os.path.join(os.getcwd(), "hamburg_branchenbuch_companies_category_map_*.csv")), reverse=True)
    if cand:
        MAPPING_PATH = cand[0]
        print(f"Auto-selected latest mapping: {MAPPING_PATH}")
    else:
        raise FileNotFoundError("No mapping CSV found. Please run the A–Z Mapping Runner first.")

map_df = pd.read_csv(MAPPING_PATH)
if 'url' not in map_df.columns:
    for cand in ['detail_url', 'source_url']:
        if cand in map_df.columns:
            map_df = map_df.rename(columns={cand: 'url'})
            break
assert 'url' in map_df.columns, "Mapping CSV must contain a 'url' column."

cat_col = 'category' if 'category' in map_df.columns else None
let_col = 'letter_section' if 'letter_section' in map_df.columns else ('letter' if 'letter' in map_df.columns else None)

# Aggregate ALL categories/letters per URL and keep also a primary for compatibility
ctx_all = {}
for _, r in map_df.iterrows():
    u = r['url']
    if not isinstance(u, str) or not u:
        continue
    cat = r[cat_col] if cat_col and isinstance(r.get(cat_col, None), str) else ""
    let = r[let_col] if let_col and isinstance(r.get(let_col, None), str) else ""
    if u not in ctx_all:
        ctx_all[u] = {
            'categories': set(),
            'letters': set(),
            'primary_category': cat or "",
            'primary_letter': let or ""
        }
    if cat:
        ctx_all[u]['categories'].add(cat)
    if let:
        ctx_all[u]['letters'].add(let)

urls = list(ctx_all.keys())
print(f"Unique detail URLs to backfill: {len(urls):,}")

# --- Rate limiter ---
class FixedWindowRateLimiter:
    def __init__(self, max_per_sec: int):
        self.max_per_sec = max_per_sec
        self.lock = threading.Lock()
        self.window = collections.deque()
    def acquire(self):
        while True:
            with self.lock:
                now = time.monotonic()
                while self.window and (now - self.window[0]) > 1.0:
                    self.window.popleft()
                if len(self.window) < self.max_per_sec:
                    self.window.append(now)
                    return
            time.sleep(0.01)

limiter = FixedWindowRateLimiter(RATE_LIMIT_PER_SEC)

# --- Website enhancer (handle image-wrapped and redirected anchors; avoid hamburg.de/berlin.de) ---
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BLOCKLIST = {'branchenbuch.hamburg.de','google.','maps.google','hvv.de','geofox.de','booking.com','yelp.','tripadvisor.','facebook.com','instagram.com','x.com','twitter.com','berlin.de'}

thread_local = threading.local()

def _resolve_redirect(href: str) -> str:
    try:
        if not href or not href.startswith('http'):
            return ''
        u = urlparse(href)
        qs = parse_qs(u.query)
        for k in ['url','dest','destination','to','link','u']:
            if k in qs and qs[k]:
                target = qs[k][0]
                target = unquote(target)
                if target.startswith('http'):
                    return target
        return href
    except Exception:
        return href

def _hostname(u: str) -> str:
    try:
        return urlparse(u).hostname or ''
    except Exception:
        return ''

def _is_portal_domain(host: str) -> bool:
    host = (host or '').lower()
    return ('hamburg.de' in host) or ('berlin.de' in host)

def _name_tokens(name: str):
    name = (name or '').lower()
    name = re.sub(r'[^a-z0-9äöüß\s-]', ' ', name)
    toks = [t for t in re.split(r'[\s-]+', name) if len(t) >= 3]
    stop = {'gmbh','mbh','kg','ag','ug','ev','e.v','gbr','ohg','co','und','the','hamburg','firma','betrieb','service','services','recycling','gruppe'}
    return [t for t in toks if t not in stop]

def _pick_website_from_html(html: str, company_name: str = '') -> str:
    soup = BeautifulSoup(html, 'html.parser')
    # json-ld hint
    jsonld_obj = None
    for tag in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(tag.string or tag.text or '{}')
        except Exception:
            data = None
        if isinstance(data, dict) and data.get('@type'):
            jsonld_obj = data
            break
    # collect anchors
    candidates = []
    for a in soup.find_all('a'):
        href = (a.get('href') or '').strip()
        durl = a.get('data-url') or a.get('data-href') or a.get('data-website')
        href = _resolve_redirect(durl.strip()) if durl else _resolve_redirect(href)
        if not href.startswith('http'):
            continue
        if any(b in href for b in BLOCKLIST):
            continue
        label = (a.get_text(' ', strip=True) or a.get('aria-label') or a.get('title') or '').lower()
        has_img = a.find('img') is not None
        candidates.append((href, label, has_img))
    # json-ld preference
    if isinstance(jsonld_obj, dict):
        for k in ('url','sameAs','@id'):
            v = jsonld_obj.get(k)
            if isinstance(v, str):
                vv = _resolve_redirect(v)
                if vv.startswith('http') and not any(b in vv for b in BLOCKLIST) and not _is_portal_domain(_hostname(vv)):
                    return vv
            elif isinstance(v, list):
                for x in v:
                    if isinstance(x, str):
                        vv = _resolve_redirect(x)
                        if vv.startswith('http') and not any(b in vv for b in BLOCKLIST) and not _is_portal_domain(_hostname(vv)):
                            return vv
    name_toks = set(_name_tokens(company_name))
    website_keywords = ('website','webseite','zur website','homepage','zur homepage','internetseite','zur internetseite')
    # label-based
    for href, label, _ in candidates:
        if any(p in label for p in website_keywords) and not _is_portal_domain(_hostname(href)):
            return href
    # image-wrapped
    for href, _, has_img in candidates:
        if has_img and not _is_portal_domain(_hostname(href)):
            return href
    # name-token match
    for href, _, _ in candidates:
        host = _hostname(href).lower()
        if not _is_portal_domain(host) and any(tok in host for tok in name_toks):
            return href
    # any external
    for href, _, _ in candidates:
        if not _is_portal_domain(_hostname(href)):
            return href
    return ''

def _get_fallback_session():
    sess = getattr(thread_local, 'http_sess', None)
    if sess is None:
        sess = requests.Session()
        retry = Retry(total=3, backoff_factor=0.4, status_forcelist=[429,500,502,503,504], allowed_methods=["GET"]) 
        adapter = HTTPAdapter(max_retries=retry, pool_connections=100, pool_maxsize=100)
        sess.mount('http://', adapter)
        sess.mount('https://', adapter)
        sess.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36'})
        thread_local.http_sess = sess
    return sess

def enhance_website_if_needed(row: dict):
    w = (row.get('website') or '').strip()
    if w and not _is_portal_domain(_hostname(w)):
        return row
    url = row.get('source_url') or ''
    if not url:
        return row
    try:
        sess = _get_fallback_session()
        r = sess.get(url, timeout=20)
        r.raise_for_status()
        better = _pick_website_from_html(r.text, company_name=row.get('name',''))
        if better:
            row['website'] = better
    except Exception:
        pass
    return row

# --- Thread-local scraper instance ---

def get_worker_scraper():
    s = getattr(thread_local, 'scraper', None)
    if s is None:
        if _HAS_FULL:
            s = EnhancedHamburgBranchenbuchScraper(min_delay=0.0, max_delay=0.02, retry_attempts=2, enable_caching=False)
        else:
            # Minimal inline extractor used if full class isn't defined
            class MinimalExtractor:
                def __init__(self):
                    self.sess = _get_fallback_session()
                def extract_company_details(self, company_url: str, category_name: str = '', letter: str = '') -> dict:
                    try:
                        r = self.sess.get(company_url, timeout=20)
                        r.raise_for_status()
                        html = r.text
                        # basic name
                        soup = BeautifulSoup(html, 'html.parser')
                        name = ''
                        h = soup.find(['h1','h2'])
                        if h:
                            name = h.get_text(strip=True)
                        # phone/email/address
                        tel = soup.find('a', href=re.compile(r'^tel:', re.I))
                        phone = re.sub(r'^tel:', '', tel['href']).strip() if tel and tel.get('href') else ''
                        mail = soup.find('a', href=re.compile(r'^mailto:', re.I))
                        email = re.sub(r'^mailto:', '', mail['href']).strip() if mail and mail.get('href') else ''
                        address = ''
                        add_tag = soup.find('address')
                        if add_tag:
                            address = ' '.join(add_tag.get_text(' ', strip=True).split())
                        website = _pick_website_from_html(html, company_name=name)
                        return {
                            'source_url': company_url, 'name': name, 'website': website, 'phone': phone, 'email': email,
                            'address': address, 'latitude': '', 'longitude': '', 'image': '',
                            'category': category_name, 'letter_section': letter,
                            'scraped_at': datetime.utcnow().isoformat()
                        }
                    except Exception:
                        return {'source_url': company_url, 'name': '', 'website': '', 'phone': '', 'email': '', 'address': '', 'latitude': '', 'longitude': '', 'image': '', 'category': category_name, 'letter_section': letter, 'scraped_at': datetime.utcnow().isoformat()}
            s = MinimalExtractor()
        thread_local.scraper = s
    return s

# --- Robust fetch using extract_company_details + website enhancement ---
from datetime import datetime as _dt

def _fallback_row(url: str, ctx: dict):
    cats = sorted(list(ctx.get('categories', set())))
    lets = sorted(list(ctx.get('letters', set())))
    return {
        "source_url": url,
        "name": "",
        "website": "",
        "phone": "",
        "email": "",
        "address": "",
        "latitude": "",
        "longitude": "",
        "image": "",
        # keep compatibility single values, but also provide aggregated lists
        "category": ctx.get('primary_category', cats[0] if cats else ""),
        "letter_section": ctx.get('primary_letter', lets[0] if lets else ""),
        "categories_all": json.dumps(cats, ensure_ascii=False),
        "letter_sections_all": json.dumps(lets, ensure_ascii=False),
        "scraped_at": _dt.utcnow().isoformat(),
    }

def fetch_detail(url: str) -> dict:
    ctx = ctx_all.get(url, {})
    cats = sorted(list(ctx.get('categories', set())))
    lets = sorted(list(ctx.get('letters', set())))
    primary_cat = ctx.get('primary_category', cats[0] if cats else "")
    primary_let = ctx.get('primary_letter', lets[0] if lets else "")

    limiter.acquire()
    time.sleep(random.uniform(*SMALL_JITTER))
    s = get_worker_scraper()
    for attempt in range(3):
        try:
            row = s.extract_company_details(url, category_name=primary_cat, letter=primary_let)
            if isinstance(row, dict):
                row.setdefault('source_url', url)
                row['category'] = row.get('category') or primary_cat
                row['letter_section'] = row.get('letter_section') or primary_let
                row['categories_all'] = json.dumps(cats, ensure_ascii=False)
                row['letter_sections_all'] = json.dumps(lets, ensure_ascii=False)
                row.setdefault('scraped_at', _dt.utcnow().isoformat())
                # Enhance website if missing/incorrect (hamburg/berlin portal)
                row = enhance_website_if_needed(row)
                # If name/address still blank, try quick fallbacks from HTML
                if not row.get('name') or not row.get('address'):
                    try:
                        sess = _get_fallback_session()
                        r = sess.get(url, timeout=20)
                        r.raise_for_status()
                        soup = BeautifulSoup(r.text, 'html.parser')
                        if not row.get('name'):
                            h = soup.find(['h1','h2'])
                            if h:
                                row['name'] = h.get_text(' ', strip=True)
                        if not row.get('address'):
                            addr = soup.find('address')
                            if addr:
                                row['address'] = ' '.join(addr.get_text(' ', strip=True).split())
                            if not row['address']:
                                # microdata fallback
                                street = soup.find(attrs={"itemprop": "streetAddress"})
                                plz = soup.find(attrs={"itemprop": "postalCode"})
                                city = soup.find(attrs={"itemprop": "addressLocality"})
                                parts = []
                                for el in (street, plz, city):
                                    if el:
                                        parts.append(el.get_text(' ', strip=True))
                                if parts:
                                    row['address'] = ' '.join(parts)
                    except Exception:
                        pass
                return row
        except Exception:
            time.sleep(0.25 * (2 ** attempt) + random.uniform(0.02, 0.15))
    return _fallback_row(url, ctx)

# --- Execute with progress + partial saves ---
rows = []
started = time.time()
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
part_idx = 1

EXPECTED_COLS = [
    "name","address","phone","email","website","latitude","longitude","image",
    "category","letter_section","categories_all","letter_sections_all",
    "source_url","scraped_at"
]

def save_partial(force: bool=False):
    global rows, part_idx
    if not rows and not force:
        return
    dfp = pd.DataFrame(rows)
    # normalize columns
    for c in EXPECTED_COLS:
        if c not in dfp.columns:
            dfp[c] = ""
    dfp = dfp[EXPECTED_COLS]
    if 'source_url' in dfp.columns:
        dfp = dfp.drop_duplicates(subset=['source_url']).reset_index(drop=True)
    outp = os.path.join(OUT_PART_DIR, f"details_from_map_{ts}_chunk{part_idx:03d}.csv")
    dfp.to_csv(outp, index=False)
    print(f"Saved partial chunk {part_idx:03d}: {len(dfp)} rows → {outp}")
    part_idx += 1
    rows = []

with cf.ThreadPoolExecutor(max_workers=MAX_WORKERS_DETAILS) as ex:
    futures = [ex.submit(fetch_detail, u) for u in urls]
    for fut in tqdm(cf.as_completed(futures), total=len(futures), desc="Extracting company details", unit="company"):
        try:
            res = fut.result()
            if isinstance(res, dict):
                rows.append(res)
        except Exception:
            pass
        if len(rows) >= SAVE_EVERY_N:
            save_partial()

# Save remaining and consolidate
save_partial(force=True)
partials = sorted(glob.glob(os.path.join(OUT_PART_DIR, f"details_from_map_{ts}_chunk*.csv")))
all_df = []
for p in partials:
    try:
        if os.path.getsize(p) > 0:
            all_df.append(pd.read_csv(p))
    except Exception:
        pass
if all_df:
    df = pd.concat(all_df, ignore_index=True)
    for c in EXPECTED_COLS:
        if c not in df.columns:
            df[c] = ""
    df = df[EXPECTED_COLS]
    if 'source_url' in df.columns:
        df = df.drop_duplicates(subset=['source_url']).reset_index(drop=True)
    out_final = f"hamburg_branchenbuch_companies_details_from_map_{ts}.csv"
    df.to_csv(out_final, index=False)
    took = time.time() - started
    print(f"✅ Done: {len(df)} rows → {out_final} in {took/60:.1f} min.")
    display(df.head(5))
else:
    print("No partials to consolidate (no rows?).")

Unique detail URLs to backfill: 67,899


Extracting company details:   3%|▎         | 2002/67899 [04:11<2:27:27,  7.45company/s]

Saved partial chunk 001: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk001.csv


Extracting company details:   6%|▌         | 4000/67899 [08:22<1:55:57,  9.18company/s]

Saved partial chunk 002: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk002.csv


Extracting company details:   9%|▉         | 6003/67899 [12:34<1:47:55,  9.56company/s]

Saved partial chunk 003: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk003.csv


Extracting company details:  12%|█▏        | 8000/67899 [16:45<1:45:01,  9.51company/s]

Saved partial chunk 004: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk004.csv


Extracting company details:  15%|█▍        | 10001/67899 [20:57<1:42:30,  9.41company/s]

Saved partial chunk 005: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk005.csv


Extracting company details:  18%|█▊        | 12007/67899 [25:12<1:04:49, 14.37company/s]

Saved partial chunk 006: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk006.csv


Extracting company details:  21%|██        | 14000/67899 [29:25<2:11:56,  6.81company/s]

Saved partial chunk 007: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk007.csv


Extracting company details:  24%|██▎       | 16000/67899 [33:36<1:59:48,  7.22company/s]

Saved partial chunk 008: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk008.csv


Extracting company details:  27%|██▋       | 17999/67899 [37:49<1:41:44,  8.17company/s]

Saved partial chunk 009: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk009.csv


Extracting company details:  29%|██▉       | 20002/67899 [42:01<1:20:46,  9.88company/s]

Saved partial chunk 010: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk010.csv


Extracting company details:  32%|███▏      | 21999/67899 [46:13<1:33:27,  8.18company/s]

Saved partial chunk 011: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk011.csv


Extracting company details:  35%|███▌      | 24002/67899 [50:25<1:36:23,  7.59company/s]

Saved partial chunk 012: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk012.csv


Extracting company details:  38%|███▊      | 26003/67899 [54:37<1:13:41,  9.47company/s]

Saved partial chunk 013: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk013.csv


Extracting company details:  41%|████      | 28003/67899 [58:50<1:09:05,  9.62company/s]

Saved partial chunk 014: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk014.csv


Extracting company details:  44%|████▍     | 30005/67899 [1:03:01<57:19, 11.02company/s]  

Saved partial chunk 015: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk015.csv


Extracting company details:  47%|████▋     | 32004/67899 [1:07:13<47:33, 12.58company/s]  

Saved partial chunk 016: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk016.csv


Extracting company details:  50%|█████     | 34005/67899 [1:11:26<52:06, 10.84company/s]  

Saved partial chunk 017: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk017.csv


Extracting company details:  53%|█████▎    | 36001/67899 [1:15:37<1:13:28,  7.24company/s]

Saved partial chunk 018: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk018.csv


Extracting company details:  56%|█████▌    | 38001/67899 [1:19:49<1:13:55,  6.74company/s]

Saved partial chunk 019: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk019.csv


Extracting company details:  59%|█████▉    | 40001/67899 [1:24:01<1:08:01,  6.84company/s]

Saved partial chunk 020: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk020.csv


Extracting company details:  62%|██████▏   | 42003/67899 [1:28:13<38:08, 11.31company/s]  

Saved partial chunk 021: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk021.csv


Extracting company details:  65%|██████▍   | 44001/67899 [1:32:24<43:03,  9.25company/s]  

Saved partial chunk 022: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk022.csv


Extracting company details:  68%|██████▊   | 46002/67899 [1:36:37<48:17,  7.56company/s]  

Saved partial chunk 023: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk023.csv


Extracting company details:  71%|███████   | 47999/67899 [1:40:51<45:17,  7.32company/s]  

Saved partial chunk 024: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk024.csv


Extracting company details:  74%|███████▎  | 50001/67899 [1:45:03<42:34,  7.01company/s]  

Saved partial chunk 025: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk025.csv


Extracting company details:  77%|███████▋  | 52002/67899 [1:49:15<35:22,  7.49company/s]  

Saved partial chunk 026: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk026.csv


Extracting company details:  80%|███████▉  | 53997/67899 [1:53:27<35:27,  6.54company/s]

Saved partial chunk 027: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk027.csv


Extracting company details:  82%|████████▏ | 56000/67899 [1:57:39<30:00,  6.61company/s]

Saved partial chunk 028: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk028.csv


Extracting company details:  85%|████████▌ | 57998/67899 [2:01:50<19:58,  8.26company/s]

Saved partial chunk 029: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk029.csv


Extracting company details:  88%|████████▊ | 60000/67899 [2:06:01<11:26, 11.50company/s]

Saved partial chunk 030: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk030.csv


Extracting company details:  91%|█████████▏| 62001/67899 [2:10:14<10:36,  9.27company/s]

Saved partial chunk 031: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk031.csv


Extracting company details:  94%|█████████▍| 64003/67899 [2:14:27<08:21,  7.77company/s]

Saved partial chunk 032: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk032.csv


Extracting company details:  97%|█████████▋| 66000/67899 [2:18:39<05:39,  5.59company/s]

Saved partial chunk 033: 2000 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk033.csv


Extracting company details: 100%|██████████| 67899/67899 [2:22:38<00:00,  7.93company/s]


Saved partial chunk 034: 1899 rows → /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map/details_from_map_20250930_134410_chunk034.csv
✅ Done: 67899 rows → hamburg_branchenbuch_companies_details_from_map_20250930_134410.csv in 142.7 min.


Unnamed: 0,name,address,phone,email,website,latitude,longitude,image,category,letter_section,categories_all,letter_sections_all,source_url,scraped_at
0,Abbruchunternehmen Peter Frank,,49407219189.0,postfach@abbruch-frank.de,http://www.abbruch-frank.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b""]",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.623791
1,PBR Peter Beuck Recycling GmbH,,494020905925.0,info@peter-beuck.de,https://peter-beuck.de/,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Auto & Verkehr"", ""Baue...","[""a"", ""b"", ""c"", ""e"", ""h"", ""l"", ""m"", ""o"", ""p"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.829827
2,DCD Schröder & Kindler GbR,,494080007474.0,moin@dcd24.de,http://www.dcd24.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b"", ""e"", ""h"", ""k"", ""l"", ""m"", ""o"", ""r"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.522036
3,Abbruchunternehmen Hermann Mock,,49405235288.0,info@abbruch-mock.de,http://www.abbruch-mock.de/,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Baudienstleistungen"", ...","[""a"", ""b"", ""e"", ""o"", ""s""]",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.790108
4,André Howeiler,,4917661743982.0,,https://www.koeln.de/,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b"", ""k""]",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.874438


In [4]:
# 5) Combine all from_map chunks, clean wrong websites, and export consolidated CSVs
import os, glob
from datetime import datetime
import pandas as pd

# Directory containing partial chunks written by the backfill
FROM_MAP_DIR = os.path.join(os.getcwd(), "data", "hh_branchenbuch_checkpoints", "from_map")
assert os.path.isdir(FROM_MAP_DIR), f"Directory not found: {FROM_MAP_DIR}"

# Prefer only chunk files; if none exist, fall back to any CSVs in the directory
chunk_pattern = os.path.join(FROM_MAP_DIR, "details_from_map_*_chunk*.csv")
chunk_files = sorted(glob.glob(chunk_pattern))
if not chunk_files:
    chunk_files = sorted(glob.glob(os.path.join(FROM_MAP_DIR, "*.csv")))
    print("No explicit chunk files found; falling back to all CSVs in from_map directory.")

print(f"Discovered {len(chunk_files)} CSV file(s) to merge from: {FROM_MAP_DIR}")
if not chunk_files:
    raise FileNotFoundError("No CSV files found to merge. Ensure the backfill created chunk files in from_map.")

# Read and concatenate
frames = []
for p in chunk_files:
    try:
        if os.path.getsize(p) > 0:
            frames.append(pd.read_csv(p))
    except Exception as e:
        print(f"[warn] Skipping {p}: {e}")

if not frames:
    raise RuntimeError("Found CSVs but none could be loaded. Inspect logs above.")

merged = pd.concat(frames, ignore_index=True)
print(f"Merged rows before de-duplication: {len(merged):,}")

# Normalize expected columns to avoid KeyErrors downstream
EXPECTED_COLS = [
    "name","address","phone","email","website","latitude","longitude","image",
    "category","letter_section","categories_all","letter_sections_all",
    "source_url","scraped_at"
]
for c in EXPECTED_COLS:
    if c not in merged.columns:
        merged[c] = ""

# Drop duplicate companies by source_url (keep the last occurrence in case later chunks contain improved data)
if "source_url" in merged.columns:
    merged = merged.drop_duplicates(subset=["source_url"], keep="last").reset_index(drop=True)

# Clean incorrect website placeholder: delete website value where it's exactly https://www.koeln.de/
# (Keep rows; just clear the incorrect website field)
if "website" in merged.columns:
    w = merged["website"].astype(str).str.strip()
    mask_wrong = w.str.lower().eq("https://www.koeln.de/")
    merged.loc[mask_wrong, "website"] = ""
    print(f"Cleared website for {mask_wrong.sum():,} row(s) with placeholder https://www.koeln.de/ .")

# Reorder columns (optional but consistent)
merged = merged[EXPECTED_COLS]

# Save outputs in workspace root
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_all = f"hamburg_branchenbuch_companies_details_from_map_{ts}_all.csv"
out_with_sites = f"hamburg_branchenbuch_companies_details_from_map_{ts}_with_websites.csv"

merged.to_csv(out_all, index=False)
with_web = merged[merged["website"].astype(str).str.strip().ne("")].copy()
with_web.to_csv(out_with_sites, index=False)

print(f"Saved ALL entries: {len(merged):,} rows → {out_all}")
print(f"Saved entries WITH websites: {len(with_web):,} rows → {out_with_sites}")

display(merged.head(3))
display(with_web.head(3))

Discovered 34 CSV file(s) to merge from: /home/thiesen/Documents/AI-Innoscence_Ecosystem/data/hh_branchenbuch_checkpoints/from_map
Merged rows before de-duplication: 67,899
Cleared website for 28,392 row(s) with placeholder https://www.koeln.de/ .
Saved ALL entries: 67,899 rows → hamburg_branchenbuch_companies_details_from_map_20250930_162445_all.csv
Saved entries WITH websites: 39,507 rows → hamburg_branchenbuch_companies_details_from_map_20250930_162445_with_websites.csv


Unnamed: 0,name,address,phone,email,website,latitude,longitude,image,category,letter_section,categories_all,letter_sections_all,source_url,scraped_at
0,Abbruchunternehmen Peter Frank,,49407219189.0,postfach@abbruch-frank.de,http://www.abbruch-frank.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b""]",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.623791
1,PBR Peter Beuck Recycling GmbH,,494020905925.0,info@peter-beuck.de,https://peter-beuck.de/,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Auto & Verkehr"", ""Baue...","[""a"", ""b"", ""c"", ""e"", ""h"", ""l"", ""m"", ""o"", ""p"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.829827
2,DCD Schröder & Kindler GbR,,494080007474.0,moin@dcd24.de,http://www.dcd24.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b"", ""e"", ""h"", ""k"", ""l"", ""m"", ""o"", ""r"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.522036


Unnamed: 0,name,address,phone,email,website,latitude,longitude,image,category,letter_section,categories_all,letter_sections_all,source_url,scraped_at
0,Abbruchunternehmen Peter Frank,,49407219189.0,postfach@abbruch-frank.de,http://www.abbruch-frank.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b""]",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.623791
1,PBR Peter Beuck Recycling GmbH,,494020905925.0,info@peter-beuck.de,https://peter-beuck.de/,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Auto & Verkehr"", ""Baue...","[""a"", ""b"", ""c"", ""e"", ""h"", ""l"", ""m"", ""o"", ""p"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.829827
2,DCD Schröder & Kindler GbR,,494080007474.0,moin@dcd24.de,http://www.dcd24.de,,,,Abbruchunternehmen,a,"[""Abbruchunternehmen"", ""Bauen & Wohnen"", ""Bauu...","[""a"", ""b"", ""e"", ""h"", ""k"", ""l"", ""m"", ""o"", ""r"", ...",https://www.hamburg.de/branchenbuch/hamburg/ei...,2025-09-30T11:44:11.522036
