# UmweltPartner – Namen aus PDF extrahieren

Dieses Notebook lädt die UmweltPartner-Partnerliste als PDF, extrahiert die Firmennamen und speichert sie als CSV.

- Quelle (PDF): https://www.hamburg.de/resource/blob/288634/d7c60a4a7225771269ea685876377238/d-partnerliste-data.pdf
- Ausgabe (Namen): data/umweltpartner_names.csv
- Optional: URL-Anreicherung via DuckDuckGo für firmeneigene Websites (data/umweltpartner_enriched.csv)

Hinweise:
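- Die Namen liegen im PDF überwiegend als Aufzählung (Bullet-Liste) vor. Wir fassen zuerst die Aufzählungspunkte samt umbrochener Folgezeilen zusammen und fallen nur dann auf eine zeilenweise Heuristik zurück, wenn zu wenige Einträge erkannt werden.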
- Social- und Aggregator-Domains werden beim URL-Matching gefiltert.
- Bitte gehen Sie beim Abruf schonend mit den abgefragten Diensten um (beim Enrichment wird zwischen den Suchanfragen kurz pausiert).

In [9]:
import io, re, time, pathlib, sys
from typing import Iterable, List
import requests
import pandas as pd
from pypdf import PdfReader

# Source document: the published UmweltPartner partner list (PDF).
PDF_URL = 'https://www.hamburg.de/resource/blob/288634/d7c60a4a7225771269ea685876377238/d-partnerliste-data.pdf'

# All generated files live below ./data; the directory is created on first run.
OUTPUT_DIR = pathlib.Path('data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
NAMES_CSV = OUTPUT_DIR.joinpath('umweltpartner_names.csv')
ENRICHED_CSV = OUTPUT_DIR.joinpath('umweltpartner_enriched.csv')

# Feature switches for the two notebook stages.
RUN_EXTRACT = True  # extract the names from the PDF
RUN_ENRICH = True  # look up URLs via DuckDuckGo (optional)

# Browser-like request headers sent with every request of the shared session.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36',
    'Accept-Language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
}

# One shared HTTP session that carries the headers above.
session = requests.Session()
session.headers.update(headers)

def fetch_pdf_bytes(url: str) -> bytes:
    """Download *url* with the shared session and return the raw body.

    Raises whatever ``raise_for_status`` raises for a non-success response;
    the request itself is limited by a 60 second timeout.
    """
    response = session.get(url, timeout=60)
    response.raise_for_status()
    return response.content

def clean(s: str | None) -> str:
    if not s: return ''
    return re.sub(r'\s+', ' ', s).strip()

# Legal-form / organisation keywords; a line containing one of them as a
# whole word is accepted as a company name.
ORG_SUFFIX = {
    'gmbh','mbh','kg','co','ag','se','ug','eg','ev','e.v.','mbb','llp','ltd','inc','stiftung','verein',
    'partnerschaft','partner','rechtsanwälte','rechtsanwaelte','anwälte','anwaelte','gbr','ohg','kgaa'
}

# Lower-case fragments that mark a line as page header/footer text.
HEADER_TOKENS = {
    'liste der umweltpartner',
    'stand:',
    'stand dezember',
    'hamburg.de',
}

# Fragments that mark a line as contact data (URL, e-mail, phone) instead of a name.
_CONTACT_TOKENS = ('www.', 'http://', 'https://', '@', 'telefon', 'fax', 'tel.', 'mail')

# Street keywords. 'str.' ends in a non-word character, so it must not be
# followed by \b (that would only match 'str.5', never 'Str. 5'); the other
# keywords keep their whole-word match.
_ADDRESS_RE = re.compile(
    r'\b(?:str\.|(?:strasse|straße|weg|allee|platz|chaussee|damm|ring|ufer|markt)\b)'
)

_WORD_RE = re.compile(r'\w+')
_NAME_TOKEN_SPLIT_RE = re.compile(r'[^A-Za-zÄÖÜäöüß0-9&\-\+\.]+')


def looks_like_name(line: str) -> bool:
    """Heuristically decide whether *line* is an organisation name.

    A line is rejected when it is empty, contains a header fragment, contains
    contact data, or contains a street keyword.  Otherwise it is accepted when
    it contains a legal-form keyword from ``ORG_SUFFIX`` as a whole word, or
    when at least two of its tokens start with an upper-case letter.

    Keywords are compared against whole words (not substrings), so short
    entries such as 'se' or 'co' no longer match inside unrelated words like
    'Seite' or 'contact'.  Dotted abbreviations ('e.V.', 'Co.KG') are handled
    by also looking at the words of the line with all dots removed.
    """
    if not line:
        return False
    low = line.lower()
    if any(tok in low for tok in HEADER_TOKENS):
        return False
    if any(tok in low for tok in _CONTACT_TOKENS):
        return False
    # likely address lines
    if _ADDRESS_RE.search(low):
        return False

    # Whole-word lookup of legal-form keywords.
    words = set(_WORD_RE.findall(low))
    words.update(_WORD_RE.findall(low.replace('.', '')))
    if not ORG_SUFFIX.isdisjoint(words):
        return True

    # Fallback: at least two capitalised tokens.
    parts = _NAME_TOKEN_SPLIT_RE.split(line)
    capitalised = sum(1 for part in parts if part and part[0].isupper())
    return capitalised >= 2

def extract_text_lines(pdf_bytes: bytes) -> list[str]:
    """Return every non-empty, whitespace-normalised text line of the PDF.

    Pages are read in document order; each page's extracted text is split on
    newlines after Windows/Mac line endings have been unified.
    """
    collected: list[str] = []
    for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
        text = page.extract_text() or ''
        # unify '\r\n' and bare '\r' to '\n' before splitting
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        normalised = (clean(raw) for raw in text.split('\n'))
        collected.extend(entry for entry in normalised if entry)
    return collected

# --- Bullet handling: glyphs that may introduce a list item ---
BULLET_CHARS = {
    '•',
    '-',
    '–',
    '—',
}


def is_bullet_start(line: str) -> bool:
    """Return True when *line*, ignoring leading whitespace, starts with a bullet glyph."""
    return line.lstrip().startswith(tuple(BULLET_CHARS))


def strip_bullet(line: str) -> str:
    """Remove one leading bullet glyph from *line*.

    Leading whitespace is always dropped.  When a bullet glyph follows, that
    single character is removed and the remainder is stripped on both sides;
    otherwise the left-stripped line is returned as is.
    """
    text = line.lstrip()
    if text[:1] in BULLET_CHARS:
        return text[1:].strip()
    return text

def merge_hyphenation(a: str, b: str) -> str:
    """Join a wrapped line *b* onto *a*, undoing line-break hyphenation.

    When *a* ends with '-', the hyphen is dropped and *b* (left-stripped) is
    appended without a space.  Otherwise the two parts are joined with a
    space, spaced hyphen artifacts such as 'ge - sellschaft' are closed up to
    'gesellschaft', and the result is whitespace-normalised.
    """
    if not a.endswith('-'):
        # close up 'x - y' artifacts anywhere in the joined text
        combined = re.sub(r'(\w)\s+-\s+(\w)', r'\1\2', f'{a} {b}')
        return clean(combined)
    return a[:-1] + b.lstrip()

def consolidate_bullets(lines: list[str]) -> list[str]:
    """Group text lines into one string per bullet item.

    A line that starts with a bullet glyph opens a new item; every following
    non-bullet line is treated as a wrapped continuation and merged into the
    open item via ``merge_hyphenation``.  Lines before the first bullet are
    ignored (treated as header text).

    An item stays open even when the bullet line carried no text of its own
    (bullet glyph alone on a line), so the text on the following lines is
    kept instead of being silently discarded.

    Returns the items whitespace-normalised, without surrounding bullet
    glyphs and without trailing ',' / ';'.  Items that end up empty are
    dropped.
    """
    items: list[str] = []
    current: str | None = None  # None means: no bullet seen yet
    for line in lines:
        if is_bullet_start(line):
            # flush the previous item before opening the next one
            if current:
                items.append(current)
            current = strip_bullet(line)
        elif current is not None:
            # continuation of the open item ('' is a valid, still-open item)
            current = merge_hyphenation(current, line)
    if current:
        items.append(current)

    # final cleanup per item
    cleaned: list[str] = []
    for item in items:
        text = clean(item).strip(' •\t').rstrip(',;')
        if text:
            cleaned.append(text)
    return cleaned


def extract_names_from_text(pdf_bytes: bytes) -> list[str]:
    """Extract candidate organisation names from the PDF text.

    Consolidated bullet items are used when more than 50 were found;
    otherwise the raw text lines that pass ``looks_like_name`` are used.
    Every candidate is then normalised and post-filtered: empty strings,
    header fragments, bare alphanumeric/slash tokens and anything that does
    not pass ``looks_like_name`` are discarded.
    """
    lines = extract_text_lines(pdf_bytes)
    bullet_items = consolidate_bullets(lines)

    def acceptable(text: str) -> bool:
        # post-filter applied to each normalised candidate
        if not text:
            return False
        lowered = text.lower()
        if any(tok in lowered for tok in HEADER_TOKENS):
            return False
        if re.fullmatch(r'[0-9A-Za-z/]+', text):
            return False
        return looks_like_name(text)

    candidates: list[str]
    if len(bullet_items) > 50:
        # most pages use bullets; if many, prefer them
        candidates = bullet_items
    else:
        # fallback to line-by-line heuristic
        candidates = [entry for entry in lines if looks_like_name(entry)]

    normalised = (clean(candidate) for candidate in candidates)
    return [text for text in normalised if acceptable(text)]


def extract_unique_names(pdf_bytes: bytes) -> list[str]:
    """Return the distinct organisation names of the PDF in first-seen order.

    Each name is normalised first (whitespace collapsed, trailing ',' / ';'
    removed) and de-duplicated afterwards, so variants that only differ in
    such noise ('Foo GmbH,' vs. 'Foo GmbH') collapse into one entry.  Names
    that are empty after normalisation are dropped.
    """
    normalised = (
        clean(name).rstrip(',;').rstrip()
        for name in extract_names_from_text(pdf_bytes)
    )
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(name for name in normalised if name))

# Stage 1: download the PDF, extract the names and write them to NAMES_CSV.
if not RUN_EXTRACT:
    print('Extraction disabled (set RUN_EXTRACT=True to run).')
else:
    pdf_bytes = fetch_pdf_bytes(PDF_URL)
    names = extract_unique_names(pdf_bytes)
    df = pd.DataFrame({'name': names})
    df.to_csv(NAMES_CSV, index=False)
    print(f'✓ Extracted {len(df)} names from PDF')
    print(f'✓ Saved to {NAMES_CSV}')
    # preview of the first rows in the notebook output
    display(df.head(20))

✓ Extracted 1621 names from PDF
✓ Saved to data/umweltpartner_names.csv


Unnamed: 0,name
0,8.2 Renewable Energy Experts Hamburg GmbH
1,A & F Drucklufttechnik GmbH
2,A + S Antriebstechnik + Spannsysteme Vertriebs...
3,a. hartrodt (GmbH & Co) KG
4,A. Schmidt & Co. GmbH
5,A. Witt & Co. GmbH
6,A.W. Niemeyer GmbH
7,A.Walther & co. (GmbH & Co.) Spedition
8,A201 GmbH
9,abasto- Gesellschaft für re generativen und ra...


In [13]:
# Optional: URL-Enrichment via DuckDuckGo (prefer `ddgs`, fallback to `duckduckgo_search`)
import warnings
import atexit
from urllib.parse import urlparse

# Resolve the DuckDuckGo client class: the renamed package `ddgs` is tried
# first, the legacy `duckduckgo_search` second.  HAS_DDG / DDG_PROVIDER record
# which one (if any) could be imported.
HAS_DDG = False
DDG_PROVIDER = None
try:
    from ddgs import DDGS  # pip install ddgs
except Exception:
    try:
        from duckduckgo_search import DDGS  # legacy package name
    except Exception:
        # neither package is importable; enrichment stays unavailable
        HAS_DDG = False
        DDG_PROVIDER = None
    else:
        # Silence runtime rename warning from legacy package
        warnings.filterwarnings(
            'ignore',
            message=r'This package.*renamed to ddgs',
            category=RuntimeWarning,
        )
        HAS_DDG = True
        DDG_PROVIDER = 'duckduckgo_search'
else:
    HAS_DDG = True
    DDG_PROVIDER = 'ddgs'

# Optional: hide ResourceWarning noise in the notebook output.
SUPPRESS_SOCKET_WARNINGS = True
if SUPPRESS_SOCKET_WARNINGS:
    warnings.simplefilter('ignore', category=ResourceWarning)

# Domains that are never accepted as a company's own website:
# social networks / encyclopedias ...
SOCIAL = {
    'linkedin.com',
    'facebook.com',
    'twitter.com',
    'x.com',
    'instagram.com',
    'youtube.com',
    'xing.com',
    'wikipedia.org',
}
# ... and directories, portals and job boards.
AGGREGATORS = {
    'hamburg.de',
    'gelbeseiten.de',
    'meinestadt.de',
    'kompass.com',
    'kununu.com',
    'stepstone.de',
    'indeed.de',
    'yelp.de',
}

# One DDGS client is created lazily and reused for the whole notebook so that
# not every search opens its own connection.
_ddg_client = None


def _get_ddg_client():
    """Return the shared DDGS client, creating it on first use.

    Returns ``None`` when no DuckDuckGo package is available or when the
    client could not be constructed (construction is retried on the next call).
    """
    global _ddg_client
    if HAS_DDG and _ddg_client is None:
        try:
            _ddg_client = DDGS()
        except Exception:
            _ddg_client = None
    return _ddg_client if HAS_DDG else None

@atexit.register
def _close_ddg_client():
    """Close and forget the shared DDGS client at interpreter exit (best effort)."""
    global _ddg_client
    client, _ddg_client = _ddg_client, None
    if client is None:
        return
    try:
        client.close()
    except Exception:
        # closing is best effort; errors during shutdown are ignored
        pass

def is_good_candidate(url: str, name_tokens: list[str]) -> bool:
    """Return True when *url* plausibly is the organisation's own website.

    The URL qualifies when its host
    * is neither a blocked domain from ``SOCIAL`` / ``AGGREGATORS`` nor a
      sub-domain of one,
    * ends in '.de', '.com' or '.eu', and
    * contains at least one of *name_tokens* in its second-level label
      (the label directly left of the TLD).

    Blocked domains are compared on label boundaries, so 'hamburg.de' blocks
    'hamburg.de' and 'service.hamburg.de' but not 'firma-hamburg.de', and
    'x.com' does not block 'max.com'.
    """
    # .hostname is lower-cased and carries neither port nor credentials
    host = (urlparse(url).hostname or '').removeprefix('www.')
    if not host:
        return False
    if any(host == dom or host.endswith('.' + dom) for dom in SOCIAL | AGGREGATORS):
        return False
    # prefer .de/.com/.eu and alignment with name tokens
    if not host.endswith(('.de', '.com', '.eu')):
        return False
    # second-level label, e.g. 'firma' for both 'firma.de' and 'shop.firma.de'
    sld = host.split('.')[-2]
    return any(tok in sld for tok in name_tokens)

# Words that carry no identifying information for matching a name to a domain.
_TOKEN_STOPWORDS = frozenset({
    'hamburg', 'gmbh', 'mbh', 'kg', 'ag', 'se', 'ug', 'eg', 'ev', 'mbb', 'llp',
    'ltd', 'inc', 'partnerschaft', 'stiftung', 'verein',
})

# German umlauts / sharp s in the ASCII spelling used for transliteration.
_UMLAUT_MAP = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})


def tokens(name: str) -> list[str]:
    """Split an organisation name into lower-case ASCII matching tokens.

    The name is lower-cased, umlauts and 'ß' are transliterated ('ü' -> 'ue',
    'ß' -> 'ss') so that words are not torn apart at those characters, and the
    result is split on every run of characters other than a-z / 0-9.  Tokens
    of one or two characters and stopwords (legal forms, 'hamburg') are
    dropped.
    """
    folded = name.lower().translate(_UMLAUT_MAP)
    return [
        tok
        for tok in re.split(r'[^a-z0-9]+', folded)
        if len(tok) > 2 and tok not in _TOKEN_STOPWORDS
    ]

def ddg_text(query: str, max_results: int = 6, region: str = 'de-de'):
    """Run a DuckDuckGo text search and return the hits as a list.

    Thin wrapper so both provider packages are used the same way.  Returns an
    empty list when no provider/client is available or when the search raises.
    """
    client = _get_ddg_client() if HAS_DDG else None
    if client is None:
        return []
    try:
        hits = client.text(query, region=region, max_results=max_results)
        return list(hits)
    except Exception:
        # best effort: a failed search counts as "no results"
        return []

def search_best_website(name: str) -> tuple[str | None, str | None]:
    """Search for the website of organisation *name*.

    Returns ``(url, title)`` of the first search hit accepted by
    ``is_good_candidate``.  When no hit qualifies, the first hit whose host is
    not a blocked social/aggregator domain (or a sub-domain of one) is
    returned as a fallback.  Returns ``(None, None)`` when no provider is
    available or nothing usable was found.

    Blocked domains are compared on label boundaries in the fallback, so
    hosts such as 'firma-hamburg.de' or 'max.com' are not rejected merely
    because they contain 'hamburg.de' / 'x.com' as a substring.
    """
    if not HAS_DDG:
        return None, None
    query = f"{name} Hamburg Website"
    name_tokens = tokens(name)
    results = ddg_text(query, region='de-de', max_results=6)

    blocked = SOCIAL | AGGREGATORS
    fallback: tuple[str | None, str | None] = (None, None)
    for hit in results:
        url = hit.get('href') or hit.get('url')
        if not url:
            continue
        title = hit.get('title') or hit.get('body') or ''
        if is_good_candidate(url, name_tokens):
            return url, title
        # remember the first non-social, non-aggregator hit as fallback
        if fallback[0] is None:
            host = (urlparse(url).hostname or '').removeprefix('www.')
            if not any(host == dom or host.endswith('.' + dom) for dom in blocked):
                fallback = (url, title)
    return fallback

def enrich_with_websites(names: Iterable[str]) -> pd.DataFrame:
    """Look up a website for every name and return the results as a DataFrame.

    The frame has the columns ``name``, ``website_url`` and ``website_title``.
    Progress is printed after every tenth name, and the loop pauses 0.3 s
    after each lookup.
    """
    records = []
    processed = 0
    for company in names:
        processed += 1
        url, title = search_best_website(company)
        records.append(dict(name=company, website_url=url, website_title=title))
        if not processed % 10:
            print(f'  processed {processed}…')
        time.sleep(0.3)  # be polite
    return pd.DataFrame(records)

# Stage 2: read the extracted names, search a website for each one and write
# the result to ENRICHED_CSV.
if not RUN_ENRICH:
    print('Enrichment disabled (set RUN_ENRICH=True to run).')
else:
    if NAMES_CSV.exists():
        base_df = pd.read_csv(NAMES_CSV)
    else:
        base_df = pd.DataFrame({'name': []})

    if not HAS_DDG:
        print('ddgs/duckduckgo_search not available; install ddgs or set RUN_ENRICH=False')
    elif base_df.empty:
        print('No names CSV found or empty; run extraction first.')
    else:
        enr = enrich_with_websites(base_df['name'].tolist())
        # keep first non-null per name: rows without URL sort last within a name
        enr = enr.sort_values(by=['name', 'website_url'], na_position='last')
        enr = enr.drop_duplicates(subset=['name'], keep='first')
        enr.to_csv(ENRICHED_CSV, index=False)
        print(f'✓ Enriched {len(enr)} rows; saved to {ENRICHED_CSV}')
        display(enr.head(20))

  processed 10…
  processed 20…
  processed 20…
  processed 30…
  processed 30…
  processed 40…
  processed 40…
  processed 50…
  processed 50…
  processed 60…
  processed 60…
  processed 70…
  processed 70…
  processed 80…
  processed 80…
  processed 90…
  processed 90…
  processed 100…
  processed 100…
  processed 110…
  processed 110…
  processed 120…
  processed 120…
  processed 130…
  processed 130…
  processed 140…
  processed 140…
  processed 150…
  processed 150…
  processed 160…
  processed 160…
  processed 170…
  processed 170…
  processed 180…
  processed 180…
  processed 190…
  processed 190…
  processed 200…
  processed 200…
  processed 210…
  processed 210…
  processed 220…
  processed 220…
  processed 230…
  processed 230…
  processed 240…
  processed 240…
  processed 250…
  processed 250…
  processed 260…
  processed 260…
  processed 270…
  processed 270…
  processed 280…
  processed 280…
  processed 290…
  processed 290…
  processed 300…
  processed 300…
  processed 31

Unnamed: 0,name,website_url,website_title
0,8.2 Renewable Energy Experts Hamburg GmbH,https://www.zhihu.com/tardis/bd/ans/32050636875,骁龙 8 Gen3 和骁龙 8 至尊版的差距有多大? - 知乎
1,A & F Drucklufttechnik GmbH,https://www.zhihu.com/question/584414009,函数符号 f 和 f (·) 和 f (-) 有什么区别？ - 知乎
2,A + S Antriebstechnik + Spannsysteme Vertriebs...,https://www.s-bahn-muenchen.de/de,"S -Bahn München - MVV-Fahrplan, Bahn Tickets, ..."
4,A. Schmidt & Co. GmbH,https://www.a-schmidtco.de/,Ihr Partner in der Lebensmittelindustrie - A.S...
5,A. Witt & Co. GmbH,https://www.witt-weiden.de/,Witt - Ihr Online Shop für Damenmode & Wäsche
6,A.W. Niemeyer GmbH,https://stackoverflow.com/questions/2477452/â€...,"""â€™"" showing on page instead of - Stack Overflow"
7,A.Walther & co. (GmbH & Co.) Spedition,https://www.awalther-co.de/,A. Walther & Co. - Spedition Hamburg (Transpor...
8,A201 GmbH,https://www.hamburg-web.de/guide/rubrik/bauunt...,Bauunternehmen Hamburg - 38 Unternehmen im Ham...
11,ABC Reifendienst Bernd Hartmann Kfz. Meisterbe...,https://kfz-serviceportal.de/werkstatt/hamburg...,ABC Reifendienst Bernd Hartmann in 22339 Hambu...
13,ABS – Jachtservice Andreas Dose,https://www.europages.co.uk/ABSJACHTSERVICE/00...,"ABS-JACHTSERVICE in Hamburg, Naval constructio..."
