# RETech Mitglieder – Partner-Links extrahieren

Dieses Notebook lädt die Mitglieder-Seite von RETech, extrahiert pro Karte den Namen und den externen Link ("Zur Website") und speichert die Ergebnisse als CSV.

- Quelle: https://retech-germany.net/mitglieder/
- Ausgabe: data/retech_mitglieder.csv

Hinweise:
- Die Seite lädt visuell weitere Karten beim Scrollen. Wir umgehen dies, indem wir zusätzlich auf mögliche Folgeseiten (page/2, page/3, …) prüfen und alle gefundenen Karten zusammensetzen. Falls die Seite serverseitig bereits alle Karten liefert, reicht ein Request.
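- Pro Karte wird der Name aus einer Überschrift (H2/H3/H4) des umgebenden Containers übernommen; bei mehreren Kandidaten gewinnt die längste, generische Taxonomie-Überschriften werden übersprungen. Der externe Link wird per Scoring ausgewählt, wobei Links mit Beschriftungen wie "Zur Website" bevorzugt werden. Interne Navigations-Links (retech-germany.net) sowie Social-Profile werden gefiltert.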
- Doppelte Einträge (gleicher Name und gleiche URL) werden entfernt.

In [1]:
import re
import time
import pathlib
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Entry page of the member directory; pagination URLs are built relative to it.
BASE_URL = "https://retech-germany.net/mitglieder/"
# Output location; the directory is created as a side effect of running this cell.
OUTPUT_DIR = pathlib.Path("data")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = OUTPUT_DIR / "retech_mitglieder.csv"

# Request headers: a desktop Chrome User-Agent and a German-first Accept-Language.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
    "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
}

# Shared HTTP session that sends the headers above with every request.
session = requests.Session()
session.headers.update(headers)

# Hosts that are never accepted as a member's website (social profiles, short links).
SOCIAL_DOMAINS = {
    "linkedin.com", "facebook.com", "twitter.com", "x.com", "youtube.com", "instagram.com",
    "xing.com", "kununu.com", "google.com", "goo.gl", "bit.ly", "t.me"
}
# Lower-case link-label fragments that mark an anchor as a "visit website" link.
WEBSITE_TOKENS = ["zur website", "website", "webseite", "homepage", "mehr erfahren", "mehr infos"]
# Lower-case fragments; a heading containing any of them (substring match) is
# treated as generic page/taxonomy text rather than a member name.
IGNORED_HEADING_TOKENS = {
    "retech", "mitglieder", "jetzt", "kontakt", "additional", "links",
    "tätigkeitsbereich", "wertschöpfungskettenbereich", "forschung", "entwicklung",
    "beratung", "planung", "logistik", "management", "rechtlicher", "rahmen", "epr",
    "maschinen", "anlagenbau", "abfallmanagement", "markterweiterung", "exportförderung",
    "sonstiges"
}


def clean_text(s: str | None) -> str:
    if not s:
        return ""
    s = re.sub(r"\s+", " ", s)
    return s.strip()


def normalize_url(u: str | None) -> str | None:
    if not u:
        return None
    u = u.strip()
    # Fix double schemes like https://https://...
    u = re.sub(r"^(https?://)+", "https://", u, flags=re.I)
    # Add scheme if missing
    if not re.match(r"^https?://", u, re.I):
        if u.startswith("//"):
            u = "https:" + u
        else:
            u = "https://" + u.lstrip("/")
    return u


def is_social_url(url: str) -> bool:
    """Return True when *url*'s host is a domain from SOCIAL_DOMAINS.

    A host matches when it equals a listed domain or is a subdomain of one
    (``de.linkedin.com``). Plain substring matching is avoided on purpose:
    it would flag unrelated hosts such as ``fedex.com`` because of ``x.com``.
    """
    # hostname (unlike netloc) excludes port and userinfo and is lower-cased.
    host = (urlparse(url).hostname or "").lower()
    return any(host == dom or host.endswith("." + dom) for dom in SOCIAL_DOMAINS)


def is_external(url: str) -> bool:
    """Tell whether *url* is absolute and does not point at retech-germany.net."""
    parts = urlparse(url)
    # Relative or scheme-less URLs can never count as external.
    if not (parts.scheme and parts.netloc):
        return False
    return 'retech-germany.net' not in parts.netloc.lower()


def normalize_company_name(name: str | None) -> str | None:
    """Return *name* with common legal-form suffixes removed, for matching.

    Whitespace is normalized before and after stripping. Falsy input returns
    None; a name consisting only of a suffix yields an empty string.
    """
    if not name:
        return None
    n = clean_text(name)
    # Strip common company suffixes for matching. The trailing check is
    # (?!\w) rather than \b so that "e.V." is removed including its final
    # dot; \b cannot match between "." and a following space or end of text.
    n = re.sub(
        r"\b(?:gmbh & co\.? kg|gmbh & co|gmbh|ag|kg|ug|e\.v\.|e\.v|ev|mbh)(?!\w)",
        "",
        n,
        flags=re.I,
    )
    return clean_text(n)


def nearest_heading_in_ancestors(node) -> str | None:
    """Find a heading text in *node* or one of its ancestors.

    Starting at *node*, at most seven levels are inspected (the node itself
    plus up to six parents). At the first level holding at least one usable
    h2/h3/h4 (non-empty text without any IGNORED_HEADING_TOKENS fragment),
    the longest usable heading text is returned. Returns None if no level
    yields a usable heading.
    """
    current = node
    for _level in range(7):
        if current is None:
            break
        if hasattr(current, 'find_all'):
            texts = (clean_text(h.get_text()) for h in current.find_all(['h2','h3','h4']))
            # Keep non-empty headings that are not generic taxonomy labels.
            usable = [
                text for text in texts
                if text and not any(tok in text.lower() for tok in IGNORED_HEADING_TOKENS)
            ]
            if usable:
                return max(usable, key=len)
        current = current.parent
    return None


def nearest_heading_text(node) -> str | None:
    """Resolve a heading text for *node*.

    The ancestor-based lookup is tried first. If it finds nothing, the
    document is scanned backwards from *node* and the first h2/h3/h4 with
    non-empty text and no IGNORED_HEADING_TOKENS fragment is returned.
    Returns None when neither strategy succeeds.
    """
    from_ancestors = nearest_heading_in_ancestors(node)
    if from_ancestors:
        return from_ancestors
    for element in node.previous_elements:
        if getattr(element, 'name', None) not in ('h2','h3','h4'):
            continue
        text = clean_text(element.get_text())
        if not text:
            continue
        # Generic taxonomy headings are not member names; keep looking.
        if any(tok in text.lower() for tok in IGNORED_HEADING_TOKENS):
            continue
        return text
    return None


def score_anchor(a, base_url: str, name_hint: str | None = None) -> tuple[int, str]:
    """Rate how likely anchor *a* links to a member's own website.

    Returns ``(score, url)``. The score is -1 when the href cannot be
    normalized (url is then ``''``) or when the resolved URL is internal or
    a social link. Otherwise points are added for: a label containing a
    WEBSITE_TOKENS fragment (+4), a domain-like pattern in the URL (+2),
    ``rel`` containing "external" (+1), ``target="_blank"`` (+1), and the
    host's second-to-last label appearing in the normalized *name_hint* (+4).
    """
    target = normalize_url(urljoin(base_url, a.get('href') or ''))
    if not target:
        return (-1, '')
    if not is_external(target) or is_social_url(target):
        return (-1, target)

    link_text = clean_text(a.get_text()).lower()
    points = 4 if any(tok in link_text for tok in WEBSITE_TOKENS) else 0

    # Favor domains
    if re.search(r"([a-z0-9-]+\.)+[a-z]{2,}", target, re.I):
        points += 2

    # Attributes
    rel = a.get('rel')
    if rel and 'external' in ' '.join(rel).lower():
        points += 1
    if a.get('target') == '_blank':
        points += 1

    # If company token appears in domain, boost strongly
    if name_hint:
        hint = normalize_company_name(name_hint).lower()
        hostname = urlparse(target).netloc.lower()
        labels = hostname.split('.')
        second_level = labels[-2] if len(labels) > 1 else hostname
        if second_level and second_level in hint:
            points += 4
    return (points, target)


def find_card_container_for_heading(h):
    """Return the closest element around heading *h* that contains a link.

    *h* itself and up to six of its ancestors are checked; the first one
    whose subtree holds at least one ``<a href>`` is returned. If none
    qualifies, the direct parent of *h* is returned (None when *h* is falsy).
    """
    node = h
    remaining = 7
    while remaining > 0 and node is not None:
        if hasattr(node, 'find_all') and len(node.find_all('a', href=True)) > 0:
            return node
        node = node.parent
        remaining -= 1
    return h.parent if h else None


def extract_from_headings(soup: BeautifulSoup) -> list[dict]:
    """Build member rows by starting from every h2/h3/h4 in *soup*.

    A heading is used as a member name when its cleaned text is 2-200
    characters long and contains no IGNORED_HEADING_TOKENS fragment. Links
    are collected from the heading's card container; if none scores >= 0,
    the heading's next siblings (at most eleven) are searched instead. The
    highest-scoring URL wins, ties going to the earliest link. Rows are
    unique per (name, url).
    """

    def scored_links(scope, hint):
        # Acceptable (score, url) pairs for every anchor below *scope*.
        scored = (
            score_anchor(anchor, BASE_URL, name_hint=hint)
            for anchor in scope.find_all('a', href=True)
        )
        return [(points, url) for points, url in scored if points >= 0]

    results: list[dict] = []
    emitted: set[tuple[str, str]] = set()
    for heading in soup.find_all(['h2','h3','h4']):
        title = clean_text(heading.get_text())
        if not title or not (2 <= len(title) <= 200):
            continue
        lowered = title.lower()
        if any(tok in lowered for tok in IGNORED_HEADING_TOKENS):
            continue
        container = find_card_container_for_heading(heading)
        if not container:
            continue

        options = scored_links(container, title)
        # Look in a few following siblings as fallback
        if not options:
            for position, sibling in enumerate(heading.next_siblings):
                if position > 10:
                    break
                if getattr(sibling, 'find_all', None):
                    options.extend(scored_links(sibling, title))
        if not options:
            continue

        # max() returns the first of equally scored candidates.
        best_url = max(options, key=lambda pair: pair[0])[1]
        pair_key = (title, best_url)
        if pair_key in emitted:
            continue
        emitted.add(pair_key)
        results.append({
            'category': 'mitglieder',
            'name': title,
            'website_url': best_url,
            'retech_profile_url': None,
        })
    return results


def extract_cards_from_soup(soup: BeautifulSoup) -> list[dict]:
    """Build member rows by starting from every acceptable link in *soup*.

    Each anchor that scores >= 0 is paired with a heading text found via
    ``nearest_heading_text``; without one, the URL's host (minus a leading
    ``www.``) serves as the name. Names containing an IGNORED_HEADING_TOKENS
    fragment are dropped. Rows are unique per (name, url).
    """
    results: list[dict] = []
    emitted: set[tuple[str, str]] = set()
    for anchor in soup.select('a[href]'):
        points, url = score_anchor(anchor, BASE_URL)
        if points < 0:
            continue
        label = nearest_heading_text(anchor)
        if not label:
            # No heading available: fall back to the bare host name.
            label = re.sub(r'^www\.', '', urlparse(url).netloc or url)
        # Filter taxonomy names accidentally captured
        lowered = label.lower()
        if any(tok in lowered for tok in IGNORED_HEADING_TOKENS):
            continue
        pair_key = (label, url)
        if pair_key in emitted:
            continue
        emitted.add(pair_key)
        results.append({
            'category': 'mitglieder',
            'name': label,
            'website_url': url,
            'retech_profile_url': None,
        })
    return results


def merge_rows(primary: list[dict], secondary: list[dict]) -> list[dict]:
    """Concatenate two row lists, keeping the first row per (name, website_url).

    Rows from *primary* take precedence over *secondary*; input order is
    preserved and the original dict objects are returned, not copies.
    """
    first_by_key: dict[tuple, dict] = {}
    for row in [*primary, *secondary]:
        # setdefault keeps the earliest row for a key; dicts keep insertion order.
        first_by_key.setdefault((row.get('name'), row.get('website_url')), row)
    return list(first_by_key.values())


def fetch_all_members() -> list[dict]:
    """Download the member directory and return all extracted member rows.

    The main page must load successfully (an HTTP error raises via
    ``raise_for_status``). Afterwards the pagination URLs ``page/2/`` to
    ``page/20/`` are probed on a best-effort basis. Probing stops at the
    first page that fails to load, answers with an error status, has a body
    shorter than 2000 characters, or adds no new (name, website_url) pair.

    Returns:
        Row dicts with keys ``category``, ``name``, ``website_url`` and
        ``retech_profile_url``, unique per (name, website_url).

    Raises:
        requests.RequestException: if the main page cannot be fetched.
    """
    # 1) Main page
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    rows = merge_rows(extract_from_headings(soup), extract_cards_from_soup(soup))

    # 2) WordPress-style pagination /page/2, ...
    seen_before = {(d['name'], d['website_url']) for d in rows}
    for i in range(2, 21):
        page_url = urljoin(BASE_URL, f'page/{i}/')
        try:
            rp = session.get(page_url, timeout=20)
        except requests.RequestException:
            # Pagination is optional; keep what has been collected so far.
            break
        # Stop on any error status (not only 404) so that 403/5xx error
        # pages are never parsed as if they were member listings.
        if not rp.ok or len(rp.text) < 2000:
            break
        psoup = BeautifulSoup(rp.text, 'lxml')
        new_rows = merge_rows(extract_from_headings(psoup), extract_cards_from_soup(psoup))
        added = 0
        for row in new_rows:
            key = (row['name'], row['website_url'])
            if key not in seen_before:
                rows.append(row)
                seen_before.add(key)
                added += 1
        # A page without new entries means the listing repeats; stop here.
        if added == 0:
            break
        time.sleep(0.2)
    return rows


# Run the scrape; fall back to an empty frame with the expected columns so the
# column-based steps below also work when nothing was found.
rows = fetch_all_members()
df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['category','name','website_url','retech_profile_url'])

# Clean up, sort, save
# Normalize URLs (fix any leftover duplicates)
df['website_url'] = df['website_url'].apply(normalize_url)
# Whitespace-normalize names; names that end up empty become missing values.
df['name'] = df['name'].fillna('').apply(clean_text).replace({'': None})
# Drop rows with missing or obviously malformed URLs
mask_valid = df['website_url'].fillna('').str.match(r'^https?://', case=False)
df = df[mask_valid]

# Normalization may have produced new duplicates, so deduplicate again before sorting.
df = df.drop_duplicates(subset=['name','website_url']).sort_values(by=['category','name','website_url'], na_position='last').reset_index(drop=True)

df.to_csv(OUTPUT_FILE, index=False)
print(f'Saved {len(df)} entries to {OUTPUT_FILE}')
# The per-category summary is informational only; a failure here is ignored.
try:
    print(df['category'].value_counts(dropna=False))
except Exception:
    pass

# Last expression of the cell: shown as the cell output in a notebook.
df.head(20)

Saved 65 entries to data/retech_mitglieder.csv
category
mitglieder    65
Name: count, dtype: int64


Unnamed: 0,category,name,website_url,retech_profile_url
0,mitglieder,ABC Circular,https://www.ab-circular.de,
1,mitglieder,BC Berlin Consult GmbH,https://www.berlin-consult.de/,
2,mitglieder,BN Umwelt GmbH,https://bn-umwelt.sh/,
3,mitglieder,Bergischer Abfallwirtschaftsverband,https://www.bavweb.de/Bergischer-Abfallwirtsch...,
4,mitglieder,Berliner Stadtreinigungsbetriebe (BSR),https://www.bsr.de/,
5,mitglieder,Berliner Stadtreinigungsbetriebe (BSR),https://www.dbfz.de/,
6,mitglieder,BlackForest Solutions GmbH,https://www.blackforest-solutions.com/,
7,mitglieder,Bondacon International,https://www.bondacon.com/,
8,mitglieder,COMMIT Project Partners GmbH,https://commit-group.com/,
9,mitglieder,Compost Systems GmbH,https://www.compost-systems.com/de,


In [2]:
df.tail(20)

Unnamed: 0,category,name,website_url,retech_profile_url
45,mitglieder,Ramboll Deutschland GmbH,https://www.ramboll.com/de-de,
46,mitglieder,SSI SCHÄFER Plastics GmbH,https://ssi-plastic.com/,
47,mitglieder,STEINERT GmbH,https://steinertglobal.com/de/,
48,mitglieder,Stadtreinigung Hamburg,https://www.stadtreinigung.hamburg/,
49,mitglieder,Sutco RecyclingTechnik GmbH,https://www.sutco.com,
50,mitglieder,TU Dresden – Institut für Abfall- und Kreislau...,https://tu-dresden.de/bu/umwelt/hydro/iak,
51,mitglieder,TU Dresden – Institut für Abfall- und Kreislau...,https://weima.com/de/,
52,mitglieder,Tietjen Verfahrenstechnik GmbH,https://www.tietjen-original.com,
53,mitglieder,Universität Rostock,https://www.auf.uni-rostock.de/professuren/bau...,
54,mitglieder,Vecoplan AG,https://vecoplan.com/de,


# Kombiniere Daten in ein Standardformat

Wie im BDE-Notebook führen wir ein vereinheitlichtes Schema ein: `section, name, url, profile_url, source_page` und speichern es unter `data/retech_scrape.csv`.

In [3]:
import pandas as pd
from pathlib import Path

# Target directory and path of the combined, schema-normalized CSV.
out_dir = Path('data')
out_dir.mkdir(parents=True, exist_ok=True)
combined_path = out_dir / 'retech_scrape.csv'

# Load the members table: the CSV on disk is preferred when it exists;
# otherwise fall back to the in-memory `df`, and to an empty frame if `df`
# is not defined in this session.
df_mem = None
p = out_dir / 'retech_mitglieder.csv'
if p.exists():
    df_mem = pd.read_csv(p)
else:
    try:
        df_mem = df.copy()
    except NameError:
        df_mem = pd.DataFrame(columns=['category','name','website_url','retech_profile_url'])

# Normalize: rename the columns to the unified names and tag the source page
if df_mem is not None and not df_mem.empty:
    df_mem = df_mem.rename(columns={
        'category': 'section',
        'website_url': 'url',
        'retech_profile_url': 'profile_url',
    })
    if 'section' not in df_mem.columns:
        df_mem['section'] = 'mitglieder'
    df_mem['source_page'] = 'mitglieder'

# Unified schema: make sure every expected column exists (missing ones are
# added as None; additional columns are kept as they are)
cols = ['section','name','url','profile_url','source_page']
if df_mem is None or df_mem.empty:
    df_all = pd.DataFrame(columns=cols)
else:
    df_all = df_mem.copy()
    for c in cols:
        if c not in df_all.columns:
            df_all[c] = None

# Deduplicate and sort
df_all = df_all.drop_duplicates(subset=['name','url']).sort_values(by=['source_page','section','name','url'], na_position='last').reset_index(drop=True)

# Save
df_all.to_csv(combined_path, index=False)
print(f'Saved {len(df_all)} combined rows to {combined_path}')
# The grouped summary is informational only; a failure here is ignored.
try:
    print(df_all.groupby(['source_page','section']).size())
except Exception:
    pass

# Last expression of the cell: shown as the cell output in a notebook.
df_all.head(20)

Saved 65 combined rows to data/retech_scrape.csv
source_page  section   
mitglieder   mitglieder    65
dtype: int64


Unnamed: 0,section,name,url,profile_url,source_page
0,mitglieder,ABC Circular,https://www.ab-circular.de,,mitglieder
1,mitglieder,BC Berlin Consult GmbH,https://www.berlin-consult.de/,,mitglieder
2,mitglieder,BN Umwelt GmbH,https://bn-umwelt.sh/,,mitglieder
3,mitglieder,Bergischer Abfallwirtschaftsverband,https://www.bavweb.de/Bergischer-Abfallwirtsch...,,mitglieder
4,mitglieder,Berliner Stadtreinigungsbetriebe (BSR),https://www.bsr.de/,,mitglieder
5,mitglieder,Berliner Stadtreinigungsbetriebe (BSR),https://www.dbfz.de/,,mitglieder
6,mitglieder,BlackForest Solutions GmbH,https://www.blackforest-solutions.com/,,mitglieder
7,mitglieder,Bondacon International,https://www.bondacon.com/,,mitglieder
8,mitglieder,COMMIT Project Partners GmbH,https://commit-group.com/,,mitglieder
9,mitglieder,Compost Systems GmbH,https://www.compost-systems.com/de,,mitglieder


In [None]:
# Random sample of 10 rows for a spot check; with fewer than 10 rows the
# whole table is shown instead.
df_all.sample(10) if len(df_all) >= 10 else df_all