# EEHH Mitgliedsunternehmen – externe Websites extrahieren

Dieses Notebook scrapt die Mitgliederübersicht der Erneuerbare Energien Hamburg (EEHH) und besucht jede Profilseite, um die externe Website der Firma aus dem Profiltext zu extrahieren.

- Übersicht: https://www.erneuerbare-energien-hamburg.de/de/mitglieder/mitgliedsunternehmen.html
- Pagination: über den Parameter `?page_c3=N` (Seite 1 ist die Basis-URL ohne Parameter; der Code durchläuft die in `PAGES` konfigurierten Seiten, aktuell 1..30)
- Ausgabe: data/eeh_mitglieder.csv

Hinweise:
- Die Firmenseite ist auf der Detailseite häufig im Fließtext erwähnt (z. T. ohne anklickbaren Link). Das Notebook erkennt Domain-/URL-Muster (z. B. `www.example.com`) auch im reinen Text und normalisiert sie zu `https://...`.
- Social-Links und interne Links (EEHH-Domain) werden gefiltert.
- Ergebnis wird dedupliziert und nach Name/URL sortiert.

In [7]:
import re
import time
import pathlib
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the first listing page; the crawl loop addresses later pages by
# appending ``?page_c3=<n>`` to this URL.
LIST_BASE = 'https://www.erneuerbare-energien-hamburg.de/de/mitglieder/mitgliedsunternehmen.html'
# Listing page numbers to crawl (1..30); page 1 is LIST_BASE itself.
PAGES = list(range(1, 31))  # 1..30 pages total
# Output location; the directory is created as a side effect of running this cell.
OUTPUT_DIR = pathlib.Path('data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = OUTPUT_DIR / 'eeh_mitglieder.csv'

# Set to True to run the full crawl; keep False when testing functions quickly in this notebook
RUN_FULL = False

# Request headers sent with every call made through `session`.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36',
    'Accept-Language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
}

# One shared HTTP session used by all fetch helpers below.
session = requests.Session()
session.headers.update(headers)

# Social-network, link-shortener and search hosts; URLs on these hosts are
# rejected by is_social() and never accepted as a company website.
SOCIAL = {
    'linkedin.com','facebook.com','twitter.com','x.com','youtube.com','instagram.com',
    'xing.com','kununu.com','t.me','bit.ly','goo.gl','google.com'
}

# Known non-company or project domains frequently mentioned in profiles
BAD_DOMAINS = {
    'norddeutsches-reallabor.de',
    'eehh.de',
}

# Legal forms and filler words that tokens_from_name() drops from a company
# name before its tokens are compared with candidate domains.
# NOTE(review): 'e.v.' cannot match as written, because tokens_from_name()
# splits names on punctuation before the stopword lookup.
STOPWORDS = {
    'gmbh','mbh','kg','co','ag','se','ug','eg','ev','e.v.','mbb','llp','ltd','inc',
    'partnerschaft','partner','rechtsanwälte','rechtsanwaelte','anwälte','anwaelte',
    'und','the','der','die','das','von','für','fuer','mit','hamburg','deutschland'
}

def clean(s: str | None) -> str:
    if not s: return ''
    return re.sub(r'\s+', ' ', s).strip()

def normalize_url(u: str | None) -> str | None:
    if not u: return None
    u = u.strip()
    u = re.sub(r'^(https?://)+', 'https://', u, flags=re.I)
    if not re.match(r'^https?://', u, flags=re.I):
        if u.startswith('//'): u = 'https:' + u
        else: u = 'https://' + u.lstrip('/')
    return u

def is_external(u: str) -> bool:
    """Tell whether *u* is an absolute URL that points away from the EEHH site.

    A URL counts as external when it has both a scheme and a network location
    and that location does not contain ``erneuerbare-energien-hamburg.de``.
    """
    parts = urlparse(u)
    if not (parts.scheme and parts.netloc):
        return False
    return 'erneuerbare-energien-hamburg.de' not in parts.netloc.lower()

def is_social(u: str) -> bool:
    """Return True when *u* is hosted on one of the ``SOCIAL`` domains.

    The host must equal a listed domain or be a subdomain of it
    (``de.linkedin.com`` matches ``linkedin.com``). A plain substring test is
    deliberately avoided: it would flag unrelated hosts such as
    ``netflix.com`` (contains ``x.com``) or ``smart.media`` (contains ``t.me``).

    Args:
        u: Absolute URL to classify.

    Returns:
        True if the URL's host is a social/shortener/search host, else False.
    """
    # .hostname drops any port or userinfo and is already lower-cased.
    host = (urlparse(u).hostname or '').lower()
    return any(host == dom or host.endswith('.' + dom) for dom in SOCIAL)

def domain_from_url(u: str) -> str:
    """Return the lower-cased network location of *u* without a leading ``www.``.

    Only a *leading* ``www.`` is removed; a ``www.`` that occurs elsewhere in
    the host (``foowww.bar.de``) is part of the name and is kept. A port, if
    present, stays in the result (``example.com:8080``).

    Args:
        u: Absolute URL.

    Returns:
        The host string, or ``''`` when the URL has no network location or
        cannot be parsed.
    """
    try:
        return urlparse(u).netloc.lower().removeprefix('www.')
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. a broken IPv6 literal) or non-string input.
        return ''

# Matches URL-/domain-like tokens in plain text. Group 1 is the host without
# scheme and without a leading "www."; it may consist of several labels
# (e.g. "shop.example.com"), so subdomain hosts are captured in full instead
# of being cut off after the first two labels.
URL_RE = re.compile(
    r'(?:https?://)?(?:www\.)?'
    r'((?:[A-Za-z0-9][-A-Za-z0-9]{0,62}\.)+[A-Za-z]{2,})'
    r'(?:/[^\s<]*)?'
)

# --- Name ↔ domain alignment helpers ---

# German umlauts/sharp s are written out in domain names (mueller.de), so
# they are transliterated before tokenising instead of acting as separators.
_UMLAUT_TRANSLITERATION = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})


def tokens_from_name(name: str) -> list[str]:
    """Split a company name into lower-case tokens usable for domain matching.

    The name is whitespace-normalised, lower-cased and transliterated
    (ä→ae, ö→oe, ü→ue, ß→ss), then split on every run of characters other
    than ``a-z0-9``. Tokens of two characters or fewer and tokens listed in
    ``STOPWORDS`` are dropped.

    Args:
        name: Company name as shown on the listing page.

    Returns:
        The remaining tokens in their original order (possibly empty).
    """
    folded = clean(name).lower().translate(_UMLAUT_TRANSLITERATION)
    return [
        tok
        for tok in re.split(r'[^a-z0-9]+', folded)
        if len(tok) > 2 and tok not in STOPWORDS
    ]

def domain_match_score(name_tokens: list[str], domain: str) -> int:
    """Score how well a host name matches the tokens of a company name.

    The host is compared without its port and without its final label (the
    TLD), so every remaining label counts: ``acme`` matches ``acme.de`` as
    well as ``shop.acme.de``. A host consisting of a single label is used
    as is. Matching is case-insensitive on the host side.

    Scoring:
      * +5 for each name token that occurs as a substring of the host part.
      * +2 if the concatenated token initials (at least two) occur in the
        host part, e.g. ``bn`` in ``bn-kollegen``.

    Args:
        name_tokens: Lower-case tokens, e.g. from ``tokens_from_name``.
        domain: Host name, optionally followed by ``:port``.

    Returns:
        The non-negative integer score.
    """
    host = domain.split(':')[0].lower()
    labels = host.split('.')
    # Drop the TLD but keep all other labels; joining with '.' prevents a
    # token from matching across a label boundary.
    brand = '.'.join(labels[:-1]) if len(labels) >= 2 else host

    score = sum(5 for tok in name_tokens if tok and tok in brand)

    # small bonus for initials like BN in bn-kollegen
    initials = ''.join(tok[0] for tok in name_tokens if tok)
    if len(initials) >= 2 and initials.lower() in brand:
        score += 2
    return score

# --- Extract profile links on a listing page ---

def extract_profile_links(list_url: str) -> list[tuple[str,str]]:
    """Fetch one listing page and return its member profile links.

    Every anchor whose ``href`` contains ``mitgliedsunternehmen/details/`` and
    that has non-empty link text is collected. Relative hrefs are resolved
    against the site root. Each profile URL is reported once, paired with the
    first link text seen for it, in page order.

    Args:
        list_url: URL of the listing page to fetch.

    Returns:
        List of ``(name, profile_url)`` tuples.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
    """
    response = session.get(list_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    site_root = 'https://www.erneuerbare-energien-hamburg.de/'

    # Insertion-ordered dict: the first name seen for a URL wins.
    first_name_by_url: dict[str, str] = {}
    for anchor in soup.select('a[href*="mitgliedsunternehmen/details/"]'):
        href = anchor.get('href')
        name = clean(anchor.get_text())
        if not (href and name):
            continue

        # hrefs often start with 'de/' (site-relative without leading slash).
        if href.startswith('de/'):
            profile_url = site_root + href
        elif href.startswith('/'):
            profile_url = site_root + href.lstrip('/')
        else:
            profile_url = urljoin(site_root, href)

        first_name_by_url.setdefault(profile_url, name)

    return [(name, profile_url) for profile_url, name in first_name_by_url.items()]

# --- Pick the best external website from a profile page ---

# Link schemes that can never point at a company website. They must be tested
# on the raw href: once normalize_url() has prefixed "https://", a prefix test
# for "mailto:" can no longer match.
_NON_WEB_SCHEMES = ('mailto:', 'tel:', 'fax:', 'javascript:')


def _anchor_candidates(sp, profile_url: str, name_tokens: list[str]) -> list[tuple[int, str]]:
    """Return ``(score, url)`` for every acceptable external ``<a href>`` in *sp*."""
    candidates: list[tuple[int, str]] = []
    for a in sp.select('a[href]'):
        href = (a.get('href') or '').strip()
        if not href:
            continue

        # Skip mailto, tel, fax, javascript before any URL rewriting happens.
        if href.lower().startswith(_NON_WEB_SCHEMES):
            continue

        # Handle protocol-relative URLs (//example.com)
        if href.startswith('//'):
            href = 'https:' + href

        # Build an absolute URL; relative hrefs resolve against the profile page.
        if href.startswith(('http://', 'https://')):
            u = href
        else:
            u = urljoin(profile_url, href)

        u = normalize_url(u)
        if not u or not is_external(u) or is_social(u):
            continue

        dom = domain_from_url(u)
        if any(bad in dom for bad in BAD_DOMAINS):
            continue

        label = clean(a.get_text()).lower()
        score = 0

        # Base score for a bare home page on a typical TLD (no path).
        if re.search(r'^https?://[^/]*\.(com|de|io|net|org)/?$', u, re.I):
            score += 3

        # Alignment between company name and host.
        score += domain_match_score(name_tokens, dom)

        # Bonus for link text that announces a website.
        if any(t in label for t in ('website', 'webseite', 'homepage')):
            score += 2
        if a.get('target') == '_blank':
            score += 1

        candidates.append((score, u))
    return candidates


def _text_candidates(sp, name_tokens: list[str]) -> list[tuple[int, str]]:
    """Return ``(score, url)`` for domain-like tokens found in the page text of *sp*."""
    body_text = sp.get_text(' ', strip=True)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered = dict.fromkeys(m.group(1).lower() for m in URL_RE.finditer(body_text))

    scored: list[tuple[int, str]] = []
    for d in ordered:
        if any(bad in d for bad in BAD_DOMAINS):
            continue
        u = normalize_url(d)
        if not u or not is_external(u) or is_social(u):
            continue
        # Name alignment plus a small bias towards domains with few hyphens.
        s = domain_match_score(name_tokens, d) + max(0, 4 - d.count('-'))
        scored.append((s, u))
    return scored


def best_external_from_profile(profile_url: str, name_hint: str | None) -> str | None:
    """Pick the most plausible company website from an EEHH profile page.

    Strategy:
      1. Score all external, non-social anchors on the page. Non-web links
         (``mailto:``, ``tel:``, ``fax:``, ``javascript:``) are ignored.
      2. Only if no anchor qualifies, fall back to domain-like patterns found
         in the page's plain text.
    In both cases the highest-scoring URL wins; on a tie the one that appears
    first on the page is returned.

    Args:
        profile_url: URL of the member's profile page.
        name_hint: Company name used to favour matching domains; may be None.

    Returns:
        The chosen ``https://`` URL, or ``None`` if the page could not be
        fetched, nothing qualified, or any error occurred.
    """
    try:
        rp = session.get(profile_url, timeout=30)
        if not rp.ok:
            return None
        sp = BeautifulSoup(rp.text, 'lxml')
        name_tokens = tokens_from_name(name_hint or '')

        candidates = (
            _anchor_candidates(sp, profile_url, name_tokens)
            or _text_candidates(sp, name_tokens)
        )
        if not candidates:
            return None
        # max() returns the first of several equally scored candidates.
        return max(candidates, key=lambda c: c[0])[1]
    except Exception:
        # Deliberate best-effort: one broken profile must not abort a crawl,
        # so any failure is reported to the caller as "no website found".
        return None

# --- Crawl all pages and profiles (guarded by RUN_FULL) ---
if RUN_FULL:
    rows = []
    for p in PAGES:
        # Page 1 is the bare listing URL; later pages use ?page_c3=<n>.
        list_url = LIST_BASE if p == 1 else f'{LIST_BASE}?page_c3={p}'
        pairs = extract_profile_links(list_url)
        print(f'Page {p}/{len(PAGES)}: Found {len(pairs)} companies')

        for i, (name_guess, prof_url) in enumerate(pairs, 1):
            website = best_external_from_profile(prof_url, name_guess)
            rows.append({
                'category': 'mitgliedsunternehmen',
                'name': name_guess,
                'website_url': website,
                'eeh_profile_url': prof_url,
            })
            if i % 5 == 0:
                print(f'  Processed {i}/{len(pairs)} companies on page {p}')
            # Short pause between profile requests.
            time.sleep(0.1)

    # Build dataframe (with the expected columns even when nothing was found)
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['category','name','website_url','eeh_profile_url'])
    # Cleanup: normalise names and URLs, keep only rows with an http(s) URL
    df['name'] = df['name'].fillna('').apply(clean).replace({'': None})
    df['website_url'] = df['website_url'].apply(normalize_url)
    valid = df['website_url'].notna() & df['website_url'].str.match(r'^https?://', na=False, case=False)
    df = df[valid]
    # Drop anything accidentally landing on a bad domain (defensive).
    # The domains are regex-escaped so that '.' only matches a literal dot.
    bad_pattern = '|'.join(re.escape(d) for d in BAD_DOMAINS)
    df = df[~df['website_url'].str.contains(bad_pattern, case=False, na=False)]

    df = df.drop_duplicates(subset=['name','website_url']).sort_values(by=['category','name','website_url'], na_position='last').reset_index(drop=True)

    # Save
    df.to_csv(OUTPUT_FILE, index=False)
    print(f'\n{"="*60}')
    print(f'✓ Successfully scraped {len(rows)} total entries')
    print(f'✓ {len(df)} entries with valid website URLs (after filtering bad domains)')
    print(f'✓ Saved to {OUTPUT_FILE}')
    print(f'{"="*60}\n')
    display(df.head(20))
else:
    print("Definitions loaded. Set RUN_FULL=True and re-run this cell to scrape.")

Definitions loaded. Set RUN_FULL=True and re-run this cell to scrape.


In [5]:
# Spot-check: show 10 randomly sampled rows of the result table `df`.
df.sample(10)

Unnamed: 0,category,name,website_url,eeh_profile_url
65,mitgliedsunternehmen,DURAG GmbH,https://www.durag.com/de/wasserstoff-4745.htm,https://www.erneuerbare-energien-hamburg.de/de...
296,mitgliedsunternehmen,morEnergy GmbH,https://www.morenergy.net,https://www.erneuerbare-energien-hamburg.de/de...
240,mitgliedsunternehmen,Staatliche Gewerbeschule Energietechnik-G 10,https://norddeutsches-reallabor.de/,https://www.erneuerbare-energien-hamburg.de/de...
107,mitgliedsunternehmen,GFA Consulting Group GmbH,https://www.gfa-group.de,https://www.erneuerbare-energien-hamburg.de/de...
137,mitgliedsunternehmen,Hamburger Energienetze GmbH,https://www.hamburger-energienetze.de/,https://www.erneuerbare-energien-hamburg.de/de...
220,mitgliedsunternehmen,SAPOtech GmbH,https://www.sapotech.de,https://www.erneuerbare-energien-hamburg.de/de...
206,mitgliedsunternehmen,PricewaterhouseCoopers AG,https://norddeutsches-reallabor.de/,https://www.erneuerbare-energien-hamburg.de/de...
254,mitgliedsunternehmen,Teos Energy GmbH,https://www.teosenergy.com,https://www.erneuerbare-energien-hamburg.de/de...
124,mitgliedsunternehmen,HFK Rechtsanwälte PartGmbB,https://norddeutsches-reallabor.de/,https://www.erneuerbare-energien-hamburg.de/de...
117,mitgliedsunternehmen,H2Perform GmbH,https://www.h2perform.de,https://www.erneuerbare-energien-hamburg.de/de...


In [8]:
# Smoke test: run the extractor on a single known profile (BRAHMS NEBEL)
# and print the website it picks.
name_hint = "BRAHMS NEBEL Partnerschaft von Rechtsanwälten mbB"
profile = (
    "https://www.erneuerbare-energien-hamburg.de/de/mitglieder/mitgliedsunternehmen/details/"
    "BRAHMS-NEBEL-Partnerschaft-von-Rechtsanw%C3%A4lten-mbB.html"
)
print(best_external_from_profile(profile, name_hint))

https://www.bn-kollegen.de


In [9]:
# Optional repair: patch an existing CSV by re-extracting only the rows whose
# website_url sits on one of the BAD_DOMAINS. Rows that still point at a bad
# domain afterwards are removed, and the file is rewritten in place.
import os
if os.path.exists(OUTPUT_FILE):
    fix_df = pd.read_csv(OUTPUT_FILE)
    def has_bad(u: str) -> bool:
        """Return True when *u* is a string whose host contains a bad domain."""
        if not isinstance(u, str):
            # Missing values are read back as float NaN by read_csv.
            return False
        host = domain_from_url(u)
        return any(b in host for b in BAD_DOMAINS)
    mask = fix_df['website_url'].apply(has_bad)
    to_fix = fix_df[mask].copy()
    print(f"Rows to fix: {len(to_fix)}")
    updated = 0
    for idx, row in to_fix.iterrows():
        new_u = best_external_from_profile(str(row['eeh_profile_url']), str(row['name']))
        if new_u and not any(b in domain_from_url(new_u) for b in BAD_DOMAINS):
            fix_df.at[idx, 'website_url'] = new_u
            updated += 1
    # Drop any remaining bad domain rows entirely.
    # The domains are regex-escaped so that '.' only matches a literal dot.
    bad_pattern = '|'.join(re.escape(d) for d in BAD_DOMAINS)
    fix_df = fix_df[~fix_df['website_url'].astype(str).str.contains(bad_pattern, case=False, na=False)]
    fix_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Updated {updated} rows and removed remaining bad-domain rows. Saved to {OUTPUT_FILE}")
else:
    print("No existing CSV found to repair.")

Rows to fix: 81
Updated 81 rows and removed remaining bad-domain rows. Saved to data/eeh_mitglieder.csv
