# NGObase — Hamburg organizations (Development, Environment & Climate)

We scrape two category pages:
- https://ngobase.org/ciwa/DE.HA.HA/DEV/development-ngos-charities-hamburg
- https://ngobase.org/ciwa/DE.HA.HA/ENC/environment-and-climate-ngos-charities-hamburg

For each card, we extract:
- name (title in the card)
- website (external link behind the WWW icon)
- domain (normalized)
- category (development | environment)
- source (page URL)

Strategy:
- Static parse (requests + BeautifulSoup) by locating external, non-social links and pairing them with the nearest title within the same card container.
- If static yields 0, use Playwright fallback to evaluate anchors and headings in the rendered DOM.

Output: `Input/Hamburg/ngobase_hamburg_organizations.csv`.

In [1]:
# Imports & helpers
from __future__ import annotations
import os
import re
import time
from typing import List, Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

URLS = [
    ("https://ngobase.org/ciwa/DE.HA.HA/DEV/development-ngos-charities-hamburg", "development"),
    ("https://ngobase.org/ciwa/DE.HA.HA/ENC/environment-and-climate-ngos-charities-hamburg", "environment"),
]

notebook_dir = os.path.abspath(".")
output_path = os.path.join(notebook_dir, "ngobase_hamburg_organizations.csv")

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
})

SOCIAL_DOMAINS = (
    "facebook.com", "twitter.com", "x.com", "instagram.com", "linkedin.com",
    "youtube.com", "youtu.be", "t.me", "tiktok.com", "vk.com", "pinterest.com",
    "maps.google.", "goo.gl/maps", "openstreetmap.org"
)


def is_external(href: str) -> bool:
    try:
        netloc = urlparse(href).netloc.lower()
        return bool(netloc) and "ngobase.org" not in netloc
    except Exception:
        return False


def is_social(href: str) -> bool:
    href_l = href.lower()
    return any(dom in href_l for dom in SOCIAL_DOMAINS)


def extract_domain(u: str) -> str:
    try:
        host = urlparse(u).netloc.lower()
        return host[4:] if host.startswith("www.") else host
    except Exception:
        return ""


def fetch_html(url: str, retries: int = 2, timeout: int = 30) -> Optional[str]:
    for i in range(retries + 1):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
        except Exception:
            time.sleep(1 + i)
    return None


def get_soup(url: str) -> Optional[BeautifulSoup]:
    html = fetch_html(url)
    if not html:
        return None
    return BeautifulSoup(html, "lxml")


def nearest_title_for_anchor(a) -> Optional[str]:
    cur = a
    # climb to card container and look for a clear title
    for _ in range(7):
        if not cur:
            break
        # within this container, look for an h2/h3/a strong text that looks like the organization name
        for sel in ["h3", "h2", "a", "strong"]:
            t = cur.select_one(sel)
            if t and (t.get_text(strip=True) or "").strip():
                text = t.get_text(strip=True)
                # keep it reasonable (avoid "Report correction")
                if len(text) > 2 and "report correction".lower() not in text.lower():
                    return text
        cur = cur.parent
    # fallback: search previous heading siblings
    cur = a
    for _ in range(3):
        if not cur:
            break
        prev = cur.find_previous(["h3", "h2"]) if hasattr(cur, 'find_previous') else None
        if prev and prev.get_text(strip=True):
            return prev.get_text(strip=True)
        cur = cur.parent
    return None



In [2]:
# Static scraper for both categories

def parse_ngobase_static() -> List[Dict]:
    rows: List[Dict] = []
    for url, category in URLS:
        soup = get_soup(url)
        if not soup:
            print(f"Static: failed to fetch {url}")
            continue

        anchors = soup.find_all("a", href=True)
        used_in_this_page = set()

        for a in anchors:
            href = a.get("href")
            if not href:
                continue
            if not href.startswith("http"):
                href = urljoin(url, href)
            if not is_external(href) or is_social(href):
                continue

            # Heuristic: prefer small icon links near a title within the same card
            name = nearest_title_for_anchor(a) or ""
            if not name:
                continue

            domain = extract_domain(href)
            key = (category, domain)
            if not domain or key in used_in_this_page:
                continue
            used_in_this_page.add(key)

            rows.append({
                "category": category,
                "name": name,
                "website": href,
                "domain": domain,
                "source": url,
            })

    # Global dedupe by (domain, category)
    seen = set()
    dedup = []
    for r in rows:
        key = (r["category"], r["domain"])  # preserve per-category uniqueness
        if key in seen:
            continue
        seen.add(key)
        dedup.append(r)

    print(f"Static total: {len(dedup)} rows across {len(URLS)} pages")
    return dedup

static_rows = parse_ngobase_static()
if static_rows:
    pd.DataFrame(static_rows).to_csv(output_path, index=False)
    print(f"Wrote {output_path}")
else:
    print("Static returned 0; try the Playwright fallback below.")

Static total: 19 rows across 2 pages
Wrote /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Hamburg/ngobase_hamburg_organizations.csv


In [None]:
# Playwright fallback (if static fails)
import asyncio

async def scrape_ngobase_playwright(headless: bool = True) -> List[Dict]:
    try:
        from playwright.async_api import async_playwright
    except Exception:
        print("Playwright not available in this kernel.")
        return []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, args=["--no-sandbox", "--disable-dev-shm-usage"])  
        ctx = await browser.new_context()
        page = await ctx.new_page()

        all_rows: List[Dict] = []
        for url, category in URLS:
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)

            # Let network settle & lazy images load
            try:
                await page.wait_for_load_state("networkidle", timeout=12000)
            except Exception:
                pass

            data = await page.evaluate(
                """
                () => {
                  const SOCIAL = ['facebook.com','twitter.com','x.com','instagram.com','linkedin.com','youtube.com','youtu.be','t.me','tiktok.com','vk.com','pinterest.com','maps.google.','goo.gl/maps','openstreetmap.org'];
                  const isSocial = (u) => SOCIAL.some(s => (u || '').toLowerCase().includes(s));

                  const anchors = Array.from(document.querySelectorAll('a[href]')).filter(a => {
                    const href = a.getAttribute('href') || '';
                    return href.startsWith('http') && !href.includes('ngobase.org') && !isSocial(href);
                  });

                  const titleFor = (a) => {
                    let cur = a;
                    for (let i = 0; i < 7 && cur; i++) {
                      cur = cur.parentElement;
                      if (!cur) break;
                      const t = cur.querySelector('h3, h2, a, strong');
                      if (t && (t.textContent || '').trim() && !/report correction/i.test(t.textContent)) return t.textContent.trim();
                    }
                    // fallback: previous headings
                    let node = a;
                    for (let i = 0; i < 3 && node; i++) {
                      const prev = node.previousElementSibling;
                      if (prev && /H[23]/.test(prev.tagName) && (prev.textContent || '').trim()) return prev.textContent.trim();
                      node = node.parentElement;
                    }
                    return '';
                  };

                  const rows = [];
                  for (const a of anchors) {
                    const href = a.href;
                    const name = titleFor(a);
                    if (!name) continue;
                    rows.push({ name, website: href });
                  }
                  return rows;
                }
                """
            )

            # Normalize and dedupe per-category
            used = set()
            for rec in data:
                href = rec.get('website')
                nm = (rec.get('name') or '').strip()
                if not href or not nm:
                    continue
                dom = extract_domain(href)
                key = (category, dom)
                if not dom or key in used:
                    continue
                used.add(key)
                all_rows.append({
                    'category': category,
                    'name': nm,
                    'website': href,
                    'domain': dom,
                    'source': url,
                })

        await browser.close()
        return all_rows

# Driver for fallback
fallback_rows: List[Dict] = []
if not static_rows:
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        import nest_asyncio, asyncio as _asyncio
        nest_asyncio.apply()
        loop = _asyncio.get_event_loop()
    fallback_rows = loop.run_until_complete(scrape_ngobase_playwright(headless=True))
    print(f"Playwright fallback total: {len(fallback_rows)}")
    if fallback_rows:
        pd.DataFrame(fallback_rows).to_csv(output_path, index=False)
        print(f"Wrote {output_path}")
    else:
        print("No rows via fallback. Consider headless=False.")

In [3]:
# Preview
import os
if os.path.exists(output_path):
    df = pd.read_csv(output_path)
    print(df.shape)
    display(df.head(10))
else:
    print("Output not found yet.")

(19, 5)


Unnamed: 0,category,name,website,domain,source
0,development,Der Paritätische Wohlfahrtsverband Hamburg,https://www.paritaet-hamburg.de/,paritaet-hamburg.de,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
1,development,Economy For The Common Good - ECG,https://www.econgood.org/,econgood.org,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
2,development,"F.E.E.D. - Food, Education, Energy & Developme...",https://feed-ev.de/,feed-ev.de,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
3,development,Gefangene Helfen Jugendlichen E.V.,https://ghj.social/,ghj.social,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
4,development,Hand In Hand Zurück Ins Leben Hamburg,https://handinhandhh.com/,handinhandhh.com,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
5,development,ReeWie-Haus,https://www.reewie-haus.de/,reewie-haus.de,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
6,development,THE NEW INSTITUTE,https://thenew.institute/,thenew.institute,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
7,development,Der Paritätische Wohlfahrtsverband Hamburg,https://techbeavers.net/web-design-packages-pk...,techbeavers.net,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
8,development,Verwaiste Eltern Und Geschwister Hamburg E.V.,https://www.verwaiste-eltern.de/,verwaiste-eltern.de,https://ngobase.org/ciwa/DE.HA.HA/DEV/developm...
9,environment,ANU Landesverband Hamburg / Schleswig-Holstein...,https://www.anu-hh-sh.de/,anu-hh-sh.de,https://ngobase.org/ciwa/DE.HA.HA/ENC/environm...
