# Green NGOs of Moldova — site-wide NGO list extraction

We scrape the NGO directory at https://www.greenngosofmoldova.org/ngos/.
For each NGO tile, we follow its subpage (e.g., https://www.greenngosofmoldova.org/ngo/eteco/) and extract:
- name
- website (external link shown under “Web” on the NGO page)
- domain (normalized)
- source (subpage URL)

Output: `Input/Cahul/green_ngos_of_moldova.csv`.

We first use a static approach (requests + BeautifulSoup). If the index is paginated or tiles are lazy-loaded, we can extend this with pagination or a browser fallback later.

In [1]:
# Imports & helpers
from __future__ import annotations
import os
import time
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd

INDEX_URL = "https://www.greenngosofmoldova.org/ngos/"
BASE = "https://www.greenngosofmoldova.org/"
notebook_dir = os.path.abspath(".")
output_path = os.path.join(notebook_dir, "green_ngos_of_moldova.csv")

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
})

SOCIAL = (
    "facebook.com", "twitter.com", "x.com", "instagram.com", "linkedin.com",
    "youtube.com", "youtu.be", "t.me", "tiktok.com", "vk.com"
)


def fetch_html(url: str, retries: int = 2, timeout: int = 30) -> Optional[str]:
    for i in range(retries + 1):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
        except Exception:
            time.sleep(1 + i)
    return None


def get_soup(url: str) -> Optional[BeautifulSoup]:
    html = fetch_html(url)
    if not html:
        return None
    return BeautifulSoup(html, "lxml")


def extract_domain(u: str) -> str:
    try:
        host = urlparse(u).netloc.lower()
        return host[4:] if host.startswith("www.") else host
    except Exception:
        return ""


def is_external(u: str) -> bool:
    try:
        net = urlparse(u).netloc.lower()
        return net and "greenngosofmoldova.org" not in net
    except Exception:
        return False


def is_social(u: str) -> bool:
    u = (u or '').lower()
    return any(s in u for s in SOCIAL)




In [2]:
# Index parser — find NGO tiles and subpage URLs

def get_index_entries(url: str = INDEX_URL) -> List[Dict]:
    soup = get_soup(url)
    if not soup:
        print("Failed to load index page")
        return []

    rows: List[Dict] = []
    # Heuristic: tiles likely link to /ngo/<slug>/ via anchors
    for a in soup.find_all('a', href=True):
        href = a['href']
        if not href:
            continue
        full = urljoin(BASE, href)
        if '/ngo/' in full and full.startswith('https://www.greenngosofmoldova.org/ngo/'):
            # Infer a name: either anchor text, alt on image, or surrounding heading
            name = (a.get_text(" ") or "").strip()
            if not name:
                img = a.find('img', alt=True)
                if img and img.get('alt'):
                    name = img['alt'].strip()
            if not name:
                # try heading near the link
                h = a.find_next(['h2','h3']) or a.find_previous(['h2','h3'])
                if h:
                    name = (h.get_text(" ") or "").strip()
            rows.append({ 'name': name, 'subpage': full })

    # Dedupe by subpage
    seen = set()
    out = []
    for r in rows:
        if r['subpage'] in seen:
            continue
        seen.add(r['subpage'])
        out.append(r)

    print(f"Index entries discovered: {len(out)}")
    return out

In [3]:
# Subpage parser — extract website from NGO page

def get_website_from_subpage(url: str) -> str:
    soup = get_soup(url)
    if not soup:
        return ""

    # 1) Prefer anchors that are external and not social
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('mailto:') or href.startswith('tel:'):
            continue
        full = urljoin(url, href)
        if is_external(full) and not is_social(full):
            return full

    # 2) Fallback: look for text blocks containing 'Web' and an URL-like token
    text = soup.get_text(" ", strip=True)
    import re
    m = re.search(r"\bWeb\b\s+((?:https?://)?(?:www\.)?[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:/\S*)?)", text, re.IGNORECASE)
    if m:
        val = m.group(1)
        if not val.lower().startswith(('http://', 'https://')):
            val = 'https://' + val if val.lower().startswith('www.') else 'http://' + val
        if is_external(val) and not is_social(val):
            return val

    return ""


def scrape_all() -> List[Dict]:
    entries = get_index_entries(INDEX_URL)
    rows: List[Dict] = []
    for e in entries:
        name = (e.get('name') or '').strip()
        sub = e['subpage']
        site = get_website_from_subpage(sub)
        rows.append({
            'name': name,
            'website': site,
            'domain': extract_domain(site) if site else '',
            'source': sub,
        })
    return rows

rows = scrape_all()
print(f"Total NGOs scraped: {len(rows)}")

# Write CSV
pd.DataFrame(rows).to_csv(output_path, index=False)
print(f"Wrote {output_path}")

Index entries discovered: 19
Total NGOs scraped: 19
Wrote /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Cahul/green_ngos_of_moldova.csv


In [4]:
# Preview a sample
import os
if os.path.exists(output_path):
    df = pd.read_csv(output_path)
    print(df.shape)
    display(df.head(12))
    print("Websites present:", (df['website'].astype(str)!='').sum())
else:
    print("CSV not found.")

(19, 4)


Unnamed: 0,name,website,domain,source
0,EtEcO,http://etecotiras.ru/,etecotiras.ru,https://www.greenngosofmoldova.org/ngo/eteco/
1,EcoPMR,https://ekopmr.ru/,ekopmr.ru,https://www.greenngosofmoldova.org/ngo/ecovisi...
2,AVD,http://www.e-circular.org,e-circular.org,https://www.greenngosofmoldova.org/ngo/avd/
3,AFPMDD,http://www.mediu.md,mediu.md,https://www.greenngosofmoldova.org/ngo/afpmdd/
4,Gutta-Club,,,https://www.greenngosofmoldova.org/ngo/gutta-c...
5,Media-Grup MERIDIAN,http://www.ecofm.md,ecofm.md,https://www.greenngosofmoldova.org/ngo/meridian/
6,AJMTEM,https://ecopresa.md/,ecopresa.md,https://www.greenngosofmoldova.org/ngo/ajmtem/
7,Medics for Ecology,http://dr-ecology.blogspot.com/,dr-ecology.blogspot.com,https://www.greenngosofmoldova.org/ngo/medics-...
8,MEM,http://www.mem.md,mem.md,https://www.greenngosofmoldova.org/ngo/mem/
9,MEGA,http://megageneration.com,megageneration.com,https://www.greenngosofmoldova.org/ngo/mega/


Websites present: 19
