# Startup Moldova — Community Map scraper

This notebook scrapes entities from the Startup Moldova Community Map and extracts each entity’s name and website for the five service categories:
- Mentoring and Events
- Accelerators, Incubators
- Co-working
- Funding
- Lobby, Advocacy

Approach:
- Primary: use a headless browser to click each category and read the visible tiles (hoverable logos) and their links.
- Fallback: if a browser is unavailable, parse the static HTML and extract all visible logo links (category set to "Unknown").

Outputs:
- CSV: `startupmoldova_community_map.csv` with columns: category, name, url, domain, source.
- A quick preview and per-category counts.

Run the code cell below. If Playwright or its browser is not installed, the cell prints the error and switches to the static fallback, which still writes the CSV (with every category set to "Unknown").

In [4]:
# Scraper implementation (Playwright + fallback)
from __future__ import annotations
import time
import re
import asyncio
from typing import List, Dict, Tuple
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

COMMUNITY_URL = "https://www.startupmoldova.digital/community-map"
CATEGORIES = [
    "Mentoring and Events",
    "Accelerators, Incubators",
    "Co-working",
    "Funding",
    "Lobby, Advocacy",
]


def extract_domain(url: str | None) -> str:
    """Return the lower-cased host of *url* without a leading ``www.``.

    Only the host name is returned: a port number or ``user:password@``
    prefix in the URL is not part of the result, so
    ``https://www.Example.com:8080/x`` yields ``example.com``.

    Args:
        url: An absolute or protocol-relative URL. Relative paths,
            scheme-less strings and non-network schemes such as ``mailto:``
            carry no host and yield ``""``.

    Returns:
        The bare host name, or ``""`` when *url* is empty, has no host, or
        cannot be parsed.
    """
    if not url:
        return ""
    try:
        # .hostname (unlike .netloc) excludes credentials and the port and is
        # already lower-cased.
        host = urlparse(url).hostname or ""
    except ValueError:
        # urlparse rejects malformed network locations (e.g. an unclosed "[");
        # one bad link should not abort the whole scrape.
        return ""
    if host.startswith("www."):
        host = host[4:]
    return host


def canonical_name_from_domain(domain: str) -> str:
    """Build a display name from the first dot-separated label of *domain*.

    Hyphens in that label become spaces and the result is title-cased, e.g.
    ``"yep-moldova.org"`` -> ``"Yep Moldova"``. A falsy *domain* gives ``""``.
    """
    if domain:
        first_label, _, _ = domain.partition(".")
        spaced = " ".join(first_label.split("-"))
        return spaced.title()
    return ""


# ------------- Playwright scraper (async, Jupyter-safe) -------------
async def scrape_with_playwright_async(url: str) -> List[Dict[str, str]]:
    """Scrape the community map with headless Chromium, one category at a time.

    For every entry in ``CATEGORIES`` the function tries to click the matching
    control on the page, then collects the external links rendered in the area
    below the "Choose a service" heading.

    Args:
        url: Address of the community-map page.

    Returns:
        One dict per (category, domain) pair with the keys ``category``,
        ``name``, ``url``, ``domain`` and ``source``. When a pair is seen more
        than once, the last occurrence is kept.

    Raises:
        RuntimeError: If Playwright cannot be imported.
        Exception: Playwright errors (navigation, timeouts, browser launch)
            propagate to the caller.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception as e:
        # Chain the cause so the original import failure stays in the traceback.
        raise RuntimeError(f"Playwright import failed: {e}") from e

    rows: List[Dict[str, str]] = []

    # Runs in the page. Returns [name, href] pairs for external links that are
    # large enough to be tiles and sit within 1800px below the heading.
    js_extract = """
        (catLabel) => {
            const isExternal = (href) => {
                try { const u = new URL(href, location.href); return !u.hostname.includes('startupmoldova.digital'); } catch { return false; }
            };
            // First candidate that is non-empty AFTER trimming. A logo link's
            // textContent is often whitespace only; trimming each candidate
            // lets the image alt text be reached in that case.
            const firstNonBlank = (...candidates) => {
                for (const c of candidates) {
                    const t = (c || '').trim();
                    if (t) return t;
                }
                return '';
            };
            const heading = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6,div,p,span'))
                .find(el => (el.textContent||'').toLowerCase().includes('choose a service'));
            const rectH = heading ? heading.getBoundingClientRect() : {top: 0};
            const anchors = Array.from(document.querySelectorAll('a[href]'));
            const out = [];
            for (const a of anchors) {
                const href = a.getAttribute('href') || '';
                if (!href) continue;
                if (!isExternal(href)) continue;
                const r = a.getBoundingClientRect();
                // Keep links in the main grid area beneath the heading
                if (r.width < 40 || r.height < 30) continue;
                if (r.top < (rectH.top + 10) || r.top > (rectH.top + 1800)) continue;
                // Name candidates, in priority order
                const img = a.querySelector('img');
                const name = firstNonBlank(
                    a.getAttribute('aria-label'),
                    a.getAttribute('title'),
                    a.textContent,
                    img ? img.getAttribute('alt') : ''
                );
                out.push([name, href]);
            }
            return out;
        }
    """

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36")
            page = await ctx.new_page()
            page.set_default_timeout(25000)
            await page.goto(url, wait_until="networkidle")
            await page.wait_for_selector("text=Choose a service", timeout=20000)
            await asyncio.sleep(0.8)

            for cat in CATEGORIES:
                # Try several locator strategies to switch the category; the
                # first one that clicks wins.
                switched = False
                for attempt in (
                    lambda: page.get_by_role("button", name=cat).click(timeout=1200),
                    lambda: page.get_by_role("link", name=cat).click(timeout=1200),
                    lambda: page.get_by_text(cat, exact=True).first.click(timeout=1200),
                    lambda: page.locator(f"text={cat}").first.click(timeout=1200),
                ):
                    try:
                        await attempt()
                    except Exception:
                        # This strategy did not match or was not clickable;
                        # fall through to the next one.
                        continue
                    switched = True
                    break
                if not switched:
                    # Best effort: keep going, but make the risk visible
                    # instead of silently mislabelling rows.
                    print(
                        f"Warning: could not activate category {cat!r}; "
                        "tiles read for it may belong to the previously shown category."
                    )
                # Give the grid time to re-render after the click.
                await asyncio.sleep(0.8)

                pairs = await page.evaluate(js_extract, cat)
                for name, href in pairs:
                    domain = extract_domain(href)
                    if not domain:
                        continue
                    if not name:
                        name = canonical_name_from_domain(domain)
                    rows.append({
                        "category": cat,
                        "name": name,
                        "url": href,
                        "domain": domain,
                        "source": url,
                    })
        finally:
            # Close the browser even when navigation or extraction fails.
            await browser.close()

    # Deduplicate by (category, domain); a later row replaces an earlier one.
    dedup: Dict[Tuple[str, str], Dict[str, str]] = {
        (r["category"], r["domain"]): r for r in rows
    }
    return list(dedup.values())


def scrape_with_playwright(url: str) -> List[Dict[str, str]]:
    """Synchronous wrapper around ``scrape_with_playwright_async``.

    Works both from a plain script (no event loop running) and from an
    environment that already runs a loop, such as Jupyter.

    Args:
        url: Address of the community-map page.

    Returns:
        The rows produced by ``scrape_with_playwright_async``.

    Raises:
        Exception: Whatever the async scraper raises is propagated.
    """
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running: let asyncio.run create and clean up its own.
        # (asyncio.get_event_loop() is deprecated for this situation.)
        return asyncio.run(scrape_with_playwright_async(url))

    # A loop is already running in this thread; run_until_complete only works
    # on it after nest_asyncio has patched it to allow re-entry.
    try:
        import nest_asyncio
        nest_asyncio.apply()
    except Exception:
        # nest_asyncio is missing or cannot patch this loop type: run the
        # coroutine on a fresh loop in a worker thread and wait for it.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                lambda: asyncio.run(scrape_with_playwright_async(url))
            )
            return future.result()
    return running_loop.run_until_complete(scrape_with_playwright_async(url))


# ------------- Static HTML fallback -------------

def scrape_static(url: str) -> List[Dict[str, str]]:
    """Fallback scraper: parse the served HTML without running JavaScript.

    Collects every link with a host inside the container around the
    "Choose a service" text (or the whole document when that text or a
    suitable container is not found). Links pointing back to the scraped
    site itself are skipped, in line with the browser-based scraper. The
    category of each row is ``"Unknown"``.

    Args:
        url: Address of the community-map page.

    Returns:
        One dict per domain with the keys ``category``, ``name``, ``url``,
        ``domain`` and ``source``. When a domain occurs more than once, the
        last occurrence is kept.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    from bs4 import FeatureNotFound

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    try:
        soup = BeautifulSoup(resp.text, "lxml")
    except FeatureNotFound:
        # lxml is optional; the built-in parser keeps the fallback usable.
        soup = BeautifulSoup(resp.text, "html.parser")

    # Try to scope to the section, otherwise scan entire doc
    heading = None
    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "span"]):
        if "choose a service" in (tag.get_text(" ", strip=True) or "").lower():
            heading = tag
            break
    # find_parent returns None when no section/div/main encloses the match;
    # fall back to the whole document in that case as well.
    container = (heading.find_parent(["section", "div", "main"]) if heading else None) or soup

    site_domain = extract_domain(url)

    rows: List[Dict[str, str]] = []
    for a in container.find_all("a", href=True):
        href = a["href"].strip()
        dom = extract_domain(href)
        if not dom:
            continue
        # Skip links to the scraped site itself (navigation, footer, ...).
        if site_domain and (dom == site_domain or dom.endswith("." + site_domain)):
            continue
        img = a.find("img")
        candidates = (
            a.get("aria-label"),
            a.get("title"),
            a.get_text(" ", strip=True),
            img.get("alt") if img else None,
        )
        # First candidate with visible text; domain-derived name as last resort.
        name = next((c.strip() for c in candidates if c and c.strip()), "")
        if not name:
            name = canonical_name_from_domain(dom)
        rows.append({
            "category": "Unknown",
            "name": name,
            "url": href,
            "domain": dom,
            "source": url,
        })

    # Deduplicate by domain; a later row replaces an earlier one.
    dedup: Dict[str, Dict[str, str]] = {r["domain"]: r for r in rows}
    return list(dedup.values())


# ------------- Main execution -------------
# Run the browser-based scraper first; on any failure report it and use the
# static HTML scraper instead.
try:
    data = scrape_with_playwright(COMMUNITY_URL)
except Exception as e:
    print("Playwright path failed, switching to static HTML fallback:", e)
    data = scrape_static(COMMUNITY_URL)
    used_fallback = True
else:
    used_fallback = False


# Fixing the column list keeps the CSV header (category, name, url, domain,
# source) present even when no rows were scraped.
df = pd.DataFrame(data, columns=["category", "name", "url", "domain", "source"])

# Normalize URLs
import re as _re

def normalize_url(u: str) -> str:
    """Return *u* stripped and carrying an explicit ``http(s)://`` scheme.

    Args:
        u: Raw link text; ``None`` or blank input is returned as ``""``.

    Returns:
        *u* unchanged when it already starts with ``http://`` or ``https://``
        (case-insensitive); ``https:`` + *u* for protocol-relative links
        (``//host/path``); otherwise ``https://`` + *u*.
    """
    u = (u or "").strip()
    if not u:
        return u
    if u.startswith("//"):
        # Protocol-relative link: only the scheme is missing, so prefixing
        # "https://" would yield "https:////host".
        return "https:" + u
    if not u.lower().startswith(("http://", "https://")):
        u = "https://" + u
    return u

# Make sure every stored URL carries a scheme, then persist and summarise.
_has_rows = not df.empty
if _has_rows and "url" in df.columns:
    df["url"] = df["url"].map(normalize_url)

out_path = Path.cwd().joinpath("startupmoldova_community_map.csv")
df.to_csv(out_path, index=False)

print(f"Saved {len(df)} rows to {out_path}")
if used_fallback:
    print("Mode:", "fallback")
else:
    print("Mode:", "playwright")

if _has_rows:
    # Per-category counts first, then a preview of the first 25 rows.
    if "category" in df.columns:
        _counts = df.groupby("category").size()
        display(_counts.rename("count").reset_index())
    display(df.head(25))
else:
    print("No rows extracted. Try rerunning or check site availability.")

Saved 40 rows to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Cahul/startupmoldova_community_map.csv
Mode: playwright


Unnamed: 0,category,count
0,"Accelerators, Incubators",5
1,Co-working,8
2,Funding,8
3,"Lobby, Advocacy",6
4,Mentoring and Events,13


Unnamed: 0,category,name,url,domain,source
0,Mentoring and Events,Dreamups,https://dreamups.com/,dreamups.com,https://www.startupmoldova.digital/community-map
1,Mentoring and Events,Yepmoldova,https://yepmoldova.org/,yepmoldova.org,https://www.startupmoldova.digital/community-map
2,Mentoring and Events,Technovator,https://technovator.world/,technovator.world,https://www.startupmoldova.digital/community-map
3,Mentoring and Events,Mozaic,https://mozaic.md/,mozaic.md,https://www.startupmoldova.digital/community-map
4,Mentoring and Events,Mitp,https://mitp.md/p/web/ePark#about,mitp.md,https://www.startupmoldova.digital/community-map
5,Mentoring and Events,Investitii,https://investitii.md/,investitii.md,https://www.startupmoldova.digital/community-map
6,Mentoring and Events,Ebrd,https://www.ebrd.com/moldova.html,ebrd.com,https://www.startupmoldova.digital/community-map
7,Mentoring and Events,Ict,https://ict.md/,ict.md,https://www.startupmoldova.digital/community-map
8,Mentoring and Events,Xy,https://xy.md/,xy.md,https://www.startupmoldova.digital/community-map
9,Mentoring and Events,Finantari,https://www.finantari.md/,finantari.md,https://www.startupmoldova.digital/community-map
