# Moldovan research ecosystem scraper (.md/.ro)

This notebook scrapes two ANCD pages and extracts organization names with website domains ending in .md (Moldova) or .ro (Romania):

- Innovation incubators and science/technology parks: https://ancd.gov.md/en/content/innovation-incubators-science-and-technology-parks
- Research organizations: https://ancd.gov.md/en/content/research-organizations

What you get:
- Two CSV files saved in the same directory as this notebook:
  - innovation_incubators_parks_md_ro.csv
  - research_organizations_md_ro.csv

How to use:
- Run the code cell below. It will fetch the pages, parse them, filter to .md/.ro domains, display the results, and save the CSVs.

In [1]:
# Simple scraper for two ANCD pages (filters to .md/.ro)
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


# ---------- HTTP and parsing helpers ----------
def fetch_soup(url: str, timeout: int = 30) -> BeautifulSoup:
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def canonicalize_url(base_url: str, href: str | None) -> str | None:
    if not href:
        return None
    href = href.strip()
    # Turn scheme-less urls like 'www.site.md' into http://www.site.md
    if href.startswith("www."):
        href = "http://" + href
    return urljoin(base_url, href)


def extract_domain(url: str | None) -> str:
    if not url:
        return ""
    p = urlparse(url)
    host = p.netloc or ""
    if not host and p.path.startswith("www."):
        host = p.path.split("/")[0]
    host = host.lower()
    if host.startswith("www."):
        host = host[4:]
    return host


def is_md_or_ro(host: str) -> bool:
    return host.endswith(".md") or host.endswith(".ro")


# ---------- Scrapers ----------
def scrape_innovation_incubators(url: str) -> pd.DataFrame:
    """Extract links and associated entity names from the page text.
    Heuristics:
    - For each anchor with .md/.ro, use the nearest <li> text (before 'Pagina web') as the entity name.
    - Fallback to surrounding paragraph or the link text itself.
    - Deduplicate by domain.
    """
    soup = fetch_soup(url)
    rows: list[dict] = []

    for a in soup.find_all("a", href=True):
        full_url = canonicalize_url(url, a.get("href"))
        domain = extract_domain(full_url)
        if not is_md_or_ro(domain):
            continue

        # Try to infer entity name from the nearest list item or paragraph
        name = None
        li = a.find_parent("li")
        if li is not None:
            text = " ".join(li.stripped_strings)
            parts = re.split(r"pagina\s*web\s*: ?", text, flags=re.IGNORECASE)
            candidate = parts[0].strip() if parts else text.strip()
            # Prefer quoted entity names if present
            m = re.search(r"[\"“„]([^\"””]+)[\"””]", candidate)
            name = (m.group(1).strip() if m else candidate)
        else:
            p = a.find_parent(["p", "div"]) or soup
            text = " ".join(p.stripped_strings)
            parts = re.split(r"pagina\s*web\s*: ?", text, flags=re.IGNORECASE)
            candidate = parts[0].strip() if parts else (a.get_text(strip=True) or domain)
            name = candidate or domain

        if not name:
            name = a.get_text(strip=True) or domain

        rows.append({
            "source": url,
            "entity_name": name,
            "url": full_url,
            "domain": domain,
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.drop_duplicates(subset=["domain"]).reset_index(drop=True)
    return df


def scrape_research_organizations(url: str) -> pd.DataFrame:
    """Parse tables and extract organization name (2nd column) and website link (3rd+ columns).
    Only keep .md/.ro domains and deduplicate by domain.
    """
    soup = fetch_soup(url)
    rows: list[dict] = []

    for table in soup.find_all("table"):
        for tr in table.find_all("tr"):
            cells = tr.find_all(["td", "th"])
            if len(cells) < 3:
                continue
            name = cells[1].get_text(" ", strip=True)
            link_tag = None
            # The site is usually in the 3rd column; search 3rd+ just in case
            for td in cells[2:]:
                link_tag = td.find("a", href=True)
                if link_tag:
                    break
            if not link_tag:
                continue
            full_url = canonicalize_url(url, link_tag.get("href"))
            domain = extract_domain(full_url)
            if not is_md_or_ro(domain):
                continue
            # Clean leading numbering like "1." or "2)"
            name = re.sub(r"^\s*\d+[\.)]\s*", "", name)
            rows.append({
                "source": url,
                "entity_name": name,
                "url": full_url,
                "domain": domain,
            })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.drop_duplicates(subset=["domain"]).reset_index(drop=True)
    return df


# ---------- Main run ----------
URL_INCUBATORS = "https://ancd.gov.md/en/content/innovation-incubators-science-and-technology-parks"
URL_RESEARCH = "https://ancd.gov.md/en/content/research-organizations"

incubators_df = scrape_innovation_incubators(URL_INCUBATORS)
research_df = scrape_research_organizations(URL_RESEARCH)

# Save results next to this notebook
out_dir = Path.cwd()
file_incubators = out_dir / "innovation_incubators_parks_md_ro.csv"
file_research = out_dir / "research_organizations_md_ro.csv"

incubators_df.to_csv(file_incubators, index=False)
research_df.to_csv(file_research, index=False)

print("Saved files:")
print(file_incubators)
print(file_research)

# Display a preview
display(incubators_df)
display(research_df)



Saved files:
/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Cahul/innovation_incubators_parks_md_ro.csv
/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Cahul/research_organizations_md_ro.csv


Unnamed: 0,source,entity_name,url,domain
0,https://ancd.gov.md/en/content/innovation-incu...,Skip to main content,https://ancd.gov.md/en/content/innovation-incu...,ancd.gov.md
1,https://ancd.gov.md/en/content/innovation-incu...,Horizon Europe National Office Description,https://horizoneurope.md/ro,horizoneurope.md
2,https://ancd.gov.md/en/content/innovation-incu...,pstacademica.md,http://pstacademica.md/,pstacademica.md
3,https://ancd.gov.md/en/content/innovation-incu...,incubator.utm.md,http://www.incubator.utm.md/,incubator.utm.md
4,https://ancd.gov.md/en/content/innovation-incu...,inno-center.md,http://www.inno-center.md/,inno-center.md
5,https://ancd.gov.md/en/content/innovation-incu...,inventica.usm.md,http://www.inventica.usm.md/,inventica.usm.md
6,https://ancd.gov.md/en/content/innovation-incu...,it4ba.ase.md,http://www.it4ba.ase.md/,it4ba.ase.md
7,https://ancd.gov.md/en/content/innovation-incu...,edu.asm.md,http://edu.asm.md/en/content/univer-science-cl...,edu.asm.md
8,https://ancd.gov.md/en/content/innovation-incu...,biodanubius.ro,http://www.biodanubius.ro/,biodanubius.ro
9,https://ancd.gov.md/en/content/innovation-incu...,Sesizați CNA,https://www.cna.md/pageview.php?l=ro&idc=112&t...,cna.md


Unnamed: 0,source,entity_name,url,domain
0,https://ancd.gov.md/en/content/research-organi...,Universitatea de Stat din Moldova,http://usm.md/,usm.md
1,https://ancd.gov.md/en/content/research-organi...,Universitatea Pedagogică de Stat din Moldova ”...,https://www.upsc.md/en/,upsc.md
2,https://ancd.gov.md/en/content/research-organi...,Universitatea Pedagogică de Stat ”Alecu Russo”...,http://usarb.md/,usarb.md
3,https://ancd.gov.md/en/content/research-organi...,Universitatea de Stat de Educație Fizică și Sport,http://www.usefs.md/,usefs.md
4,https://ancd.gov.md/en/content/research-organi...,Universitatea de Stat ”Dimitrie Cantemir”,http://edu.asm.md/md,edu.asm.md
5,https://ancd.gov.md/en/content/research-organi...,Universitatea Cooperatist Comercială,http://www.uccm.md/en/,uccm.md
6,https://ancd.gov.md/en/content/research-organi...,Universitatea Tehnică a Moldovei,https://utm.md/,utm.md
7,https://ancd.gov.md/en/content/research-organi...,Academia de Studii Economice din Moldova,http://www.ase.md/ro/,ase.md
8,https://ancd.gov.md/en/content/research-organi...,"Academia de Muzică, Teatru și Arte Plastice",http://amtap.md/,amtap.md
9,https://ancd.gov.md/en/content/research-organi...,Institutul de Științe ale Educației,http://www.ise.md/,ise.md


## Notes
- Filtering keeps only domains ending in `.md` or `.ro`.
- If the ANCD pages change structure, you can tweak the simple heuristics:
  - For incubators/parks, we grab the nearest list item text before “Pagina web”.
  - For research organizations, we read the 2nd column as the name and the next column with a link as the website.
- The CSVs are saved next to this notebook as:
  - `innovation_incubators_parks_md_ro.csv`
  - `research_organizations_md_ro.csv`