# AmCham Moldova members scraper

This notebook extracts all member names and their website URLs (when available) from:

- https://www.amcham.md/?go=members&sub=members

Behavior:
- Each member card has a name and, on hover, may show a website; not every member has one.
- The scraper collects the name and the first website-like string found in the same card; if none is found, it records a blank value.
- Results are saved as `amcham_members.csv` in the current working directory (the notebook's directory when Jupyter is started there) and are also displayed inline.

In [None]:
# Utilities and parsers
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup


# Members listing page that gets scraped; the same URL is stored in the
# "source" field of every extracted row.
AMCHAM_MEMBERS_URL = "https://www.amcham.md/?go=members&sub=members"


def fetch_soup(url: str, timeout: int = 30) -> BeautifulSoup:
    """Download *url* and return its HTML parsed with the lxml parser.

    The request is sent with a desktop-browser User-Agent string.

    Args:
        url: Page to download.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            response status signals an error.
    """
    browser_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    )
    response = requests.get(
        url,
        headers={"User-Agent": browser_agent},
        timeout=timeout,
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")


# URL-like token detector (keeps simple domains and http/https).
# Group 1 captures, case-insensitively:
#   - an optional "http://" or "https://" prefix,
#   - an optional "www." prefix,
#   - a run of word characters, dots and hyphens followed by "." and a
#     2+ letter suffix,
#   - an optional path made only of word characters and "-", ".", "/", "%".
# The pattern is unanchored, so it also matches a domain embedded in a longer
# token, and the path part stops at characters such as "?", "=", "&" or "#"
# (query strings and fragments are not captured).
URL_RE = re.compile(r"((?:https?://)?(?:www\.)?[\w.-]+\.[a-z]{2,}(?:/[\w\-./%]*)?)", re.IGNORECASE)


def is_external(href: str | None) -> bool:
    """True if the href points outside amcham.md; relative links are NOT external."""
    if not href:
        return False
    p = urlparse(href)
    # If no scheme and no netloc, it's a relative link -> not external
    if not p.scheme and not p.netloc:
        return False
    host = (p.netloc or "").lower()
    return "amcham.md" not in host


def canonicalize(base_url: str, href: str | None) -> str | None:
    if not href:
        return None
    return urljoin(base_url, href)


def _read_more_count(node) -> int:
    """Count the ``<a>`` elements under *node* labelled "read more".

    The label comparison uses the anchor's stripped text, case-insensitively.
    """
    labels = (anchor.get_text(strip=True) for anchor in node.find_all("a"))
    return len([label for label in labels if label.lower() == "read more"])


def find_card_container(read_more_a):
    """Return the outermost ancestor that still wraps a single "Read More" link.

    Starting at *read_more_a*, the walk moves up at most 8 levels. It stops
    as soon as the parent is missing, is not a div/article/li/section, or
    contains more than one "read more" anchor. The last node accepted before
    stopping is returned; this is *read_more_a* itself when no parent
    qualifies.
    """
    wrapper_tags = {"div", "article", "li", "section"}
    card = read_more_a
    levels_left = 8
    while levels_left > 0:
        levels_left -= 1
        ancestor = card.parent
        if not ancestor or ancestor.name not in wrapper_tags:
            break
        # More than one button means the ancestor spans several member tiles.
        if _read_more_count(ancestor) > 1:
            break
        card = ancestor
    return card


def extract_member_cards(soup: BeautifulSoup) -> list[dict]:
    """Extract one row per member from the parsed members page.

    Primary strategy: each 'Read More' anchor corresponds to one member tile.
    The tile's website (if present) is attached to the NEXT heading after
    that button, which matches the page's DOM ordering (overlay before title
    for many items).

    Website lookup, in order:
      1. the first external link in the card other than the button itself;
      2. otherwise the first URL-like token in the card's visible text that
         does not mention amcham.md (every token is considered, so a leading
         amcham.md address no longer hides a later external one).

    Args:
        soup: Parsed members page.

    Returns:
        A list of dicts with keys ``"name"``, ``"website"`` (``""`` when none
        was found) and ``"source"``, de-duplicated by stripped name. When a
        name occurs more than once, an entry with a website replaces one
        without.
    """
    rows: list[dict] = []

    read_more_links = [
        a for a in soup.find_all("a")
        if a.get_text(strip=True).lower() == "read more"
    ]

    for a in read_more_links:
        card = find_card_container(a)
        if not card:
            continue

        # WEBSITE, attempt 1: an external link inside the card overlay.
        website = ""
        for link in card.find_all("a", href=True):
            if link is a:
                continue
            href = (link.get("href") or "").strip()
            if is_external(href) and URL_RE.search(href):
                website = href
                break

        # WEBSITE, attempt 2: a URL-like token in the card's text.
        if not website:
            text = "\n".join(card.stripped_strings)
            # Drop the button label so it cannot take part in a match.
            text = re.sub(r"\bRead\s*More\b", "", text, flags=re.IGNORECASE)
            for match in URL_RE.finditer(text):
                token = match.group(1).strip()
                if "amcham.md" not in token.lower():
                    website = token
                    break

        # NAME: the next heading-like element after this button in document
        # order (works with overlay-first DOM). The lookup is not limited to
        # the card, so it can land on an element outside it.
        next_heading = a.find_next(["h3", "h4", "h5", "h6", "strong"])
        name = next_heading.get_text(" ", strip=True) if next_heading else ""
        if not name:
            continue

        rows.append({
            "name": name,
            "website": website,
            "source": AMCHAM_MEMBERS_URL,
        })

    # Deduplicate by name, prefer entries with a non-empty website.
    dedup: dict[str, dict] = {}
    for row in rows:
        key = row["name"].strip()
        if key not in dedup or (not dedup[key]["website"] and row["website"]):
            dedup[key] = row
    return list(dedup.values())


In [16]:
# Run extraction and save CSV
soup = fetch_soup(AMCHAM_MEMBERS_URL)
rows = extract_member_cards(soup)

import pandas as _pd

# Pin the column set: if the scrape returns no rows (e.g. after a change in
# the page layout) the frame still has a "website" column, so the steps
# below work on an empty table instead of failing with a KeyError.
df = _pd.DataFrame(rows, columns=["name", "website", "source"]).reset_index(drop=True)

# Normalize website strings a bit: trim and make sure empty stays empty

def _clean_site(s: str) -> str:
    """Trim a website string.

    Falsy input becomes ``""``. Surrounding whitespace is stripped first,
    then any trailing run of ``.``, ``,``, ``;`` or spaces (punctuation
    commonly found in copy) is removed.
    """
    trimmed = (s or "").strip()
    return trimmed.rstrip(".,; ")

# Apply _clean_site to every value of the "website" column and store the result back
df["website"] = df["website"].map(_clean_site)

# Heuristic alignment fix: check if many websites seem to refer to the NEXT name
# If so, shift websites down by one (assign previous site's value to current name)

def _looks_shifted(names: _pd.Series, sites: _pd.Series, sample: int = 30) -> bool:
    """Guess whether websites are attached to the row *before* their member.

    For the first ``sample`` adjacent row pairs, the function checks whether
    the website at position ``i`` contains any alphanumeric token (4+
    characters, lower-cased) taken from the name at position ``i + 1``.

    Rows are addressed by position, not by index label, so the result does
    not depend on the Series index. Non-string values (e.g. NaN) never count
    as a hit.

    Args:
        names: Member names, in page order.
        sites: Websites, in the same order as *names*.
        sample: Maximum number of adjacent pairs to inspect.

    Returns:
        True when at least ``max(5, int(0.5 * n))`` of the ``n`` inspected
        pairs match, i.e. at least half of them and never fewer than five.
    """
    import re as _re

    name_values = list(names)
    site_values = list(sites)
    # Each check needs sites[i] and names[i + 1]; clamp so short or empty
    # inputs simply yield zero checks.
    n = max(0, min(sample, len(name_values) - 1, len(site_values)))

    token_re = _re.compile(r"[a-z0-9]+")
    hits = 0
    for site, next_name in zip(site_values[:n], name_values[1:n + 1]):
        if not isinstance(site, str) or not isinstance(next_name, str):
            continue
        site_lc = site.lower()
        # tokens: every alphanumeric run of the next name with 4+ characters
        tokens = [t for t in token_re.findall(next_name.lower()) if len(t) >= 4]
        if any(t in site_lc for t in tokens):
            hits += 1
    return hits >= max(5, int(0.5 * n))

# Only when the heuristic fires: move every website one row down. The first
# row has no predecessor, so its resulting NaN is replaced by "".
if _looks_shifted(df["name"], df["website"], sample=40):
    df["website"] = df["website"].shift(1).fillna("")

# Save into the current working directory (without the index column)
out_path = Path.cwd() / "amcham_members.csv"
df.to_csv(out_path, index=False)

print(f"Saved {len(df)} members to: {out_path}")
# Last expression of the cell: the first 20 rows, for inline display
df.head(20)

Saved 191 members to: /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Cahul/amcham_members.csv


Unnamed: 0,name,website,source
0,Members:,,https://www.amcham.md/?go=members&sub=members
1,AQUATRADE,,https://www.amcham.md/?go=members&sub=members
2,Hospices of Hope Moldova,www.hospicesofhope.md/en,https://www.amcham.md/?go=members&sub=members
3,Abbott,www.abbott.com,https://www.amcham.md/?go=members&sub=members
4,ACI Partners,www.aci.md,https://www.amcham.md/?go=members&sub=members
5,Agro Mester HD,www.agromester.md,https://www.amcham.md/?go=members&sub=members
6,AgroProfi,www.agroprofi.md,https://www.amcham.md/?go=members&sub=members
7,Agrostoc,https://agrostoc.md/,https://www.amcham.md/?go=members&sub=members
8,ArtGranit,https://artgranit.md/,https://www.amcham.md/?go=members&sub=members
9,AUTOMALL GROUP,www.automall.group/,https://www.amcham.md/?go=members&sub=members


In [17]:
# Spot-check a handful of known members: print the website recorded for each
# one that is present in the table, silently skipping the rest
_df = df.set_index("name")
for key in ("Avis Car Rental", "Baker Tilly", "AQUATRADE", "ACI Partners"):
    if key not in _df.index:
        continue
    print(key, "->", _df.loc[key, "website"])

Avis Car Rental -> www.avis.md
Baker Tilly -> www.bakertilly.md
AQUATRADE -> 
ACI Partners -> www.aci.md
