# Extract AmCham Serbia Members (name + website)

This notebook scrapes all members from https://amcham.rs/members/ (all letters are on one page) and extracts:

- company_name
- website (as shown on the page)

Results are previewed and saved to CSV in this folder as `amcham_members.csv`.

In [1]:
# Setup: imports and configuration
import re, time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse

AMCHAM_URL = "https://amcham.rs/members/"



In [2]:
# Fetch helper with simple retry and user-agent
def fetch_html(url: str, timeout: int = 30, retries: int = 3, backoff: float = 1.5) -> str:
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
        'Accept-Language': 'en,en-US;q=0.8',
    }
    err = None
    for i in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            err = e
            time.sleep(backoff ** (i + 1))
    raise RuntimeError(f"Failed to fetch {url}: {err}")

html = fetch_html(AMCHAM_URL)

In [3]:
# Parse members: extract (company_name, website) pairs
URL_RE = re.compile(r"((?:https?://)?(?:www\.)?[\w.-]+\.[a-z]{2,}(?:/[\w\-./?%&=]*)?)", re.I)

def normalize_site(site: str) -> str:
    s = site.strip()
    s = s.rstrip('.,;:\u2013\u2014')
    if not s:
        return s
    if not s.lower().startswith(('http://', 'https://')):
        s = 'https://' + s.lstrip('/')
    s = re.sub(r'^(https?://)+(https?://)+', r'\1', s)
    return s

def choose_container(soup: BeautifulSoup):
    selectors = [
        'div.entry-content',
        'div.page-content',
        'div.elementor-widget-text-editor',
        'div.elementor-widget-container',
        'main',
        'article',
        'div.site-content',
        'body'
    ]
    best, best_score = soup.body, -1
    for sel in selectors:
        for node in soup.select(sel):
            text = node.get_text('\n', strip=True)
            score = text.count(' | ') + text.count('www.') + len(node.find_all('a'))
            if score > best_score:
                best, best_score = node, score
    return best

def parse_members(html: str):
    soup = BeautifulSoup(html, 'html.parser')
    container = choose_container(soup)
    results = []
    seen = set()
    for tag in container.find_all(['p','li','div','span'], recursive=True):
        text = tag.get_text(' ', strip=True)
        if not text:
            continue
        m = URL_RE.search(text)
        if not m:
            continue
        site_raw = m.group(1)
        site = site_raw.replace('\u200b','').strip()
        name = None
        a = tag.find('a')
        if a and a.get_text(strip=True):
            name = a.get_text(' ', strip=True)
        if not name:
            if '|' in text:
                name = text.split('|', 1)[0].strip()
            else:
                name = text.replace(site_raw, '').strip().strip('-|:').strip()
        if len(name) < 2:
            continue
        display = site
        canonical = normalize_site(site)
        key = (name, canonical.lower())
        if key in seen:
            continue
        seen.add(key)
        results.append({'company_name': name, 'website': display})
    return results

members = parse_members(html)

In [4]:
# Clean results and create DataFrame
from pathlib import Path

def domain_of(url_like: str) -> str:
    u = url_like.strip()
    if not u.lower().startswith(('http://','https://')):
        u = 'https://' + u
    try:
        netloc = urlparse(u).netloc.lower()
        if netloc.startswith('www.'):
            netloc = netloc[4:]
        return netloc
    except Exception:
        return ''

clean = []
for row in members:
    name = row['company_name'].strip()
    site = row['website'].strip()
    dom = domain_of(site)
    if not dom or dom.endswith('amcham.rs'):
        continue
    if '@' in name or 'email' in name.lower():
        continue
    if re.search(r'\b\+?\d[\d\s()./-]{5,}', name):
        continue
    clean.append({'company_name': name, 'website': site})

df = pd.DataFrame(clean).drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,company_name,website
0,A1 Srbija d.o.o. Beograd,www.a1.rs
1,A3 Architects Studio d.o.o.,www.a3-architects.com
2,AbbVie d.o.o. Beograd,www.abbvie.com
3,Actavis d.o.o.,www.actavis.rs
4,Addiko Bank,www.addiko.rs


In [5]:
# Save to CSV in the same folder
from pathlib import Path

output_path = Path('/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/amcham_members.csv')
df.to_csv(output_path, index=False)
print(f'Saved {len(df)} rows to {output_path}')

Saved 271 rows to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/amcham_members.csv
