# Startap Portal Serbia — Organizations extraction

Goal: Scrape https://startap.gov.rs/en/organizations/ and produce a CSV with columns: group, name, website, domain, source.

Approach:
- Try a fast static parse (requests + BeautifulSoup).
- If the static parse returns 0 rows, fall back to a browser approach (Playwright) to extract all cards, including any lazy-loaded content.

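Output file: `startap_portal_serbia_organizations.csv`, written to the notebook's current working directory (intended location: `Input/Novi Sad/startap_portal_serbia_organizations.csv`).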

In [None]:
# Imports and helpers
from __future__ import annotations
import re
import time
from typing import List, Dict, Tuple, Optional
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, urljoin
import os
import sys

# Page to scrape, and where the resulting CSV is written.
SOURCE_URL = "https://startap.gov.rs/en/organizations/"
notebook_dir = os.path.abspath(".")  # absolute path of the current working directory
output_path = os.path.join(notebook_dir, "startap_portal_serbia_organizations.csv")

# One shared HTTP session that sends a desktop-browser User-Agent on every request.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121 Safari/537.36"
)

def fetch_html(url: str, retries: int = 2, timeout: int = 30) -> Optional[str]:
    """Fetch *url* through the shared session and return the body text.

    Args:
        url: Address to request.
        retries: Number of additional attempts after the first one.
        timeout: Per-request timeout in seconds, passed to ``SESSION.get``.

    Returns:
        The response text of the first HTTP 200 reply, or ``None`` when every
        attempt failed (request error or non-200 status).
    """
    attempts = retries + 1
    for i in range(attempts):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
        except requests.RequestException:
            # Network/protocol failure: treated like a bad status, retry below.
            pass
        # Back off after *any* failed attempt (error or non-200), with a
        # linearly growing delay, but do not sleep once no attempts remain.
        if i < attempts - 1:
            time.sleep(1 + i)
    return None

def get_soup(url: str) -> Optional[BeautifulSoup]:
    """Download *url* and parse it with the lxml parser.

    Returns ``None`` when ``fetch_html`` yields no (or empty) HTML.
    """
    markup = fetch_html(url)
    return BeautifulSoup(markup, "lxml") if markup else None

def canonicalize_url(base: str, href: str) -> Optional[str]:
    """Resolve *href* against *base* and return an absolute URL.

    Args:
        base: URL of the page the link was found on.
        href: Raw ``href`` attribute value; may be relative, padded with
            whitespace, empty or ``None``.

    Returns:
        The absolute URL, or ``None`` when *href* is empty/whitespace-only or
        uses a non-navigational scheme (``mailto:``, ``tel:``, ``javascript:``).
    """
    # Strip before testing for emptiness: a whitespace-only href would
    # otherwise be joined as "" and silently resolve to the base page itself.
    href = (href or "").strip()
    if not href:
        return None
    # URL schemes are case-insensitive, so compare on a lowered copy.
    if href.lower().startswith(("mailto:", "tel:", "javascript:")):
        return None
    return urljoin(base, href)

def extract_domain(u: str) -> str:
    """Return the lower-cased host of *u* without a leading ``www.``.

    Uses the parsed ``hostname`` rather than ``netloc`` so that a port or
    ``user:password@`` prefix never ends up in the domain (which would defeat
    de-duplication by domain).

    Args:
        u: Absolute URL.

    Returns:
        The bare host name, or ``""`` when *u* has no host or cannot be parsed.
    """
    try:
        # ``hostname`` is already lower-cased; it is None when there is no host.
        host = urlparse(u).hostname or ""
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on malformed input such as a bad IPv6 literal.
        return ""
    return host[4:] if host.startswith("www.") else host



In [3]:
# Static scraper
def parse_startap_static(url: str = SOURCE_URL) -> List[Dict]:
    """Scrape organization entries from the static HTML of *url*.

    Every anchor whose visible text contains "visit website" is treated as one
    organization. Its name is looked up inside the enclosing card-like
    container, falling back to the closest preceding ``h3``/``h2``.

    Args:
        url: Listing page to fetch; also recorded as each row's ``source``.

    Returns:
        Rows with keys ``group``, ``name``, ``website``, ``domain`` and
        ``source``, de-duplicated by domain (first occurrence wins). Empty
        when the page could not be loaded.
    """
    soup = get_soup(url)
    if not soup:
        print("Static: failed to load HTML")
        return []

    def is_visit_link(tag) -> bool:
        # Collapse whitespace so link text split across nested tags
        # ("Visit <span>website</span>") still matches.
        text = " ".join((tag.get_text(" ") or "").split())
        return "visit website" in text.lower()

    anchors = [a for a in soup.find_all("a", href=True) if is_visit_link(a)]

    def find_title(node) -> Optional[str]:
        # Pass 1: climb up to five levels looking for a card-like container
        # (by class name) and take the first usable title element inside it.
        cur = node
        for _ in range(5):
            if cur is None:
                break
            classes = " ".join(cur.get("class") or []).lower()
            if any(tok in classes for tok in ("card", "organization", "org", "item", "entry")):
                for sel in ("h3", "h2", "a", "strong"):
                    for cand in cur.find_all(sel):
                        # Never report the "Visit website" link (or markup
                        # inside it) as the organization's name.
                        if cand is node or is_visit_link(cand):
                            continue
                        text = cand.get_text(strip=True)
                        if text:
                            return text
            cur = cur.parent

        # Pass 2: nearest heading that precedes the node or one of its ancestors.
        cur = node
        for _ in range(3):
            if cur is None:
                break
            heading = cur.find_previous(["h3", "h2"])
            if heading is not None and heading.get_text(strip=True):
                return heading.get_text(strip=True)
            cur = cur.parent
        return None

    rows: List[Dict] = []
    for a in anchors:
        href = canonicalize_url(url, a.get("href"))
        if not href:
            continue
        rows.append({
            "group": "organizations",
            "name": find_title(a) or "",
            "website": href,
            "domain": extract_domain(href),
            # Record the page actually scraped, not the module default.
            "source": url,
        })

    # Dedupe by domain; rows without a domain are dropped.
    seen = set()
    dedup = []
    for r in rows:
        key = r["domain"]
        if not key or key in seen:
            continue
        seen.add(key)
        dedup.append(r)

    print(f"Static parse: found {len(dedup)} rows")
    return dedup

# Run the static scraper and write the CSV when it produced any rows.
static_rows = parse_startap_static()
if not static_rows:
    print("Static returned 0; you can run the Playwright fallback cell next.")
else:
    frame = pd.DataFrame(static_rows)
    frame.to_csv(output_path, index=False)
    print(f"Wrote {output_path}")

Static parse: found 46 rows
Wrote /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/startap_portal_serbia_organizations.csv


In [None]:
# Playwright fallback (handles dynamic loading if any)
import asyncio

async def scrape_startap_playwright(headless: bool = True) -> List[Dict]:
    """Scrape organization entries from ``SOURCE_URL`` with a Chromium browser.

    Loads the page, dismisses a cookie banner if one is found, scrolls until
    the document height stops growing, then collects every "Visit website"
    link together with a title taken from its card.

    Args:
        headless: Whether to launch Chromium without a visible window.

    Returns:
        Rows with keys ``group``, ``name``, ``website``, ``domain`` and
        ``source``, de-duplicated by domain (first occurrence wins). Empty
        when Playwright cannot be imported.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception:
        # Kept broad on purpose: any import-time failure means "no fallback".
        print("Playwright not available in this kernel.")
        return []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, args=["--no-sandbox", "--disable-dev-shm-usage"])
        # Guarantee the browser is closed even if navigation or evaluation fails.
        try:
            ctx = await browser.new_context()
            page = await ctx.new_page()
            await page.goto(SOURCE_URL, wait_until="domcontentloaded", timeout=60000)

            # Accept cookies if banner exists (best effort: try each selector once)
            for sel in ["button:has-text('Accept')", "#cn-accept-cookie", "button[aria-label='Accept']", "//button[contains(., 'Accept')]"]:
                try:
                    await page.locator(sel).first.click(timeout=2000)
                    break
                except Exception:
                    pass

            # Wait for cards to appear (best effort; extraction runs regardless)
            try:
                await page.locator("a:has-text('Visit website')").first.wait_for(timeout=8000)
            except Exception:
                pass

            # Scroll to bottom to ensure all items are rendered; stop as soon
            # as the document height no longer changes.
            try:
                prev = 0
                for _ in range(12):
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(600)
                    cur = await page.evaluate("document.body.scrollHeight")
                    if cur == prev:
                        break
                    prev = cur
            except Exception:
                pass

            data = await page.evaluate("""
() => {
  const items = [];
  const isVisitLink = (el) => (el.textContent || '').toLowerCase().includes('visit website');
  const anchors = Array.from(document.querySelectorAll("a[href]")).filter(isVisitLink);

  const titleFor = (a) => {
    let cur = a;
    for (let i = 0; i < 6 && cur; i++) {
      cur = cur.parentElement;
      if (!cur) break;
      const cls = (cur.getAttribute('class') || '').toLowerCase();
      if (/(card|organization|org|item|entry)/.test(cls)) {
        // Skip the "Visit website" link itself so it is never used as the title.
        const t = Array.from(cur.querySelectorAll('h3, h2, a, strong'))
          .find(el => (el.textContent || '').trim() && !isVisitLink(el));
        if (t) return t.textContent.trim();
      }
    }
    // Fallback: look for closest preceding heading
    let node = a;
    for (let i = 0; i < 3 && node; i++) {
      const prev = node.previousElementSibling;
      if (prev && /H[23]/.test(prev.tagName) && (prev.textContent || '').trim()) return prev.textContent.trim();
      node = node.parentElement;
    }
    return '';
  };

  for (const a of anchors) {
    const href = a.href;
    if (!href || href.startsWith('mailto:') || href.startsWith('tel:')) continue;
    items.push({ name: titleFor(a), website: href });
  }
  return items;
}
""")
        finally:
            await browser.close()

        # Normalize & dedupe
        rows: List[Dict] = []
        for rec in data:
            href = rec.get("website")
            if not href:
                continue
            rows.append({
                "group": "organizations",
                "name": (rec.get("name") or "").strip(),
                "website": href,
                "domain": extract_domain(href),
                "source": SOURCE_URL,
            })

        # Keep the first row per domain; rows without a domain are dropped.
        seen = set()
        dedup = []
        for r in rows:
            if not r["domain"] or r["domain"] in seen:
                continue
            seen.add(r["domain"])
            dedup.append(r)
        return dedup

# Execute fallback if static_rows empty
fallback_rows: List[Dict] = []
if not static_rows:
    # Inside Jupyter an event loop is already running, so a plain
    # run_until_complete() raises "This event loop is already running".
    # Detect that case up front instead of waiting for get_event_loop() to fail.
    try:
        asyncio.get_running_loop()
        loop_running = True
    except RuntimeError:
        loop_running = False

    if loop_running:
        # nest_asyncio patches the running loop so it can be re-entered.
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_event_loop()
        fallback_rows = loop.run_until_complete(scrape_startap_playwright(headless=True))
    else:
        # Plain script / no running loop: let asyncio manage a fresh loop.
        fallback_rows = asyncio.run(scrape_startap_playwright(headless=True))

    print(f"Playwright fallback: found {len(fallback_rows)} rows")
    if fallback_rows:
        pd.DataFrame(fallback_rows).to_csv(output_path, index=False)
        print(f"Wrote {output_path}")
    else:
        print("No rows found via fallback. You can try setting headless=False inside scrape_startap_playwright().")

In [None]:
# Preview a few lines if file exists
import os
from pathlib import Path

# Places the CSV may have been written, checked in order.
candidates = [
    output_path,
    str(Path(os.getcwd())/"startap_portal_serbia_organizations.csv"),
    "/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/startap_portal_serbia_organizations.csv",
]

# First candidate that exists on disk, or None when none of them does.
found = next((path for path in candidates if os.path.exists(path)), None)

if found is None:
    print("Output file not found yet.")
else:
    print(f"Previewing: {found}")
    df = pd.read_csv(found)
    print(df.shape)
    display(df.head(10))

Previewing: /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/startap_portal_serbia_organizations.csv
(46, 5)


Unnamed: 0,group,name,website,domain,source
0,organizations,Founder Institute Srbija,https://fi.co/,fi.co,https://startap.gov.rs/en/organizations/
1,organizations,GameBiz Consulting,https://www.gamebizconsulting.com/,gamebizconsulting.com,https://startap.gov.rs/en/organizations/
2,organizations,Haos Community Space,https://www.haos.space/,haos.space,https://startap.gov.rs/en/organizations/
3,organizations,ICT Hub,https://www.icthub.rs/,icthub.rs,https://startap.gov.rs/en/organizations/
4,organizations,ICT Hub Venture,https://icthubventure.com/en/,icthubventure.com,https://startap.gov.rs/en/organizations/
5,organizations,Impact Hub,https://belgrade.impacthub.net/,belgrade.impacthub.net,https://startap.gov.rs/en/organizations/
6,organizations,InspiraHub,https://www.inspirahub.rs/,inspirahub.rs,https://startap.gov.rs/en/organizations/
7,organizations,IVM Coworking Loznica,https://ivmcoworking.com,ivmcoworking.com,https://startap.gov.rs/en/organizations/
8,organizations,Loud Crowd,https://loudcrowd.rs/,loudcrowd.rs,https://startap.gov.rs/en/organizations/
9,organizations,Mini Hub Workspace,https://coworkingsmederevo.rs/,coworkingsmederevo.rs,https://startap.gov.rs/en/organizations/
