# Extract NTPNS Members (name + website) across all categories

Scrape https://www.ntpns.rs/members?lang=en for all categories (Startups, R&D Companies, Research Institutes, Virtual Members, Innovation Partners).
For each entity, capture:

- name
- website (from the "Visit Website" button in the modal, when available)
- category

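Saves the results to `ntpns_members.csv` at the location set by `output_path` in the last cell (currently an absolute path, so adjust it if the file should land in this folder).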

In [1]:
# Imports and setup
import time, re, json
from pathlib import Path
import pandas as pd
from playwright.sync_api import TimeoutError as PlaywrightTimeout
import asyncio, nest_asyncio
from playwright.async_api import async_playwright, TimeoutError as AsyncPlaywrightTimeout

# Members listing page (English locale); it is loaded once and the categories
# below are then selected on it by clicking.
BASE_URL = "https://www.ntpns.rs/members?lang=en"

# Category labels, used both to find the clickable tab/button/link on the page
# and as the value of the `category` column in the output.
CATEGORIES = [
    "Startups",
    "R&D Companies",
    "Research Institutes",
    "Virtual Members",
    "Innovation Partners",
]

# Tunable timeouts (ms) to keep runs snappy in notebooks
PAGE_TIMEOUT = 6000  # default per-action timeout set on the page; also used for go_back
TAB_CLICK_TIMEOUT = 800  # click budget for category tabs and card headings
MODAL_TIMEOUT = 2000  # how long to wait for a member modal to appear after a click
CARD_HEADING_TIMEOUT = 500  # NOTE(review): presumably the budget for locating a card heading — confirm usage

In [2]:
# Ensure Chromium is available for Playwright (safe no-op if already installed)
import subprocess
import sys

try:
    # stdout/stderr are inherited so the installer's progress and warnings
    # show up in the notebook output. check=False keeps this best-effort.
    install_result = subprocess.run(
        [sys.executable, '-m', 'playwright', 'install', 'chromium'],
        check=False,
    )
except (OSError, subprocess.SubprocessError) as e:
    # The installer process could not be started at all.
    print('Playwright install hint:', e)
else:
    # A non-zero exit would otherwise go unnoticed until browser launch fails.
    if install_result.returncode != 0:
        print('Playwright install hint: installer exited with code', install_result.returncode)

BEWARE: your OS is not officially supported by Playwright; downloading fallback build for ubuntu20.04-x64.
BEWARE: your OS is not officially supported by Playwright; downloading fallback build for ubuntu20.04-x64.
BEWARE: your OS is not officially supported by Playwright; downloading fallback build for ubuntu20.04-x64.


In [3]:
# Scrape function using Playwright (async, with robust website capture + card-level fallbacks)
# nest_asyncio is applied before the run cell calls run_until_complete();
# presumably this is needed because the notebook kernel already has an event
# loop running — confirm if this code is moved outside a notebook.
nest_asyncio.apply()

def _norm_text(t: str) -> str:
    return re.sub(r"\s+", " ", (t or "").strip())

def _unique_preserve(seq):
    seen = set()
    out = []
    for s in seq:
        if s in seen:
            continue
        seen.add(s)
        out.append(s)
    return out

# CSS selector covering the dialog containers of several common UI toolkits.
_MODAL_SELECTOR = (
    '[role="dialog"], .modal, .modal-dialog, .modal-content, '
    '.MuiDialog-root, .chakra-modal__content'
)

# Accessible-name / text variants of the "Visit Website" control.
_WEBSITE_LABEL_RE = re.compile(
    'Visit Website|Website|Web site|Company website|Open website|Go to website',
    re.I,
)

# Close controls: any name containing "close", or a name that is ONLY a
# close glyph. The glyph part is anchored so names that merely contain an
# "x" (e.g. "Next") are not treated as close buttons.
_CLOSE_LABEL_RE = re.compile(r'close|^\s*[×✕x]\s*$', re.I)

# Clicks the nearest clickable-looking ancestor (or the element itself) and
# reports whether anything was actually clicked.
_CLICK_ANCESTOR_JS = r"""(el) => {
    const sel = "a,button,[role=button],[onclick], .card, .member, .company, .item, .startup";
    const c = el.closest(sel);
    if (!c) return false;
    c.click();
    return true;
}"""

# Collects absolute http(s) URLs from the heading and up to four of its
# ancestors (href, data-href and onclick attributes), nearest level first.
_CARD_URLS_JS = r"""(h) => {
    const urls = new Set();
    const add = u => { if (u && /^https?:\/\//i.test(u)) urls.add(u); };
    let cur = h;
    for (let i = 0; i < 5 && cur; i++, cur = cur.parentElement) {
        cur.querySelectorAll('a[href]').forEach(a => add(a.getAttribute('href')));
        cur.querySelectorAll('[data-href]').forEach(a => add(a.getAttribute('data-href')));
        const oc = cur.getAttribute('onclick') || '';
        const m = oc.match(/https?:\/\/[^'"\s)]+/i);
        if (m) add(m[0]);
    }
    return Array.from(urls);
}"""

# Collects absolute http(s) URLs inside a modal from data-href attributes,
# onclick handlers and the visible text.
_MODAL_URLS_JS = r"""(root) => {
    const urls = new Set();
    const add = u => { if (u && /^https?:\/\//i.test(u)) urls.add(u); };
    root.querySelectorAll('[data-href]').forEach(el => add(el.getAttribute('data-href')));
    root.querySelectorAll('[onclick]').forEach(el => {
        const oc = el.getAttribute('onclick') || '';
        const m = oc.match(/https?:\/\/[^'"\s)]+/ig);
        if (m) m.forEach(add);
    });
    const txt = root.innerText || '';
    const mtxt = txt.match(/https?:\/\/[^\s]+/ig);
    if (mtxt) mtxt.forEach(add);
    return Array.from(urls);
}"""


def _pick_website(urls):
    """Return the first URL not containing 'ntpns.rs', else the first URL, else None."""
    external = [u for u in urls if 'ntpns.rs' not in (u or '')]
    candidate = external[0] if external else (urls[0] if urls else None)
    return candidate or None


async def _select_category(page, cat):
    """Click the control labelled *cat*; return True if any strategy clicked."""
    for locator in (
        page.get_by_role('tab', name=cat),
        page.get_by_role('button', name=cat),
        page.get_by_role('link', name=cat),
        page.get_by_text(cat, exact=False),
    ):
        try:
            await locator.first.click(timeout=TAB_CLICK_TIMEOUT)
            return True
        except Exception:
            continue
    return False


async def _open_card(page, heading):
    """Try progressively more forceful ways to click a card; return True on a click."""
    try:
        await heading.scroll_into_view_if_needed(timeout=800)
    except Exception:
        pass
    # Strategy 1: direct click
    try:
        await heading.click(timeout=TAB_CLICK_TIMEOUT)
        return True
    except Exception:
        pass
    # Strategy 2: click closest clickable ancestor via JS. Only counts as
    # success when the script reports that it found something to click.
    try:
        el = await heading.element_handle(timeout=CARD_HEADING_TIMEOUT)
        if el and await page.evaluate(_CLICK_ANCESTOR_JS, el):
            return True
    except Exception:
        pass
    # Strategy 3: force click on heading
    try:
        await heading.click(timeout=TAB_CLICK_TIMEOUT, force=True)
        return True
    except Exception:
        pass
    # Strategy 4: click by bounding box center
    try:
        box = await heading.bounding_box()
        if box:
            await page.mouse.click(box['x'] + box['width'] / 2, box['y'] + box['height'] / 2)
            return True
    except Exception:
        pass
    return False


async def _website_from_card(page, heading):
    """Best-effort URL lookup on the card tile itself (no modal needed)."""
    try:
        el = await heading.element_handle(timeout=CARD_HEADING_TIMEOUT)
        if not el:
            return None
        urls = await page.evaluate(_CARD_URLS_JS, el)
    except Exception:
        return None
    return _pick_website(urls)


async def _extract_modal_website(page, modal):
    """Return the website URL found in *modal*, or None.

    Tries, in order: the href of the "Visit Website" control, any absolute
    link in the modal, URLs in data-href/onclick/text, the URL of a popup
    opened by clicking the control, and finally a same-tab navigation.
    """
    # Try common variants of 'Visit Website' labelling
    btn = None
    for cand in (
        modal.get_by_role('link', name=_WEBSITE_LABEL_RE),
        modal.get_by_role('button', name=_WEBSITE_LABEL_RE),
        modal.get_by_text(_WEBSITE_LABEL_RE),
    ):
        try:
            if await cand.count() > 0:
                btn = cand.first
                break
        except Exception:
            continue

    # 1) Try attribute href on the control if it's a link
    if btn is not None:
        try:
            href = await btn.get_attribute('href')
            if href:
                return href
        except Exception:
            pass

    # 2) If there is any external-looking link in the modal, use it
    try:
        links = modal.locator('a[href^="http"]')
        urls = []
        for i in range(min(5, await links.count())):
            href = await links.nth(i).get_attribute('href')
            if href:
                urls.append(href)
        website = _pick_website(urls)
        if website is not None:
            return website
    except Exception:
        pass

    # 3) Look into data-* and onclick within modal, and text URLs
    try:
        website = _pick_website(await modal.evaluate(_MODAL_URLS_JS))
        if website is not None:
            return website
    except Exception:
        pass

    if btn is None:
        return None

    # 4) Click the control and catch the popup URL
    popup_url = None
    try:
        async with page.expect_popup(timeout=1500) as pop_info:
            await btn.click()
        popup = await pop_info.value
        popup_url = popup.url
        await popup.close()
    except Exception:
        pass
    if popup_url is not None:
        return popup_url

    # 5) As a last resort, attempt navigation capture and go back
    website = None
    try:
        prev_url = page.url
        async with page.expect_navigation(timeout=1500):
            await btn.click()
        website = page.url if page.url != prev_url else None
        await page.go_back(timeout=PAGE_TIMEOUT)
    except Exception:
        pass
    return website


async def _close_modal(page, modal):
    """Dismiss the modal: close button, then Escape, then a click in the page corner."""
    for candidate in (
        modal.get_by_role('button', name=_CLOSE_LABEL_RE),
        page.get_by_role('button', name=_CLOSE_LABEL_RE),
    ):
        try:
            await candidate.first.click(timeout=800)
            return
        except Exception:
            continue
    try:
        await page.keyboard.press('Escape')
    except Exception:
        try:
            await page.mouse.click(10, 10)
        except Exception:
            pass


async def _website_from_modal(page):
    """Wait for a modal and read the website from it.

    Returns a ``(modal_appeared, website)`` pair; ``website`` is None when
    nothing was found. The modal is closed (best-effort) before returning.
    """
    try:
        await page.wait_for_selector(_MODAL_SELECTOR, timeout=MODAL_TIMEOUT)
    except Exception:
        # modal did not appear in time
        return False, None
    modal = page.locator(_MODAL_SELECTOR).last
    website = None
    try:
        website = await _extract_modal_website(page, modal)
    except Exception:
        pass
    # Always try to close, so a failed extraction does not leave an overlay
    # covering the next card.
    await _close_modal(page, modal)
    return True, website


async def _card_website(page, name):
    """Return the website for the card headed *name*, or None if not found."""
    # Names were collected from visible headings, so look the card up among
    # visible headings too; otherwise a hidden heading could be matched first.
    heading = page.locator('h3:visible').filter(has_text=name).first
    modal_seen = False
    website = None
    if await _open_card(page, heading):
        modal_seen, website = await _website_from_modal(page)
    # Fall back to the card tile whenever no modal showed up, whether the
    # click itself failed or it succeeded without opening anything.
    if not modal_seen and website is None:
        website = await _website_from_card(page, heading)
    return website


async def scrape_ntpns_async():
    """Scrape member names and websites for every entry in CATEGORIES.

    Opens BASE_URL in headless Chromium, selects each category in turn,
    takes the text of every visible ``h3`` as a member name and tries to
    resolve each member's website from its modal or, failing that, from
    the card tile.

    Returns:
        A list of dicts with keys ``name``, ``website`` (empty string when
        no URL was found) and ``category``. A row is recorded for every
        name even when no website could be determined.

    Individual lookup steps are best-effort and swallow their own errors;
    a failure to launch the browser or load the page propagates.
    """
    rows = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            page.set_default_timeout(PAGE_TIMEOUT)
            print('Opening page…')
            await page.goto(BASE_URL, wait_until='load')
            try:
                await page.wait_for_load_state('networkidle', timeout=3000)
            except Exception:
                pass
            # Accept cookies if present
            try:
                await page.get_by_role('button', name=re.compile('accept|agree', re.I)).click(timeout=2000)
            except Exception:
                pass
            for cat in CATEGORIES:
                print(f'Category: {cat} …', end=' ')
                if not await _select_category(page, cat):
                    print('[tab not found]')
                    continue
                # Give the tab content a moment to render, then collect
                # unique card names from visible H3 headings only.
                await page.wait_for_timeout(500)
                raw_names = await page.locator('h3:visible').all_text_contents()
                cleaned = [_norm_text(n) for n in raw_names]
                names = _unique_preserve([n for n in cleaned if n])
                print(f'{len(names)} cards')
                for name in names:
                    website = await _card_website(page, name)
                    # ALWAYS record the row even if modal/website not found
                    rows.append({'name': name, 'website': website or '', 'category': cat})
        finally:
            # Release Chromium even when scraping raised part-way through.
            await browser.close()
    return rows

# Run the scraper to completion on the current event loop. No overall deadline
# is set here; total runtime is bounded only by the small per-step timeouts
# used inside scrape_ntpns_async().
rows = asyncio.get_event_loop().run_until_complete(scrape_ntpns_async())
len(rows)  # last expression of the cell: shows the number of scraped rows

Opening page…
Category: Startups … Category: Startups … 33 cards
33 cards
Category: R&D Companies … Category: R&D Companies … 25 cards
25 cards
Category: Research Institutes … Category: Research Institutes … 1 cards
1 cards
Category: Virtual Members … Category: Virtual Members … 0 cards
Category: Innovation Partners … 0 cards
Category: Innovation Partners … 0 cards
0 cards


59

In [4]:
# Clean and save
def clean_rows(rows):
    """Normalise scraped rows and drop blanks and duplicates.

    For each input dict the name is trimmed with whitespace runs collapsed,
    the website is trimmed, and a missing category becomes ''. Rows with an
    empty name are skipped. Two rows are duplicates when their lower-cased
    name, lower-cased website and category all match; the first one wins.

    Returns a new list of ``{'name', 'website', 'category'}`` dicts in
    first-seen order.
    """
    unique = {}
    for record in rows:
        raw_name = record.get('name') or ''
        label = re.sub(r'\s+', ' ', raw_name.strip())
        if not label:
            continue
        url = (record.get('website') or '').strip()
        group = record.get('category') or ''
        # setdefault keeps the first row stored under each dedup key, and
        # dicts preserve insertion order.
        unique.setdefault(
            (label.lower(), url.lower(), group),
            {'name': label, 'website': url, 'category': group},
        )
    return list(unique.values())

clean = clean_rows(rows)
# Fix the column set so that an empty scrape still produces a CSV with a
# header row instead of a headerless file.
df = pd.DataFrame(clean, columns=['name', 'website', 'category'])
output_path = Path('/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/ntpns_members.csv')
# Create the target folder if needed so to_csv does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
# Count rows whose website column is a non-empty string.
non_empty = (df['website'].str.len() > 0).sum() if not df.empty else 0
print(f'Saved {len(df)} rows to {output_path}')
print(f'Websites found: {non_empty} / {len(df)}')

Saved 59 rows to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/ntpns_members.csv
Websites found: 0 / 59
