# Digital Serbia Initiative — Members and Friends scraper

This notebook scrapes the members and friends listed on:

- https://www.dsi.rs/en/members/

What it extracts:
- Group: "member" or "friend"
- Name: best-effort from the logo's alt/title, then the link's aria-label/title, then the tile's visible text; falls back to the domain
- Website: external URL linked from each tile
- Domain: normalized domain of the website
- Source: the page URL

How it works:
- First tries a fast static parse using requests + BeautifulSoup
- If the static parse returns 0 rows, run the optional Playwright fallback cell, which renders the page in a headless browser and extracts links from the interactive tiles

Outputs:
- digital_serbia_members_friends.csv (saved next to this notebook)


In [1]:
# Imports and helpers
import re
import time
from typing import List, Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup, Tag
import pandas as pd

# Page that lists both the "Our members" and "Friends" grids.
SOURCE_URL = "https://www.dsi.rs/en/members/"

# Single session shared by fetch_html and the REST-probing cells, so every
# request is sent with the same browser-like headers.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118 Safari/537.36"
SESSION.headers["Accept-Language"] = "en-US,en;q=0.9"


def fetch_html(url: str, retries: int = 3, backoff: float = 1.5, timeout: int = 20) -> str:
    """Download ``url`` with the shared session and return the body as text.

    Args:
        url: URL to fetch.
        retries: Total number of attempts. Values below 1 are treated as 1 so
            the function always tries at least once.
        backoff: Base of the exponential pause between attempts; after failed
            attempt ``i`` (0-based) the function sleeps ``backoff ** i`` seconds.
        timeout: Per-request timeout in seconds, passed to ``SESSION.get``.

    Returns:
        The response text of the first successful (non-error status) attempt.

    Raises:
        requests.RequestException: The error raised by the final attempt
            (connection failure, timeout, or HTTP error status).
    """
    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            resp = SESSION.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            # Re-raise straight away on the last attempt: sleeping first would
            # only delay the failure without another try following it.
            if attempt == attempts - 1:
                raise
            time.sleep(backoff ** attempt)


def get_soup(url: str) -> BeautifulSoup:
    """Fetch ``url`` via ``fetch_html`` and parse it with the ``lxml`` parser."""
    return BeautifulSoup(fetch_html(url), "lxml")


def canonicalize_url(url: Optional[str], base: str) -> Optional[str]:
    """Resolve ``url`` against ``base`` and return an absolute http(s) URL.

    Returns ``None`` for missing or blank input and for any scheme other than
    http/https (e.g. ``mailto:``). A scheme-less result, which can only happen
    when ``base`` itself has no scheme, is assumed to be https. The fragment
    part is always removed.
    """
    candidate = (url or "").strip()
    if not candidate:
        return None

    absolute = urljoin(base, candidate)
    parts = urlparse(absolute)
    if parts.scheme == "":
        parts = urlparse(f"https://{absolute}")

    if parts.scheme in ("http", "https"):
        # Drop "#..." so the same page is not recorded under several URLs.
        return parts._replace(fragment="").geturl()
    return None


def extract_domain(url: Optional[str]) -> Optional[str]:
    """Return the lower-cased host of ``url`` without a leading ``www.``.

    The host is taken from ``hostname`` rather than ``netloc`` so that
    credentials (``user:pw@``) and ports (``:8080``) never end up in the
    domain. This matches the ``domain()`` helper in the JavaScript extractor,
    which also uses the URL's hostname.

    Returns ``None`` when ``url`` is empty, has no host, or cannot be parsed.
    """
    if not url:
        return None
    try:
        host = urlparse(url).hostname
        if not host:
            return None
        host = host.lower()
        if host.startswith("www."):
            host = host[4:]
        return host or None
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs and non-string inputs are treated as "no domain".
        return None


def looks_like_external(href: Optional[str]) -> bool:
    """Return True when ``href`` could point at a web page worth resolving.

    Only links that can never be a website are rejected: missing or blank
    values, on-page anchors (``#...``) and ``mailto:``, ``tel:`` or
    ``javascript:`` pseudo-links. The scheme check ignores case. Everything
    else, including relative paths, is accepted; the caller is expected to
    canonicalize the value afterwards.
    """
    if not href:
        return False
    candidate = href.strip().lower()
    if not candidate:
        return False
    non_web_prefixes = ("#", "mailto:", "tel:", "javascript:")
    return not candidate.startswith(non_web_prefixes)


def text_clean(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


def tile_name_from_element(el: Tag) -> Optional[str]:
    """Derive a display name for a tile element, or ``None`` if nothing usable.

    Sources are tried in this order:
      1. ``alt`` then ``title`` of the first ``<img>`` inside ``el``;
      2. ``aria-label`` then ``title`` of each ``<a>`` inside ``el``;
      3. the whitespace-normalized visible text of ``el``.
    """
    def first_label(node, attrs):
        # First attribute whose cleaned value is non-empty, else None.
        for attr in attrs:
            raw = node.get(attr)
            if raw:
                cleaned = text_clean(raw)
                if cleaned:
                    return cleaned
        return None

    logo = el.find("img")
    if logo:
        label = first_label(logo, ("alt", "title"))
        if label:
            return label

    for anchor in el.find_all("a"):
        label = first_label(anchor, ("aria-label", "title"))
        if label:
            return label

    return text_clean(el.get_text(" ")) or None


def find_section_by_heading(soup: BeautifulSoup, title: str) -> Optional[Tag]:
    """Find the container that most likely holds the grid under a heading.

    The heading is matched case-insensitively against ``title``: an exact
    match on the cleaned heading text wins, otherwise the first heading whose
    text contains ``title`` is used. Returns ``None`` if no heading matches.

    Starting from the heading's nearest ``<section>``/``<div>`` ancestor (or
    its parent), up to six candidates are probed by walking next siblings. The
    first one holding at least five links or five images is returned; if none
    qualifies, the starting container is returned.
    """
    wanted = title.lower()
    headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    normalized = [(h, text_clean(h.get_text()).lower()) for h in headings]

    heading = next((h for h, text in normalized if text == wanted), None)
    if not heading:
        # No exact match: accept the first heading that merely contains the title.
        heading = next((h for h, text in normalized if wanted in text), None)
    if not heading:
        return None

    container = heading.find_parent(["section", "div"]) or heading.parent
    probe = container
    for _ in range(6):
        if not probe:
            break
        # A grid is assumed to be any element with many links or many images.
        if len(probe.find_all("a")) >= 5 or len(probe.find_all("img")) >= 5:
            return probe
        probe = probe.find_next_sibling(["section", "div"]) or probe.find_next_sibling()
    return container


def scrape_grid(soup: BeautifulSoup, base_url: str, group: str, section_title: str) -> List[Dict]:
    """Collect one row per distinct linked domain in the section under a heading.

    Args:
        soup: Parsed page.
        base_url: Page URL; used to resolve relative links and recorded as
            the ``source`` of every row.
        group: Label stored in each row (e.g. ``"member"`` or ``"friend"``).
        section_title: Heading text that identifies the section to scan.

    Returns:
        A list of dicts with keys ``group``, ``name``, ``website``, ``domain``
        and ``source``. Only the first link seen for each domain is kept. The
        list is empty when the section cannot be found.
    """
    sec = find_section_by_heading(soup, section_title)
    if not sec:
        return []

    results: List[Dict] = []
    seen_domains = set()  # group is constant per call, so the domain alone is the key

    for a in sec.find_all("a"):
        href = a.get("href")
        if not looks_like_external(href):
            continue
        url = canonicalize_url(href, base_url)
        if not url:
            continue
        dom = extract_domain(url)
        if not dom or dom in seen_domains:
            continue
        seen_domains.add(dom)

        # Climb at most four levels looking for a card-like ancestor that
        # contains the logo image.
        tile = a
        climb = a
        depth = 0
        while climb and depth < 4 and not climb.find("img"):
            climb = climb.parent
            depth += 1
        # Adopt the ancestor only if it really contains an image. When the
        # depth limit is hit without finding one, naming the tile after a
        # large unrelated container would produce a wrong name, so the anchor
        # itself is used instead.
        if climb and climb.name in ("div", "a", "figure", "li") and climb.find("img"):
            tile = climb
        name = tile_name_from_element(tile) or dom

        results.append({
            "group": group,
            "name": name,
            "website": url,
            "domain": dom,
            "source": base_url,
        })

    return results




In [2]:
# Static scrape: Members + Friends from DSI page
soup = get_soup(SOURCE_URL)

members = scrape_grid(soup, SOURCE_URL, group="member", section_title="Our members")
friends = scrape_grid(soup, SOURCE_URL, group="friend", section_title="Friends")

rows = members + friends

# Build DataFrame and clean
if rows:
    df = pd.DataFrame(rows)
    # Deduplicate by (group, domain); sorting first makes the kept row deterministic
    df = df.sort_values(["group", "name"]).drop_duplicates(subset=["group", "domain"]).reset_index(drop=True)
else:
    df = pd.DataFrame(columns=["group", "name", "website", "domain", "source"])  # empty

print(f"Static scrape results — members: {len(members)}, friends: {len(friends)}, total: {len(df)}")

# Save CSV next to this notebook.
# The notebook's original location is preferred; on any machine where that
# directory does not exist, fall back to the current working directory so the
# save cannot fail on a missing path.
import os
notebook_path = r"/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/extract_digital_serbia.ipynb"
notebook_dir = os.path.dirname(notebook_path)
if not os.path.isdir(notebook_dir):
    notebook_dir = os.getcwd()
output_path = os.path.join(notebook_dir, "digital_serbia_members_friends.csv")

if len(df) > 0:
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} rows to {output_path}")
else:
    print("No rows parsed via static HTML. Consider running the Playwright fallback cell below.")

# Show a quick preview; `display` only exists inside IPython/Jupyter
try:
    display(df.head(20))
except NameError:
    print(df.head(20))


Static scrape results — members: 0, friends: 0, total: 0
No rows parsed via static HTML. Consider running the Playwright fallback cell below.


Unnamed: 0,group,name,website,domain,source


In [22]:
# Optional fallback: Playwright (async, notebook-safe)
# Note: Only run if static scrape returns 0 rows or seems incomplete.
import asyncio  # stdlib, so it stays outside the optional-dependency guard

try:
    import nest_asyncio
    nest_asyncio.apply()
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
    PLAYWRIGHT_IMPORT_ERROR = None
except Exception as e:
    # Any failure (package missing, or nest_asyncio unable to patch the loop)
    # disables the fallback. `async_playwright is None` is the flag the other
    # cells test; the reason is kept so it can be inspected afterwards.
    async_playwright = None
    # Keep the name defined so `except PlaywrightTimeoutError` clauses stay valid.
    PlaywrightTimeoutError = asyncio.TimeoutError
    PLAYWRIGHT_IMPORT_ERROR = repr(e)

async def _accept_cookies(page):
    """Best-effort dismissal of a cookie-consent banner on ``page``.

    Each selector is tried in order. The first one that matches at least one
    element and is clicked successfully ends the search. Selectors that match
    nothing, or whose click fails, are skipped silently.
    """
    consent_selectors = (
        "button:has-text('Accept')",
        "text=Accept",
        "button:has-text('I agree')",
        "text=I agree",
        "button:has-text('Prihvati')",
        "button:has-text('Slažem se')",
        "role=button[name='Accept']",
        "#onetrust-accept-btn-handler",
        ".ot-sdk-container button",
    )
    for selector in consent_selectors:
        try:
            matches = page.locator(selector)
            if not await matches.count():
                continue
            await matches.first.click(timeout=1500)
            # Short pause after the click before the caller continues.
            await page.wait_for_timeout(300)
        except Exception:
            continue
        else:
            break

async def _extract_with_js(page, title: str, group: str):
    """Extract link records for one section by running JavaScript in ``page``.

    The script locates the heading whose cleaned text equals ``title``
    (case-insensitive), or failing that the first heading containing it, and
    scans the heading's closest ``section``/``div`` ancestor. If no heading is
    found, the whole document is scanned.

    Two sources are collected inside that container:
      * ``a[href]`` elements whose href does not start with ``#``,
        ``mailto:`` or ``tel:``;
      * elements with an ``onclick`` attribute containing an http(s) URL.

    Returns the value produced by ``page.evaluate``: a list of objects with
    keys ``group``, ``url``, ``dom`` (hostname without a leading ``www.``) and
    ``name``, de-duplicated on ``group`` + ``dom``.
    """
    # The JS source is a raw string so regex backslashes reach the browser unchanged.
    js = r'''
    (args) => {
      const { title, group } = args;
      function textClean(s){return (s||'').replace(/\s+/g,' ').trim()}
      function findHeading(title){
        const hs = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6'))
        return hs.find(h => textClean(h.textContent).toLowerCase() === title.toLowerCase()) ||
               hs.find(h => textClean(h.textContent).toLowerCase().includes(title.toLowerCase()))
      }
      function sectionContainer(h){
        if(!h) return null
        let node = h.closest('section,div') || h.parentElement
        return node
      }
      function unique(arr, keyFn){
        const seen = new Set();
        const out = [];
        for(const x of arr){
          const k = keyFn(x);
          if(!seen.has(k)){ seen.add(k); out.push(x) }
        }
        return out
      }
      function cleanUrl(u){ try { const url = new URL(u, location.href); url.hash=''; return url.toString() } catch { return null } }
      function domain(u){ try { return new URL(u).hostname.replace(/^www\./,'') } catch { return null } }
      function nameFromTile(el){
        const img = el.querySelector('img');
        if(img){ if(img.alt && img.alt.trim()) return img.alt.trim(); if(img.title && img.title.trim()) return img.title.trim() }
        if(el.title && el.title.trim()) return el.title.trim();
        if(el.getAttribute('aria-label')) return el.getAttribute('aria-label').trim();
        const txt = textClean(el.textContent); return txt || null
      }
      const h = findHeading(title)
      const container = sectionContainer(h) || document

      const anchors = Array.from(container.querySelectorAll('a[href]'))
        .filter(a => !a.getAttribute('href').startsWith('#') && !a.getAttribute('href').startsWith('mailto:') && !a.getAttribute('href').startsWith('tel:'))
        .map(a => ({el:a, url: cleanUrl(a.getAttribute('href'))}))
        .filter(x => x.url)
        .map(x => ({ group, url: x.url, dom: domain(x.url), name: nameFromTile(x.el.closest('a,div,figure,li') || x.el) }))
        .filter(x => x.dom)

      const onclicks = Array.from(container.querySelectorAll('[onclick]'))
        .map(el => ({el, onclick: el.getAttribute('onclick')||''}))
        .filter(x => /https?:\/\//i.test(x.onclick))
        .map(x => {
          const m = x.onclick.match(/https?:\/\/[^'"\)\s]+/i)
          const url = m ? cleanUrl(m[0]) : null
          return { group, url, dom: domain(url), name: nameFromTile(x.el.closest('a,div,figure,li') || x.el) }
        })
        .filter(x => x.url && x.dom)

      const all = anchors.concat(onclicks)
      const final = unique(all, x => group + '|' + (x.dom||'') )
      return final
    }
    '''
    # title and group travel as one argument object, which the script destructures.
    return await page.evaluate(js, {"title": title, "group": group})

async def _hover_extract_urls(page, title: str, group: str):
    """Hover each image-bearing element of a section and harvest its link URLs.

    The section is the nearest ``section``/``div`` ancestor of an h1-h3 whose
    text contains ``title``. Every element in it with a direct ``<img>`` child
    is scrolled into view and hovered, then the ``href`` of each ``a[href]``
    inside it is read. Elements that fail at any step are skipped.

    Returns a list of dicts (``group``, ``website``, ``domain``, ``name``)
    with one entry per (group, domain); ``name`` is set to the domain.
    """
    tiles = page.locator(
        f"xpath=(//h1[contains(., '{title}')] | //h2[contains(., '{title}')] | //h3[contains(., '{title}')])"
        " /ancestor-or-self::*[self::section or self::div][1]//*[img]")
    found = []
    for idx in range(await tiles.count()):
        tile = tiles.nth(idx)
        try:
            await tile.scroll_into_view_if_needed()
            await tile.hover()
            hrefs = await tile.evaluate_all("(nodes) => nodes.map(n => Array.from(n.querySelectorAll('a[href]')).map(a => a.href)).flat()")
            for href in hrefs:
                dom = extract_domain(href)
                if dom:
                    found.append({"group": group, "website": href, "domain": dom, "name": dom})
        except Exception:
            continue
    # Later rows overwrite earlier ones that share the same (group, domain) key.
    by_key = {(row['group'], row['domain']): row for row in found}
    return list(by_key.values())

async def _click_collect_urls(page, title: str, group: str):
    """Click every image in a section and record where each click leads.

    For each ``<img>`` under the nearest ``section``/``div`` ancestor of an
    h1-h3 containing ``title``, the function first waits for a popup opened
    by the click, and if none appears, for a navigation of ``page`` itself.
    After a same-page navigation the page is reloaded from ``SOURCE_URL`` and
    the cookie banner is dismissed again.

    Returns a list of dicts (``group``, ``website``, ``domain``, ``name``)
    keeping the first URL seen per (group, domain); ``name`` is set to the
    domain. Returns an empty list when no matching heading exists.
    """
    collected = []
    heading = page.locator(f"xpath=//h1[contains(., '{title}')] | //h2[contains(., '{title}')] | //h3[contains(., '{title}')] ").first
    if not await heading.count():
        return collected
    images = page.locator(
        f"xpath=(//h1[contains(., '{title}')] | //h2[contains(., '{title}')] | //h3[contains(., '{title}')])"
        " /ancestor-or-self::*[self::section or self::div][1]//img")
    n = await images.count()
    for i in range(n):
        img = images.nth(i)
        try:
            await img.scroll_into_view_if_needed()
            # Move the pointer to the image centre before clicking; failures
            # here are ignored because the click below is forced anyway.
            try:
                box = await img.bounding_box()
                if box:
                    await page.mouse.move(box['x'] + box['width']/2, box['y'] + box['height']/2)
                    await page.wait_for_timeout(100)
            except Exception:
                pass
            # Attempt 1: the click opens the site in a popup.
            try:
                async with page.expect_event("popup", timeout=1500) as pop_wait:
                    await img.click(force=True)
                popup = await pop_wait.value
                await popup.wait_for_load_state("domcontentloaded")
                url = popup.url
                await popup.close()
                collected.append({"group": group, "url": url})
                continue
            except PlaywrightTimeoutError:
                pass
            # Attempt 2: the click navigates the current page (this clicks the
            # image a second time).
            try:
                async with page.expect_navigation(timeout=1500):
                    await img.click(force=True)
                url = page.url
                # Go back to the listing so the remaining images can be processed.
                await page.goto(SOURCE_URL, wait_until="networkidle")
                await _accept_cookies(page)
                collected.append({"group": group, "url": url})
            except PlaywrightTimeoutError:
                pass
        except Exception:
            continue
    # Keep the first URL per (group, domain) and shape rows like the other extractors.
    seen = set(); out=[]
    for x in collected:
        url = x.get("url"); dom = extract_domain(url)
        if not url or not dom: continue
        if (group, dom) in seen: continue
        seen.add((group, dom))
        out.append({"group": group, "website": url, "domain": dom, "name": dom})
    return out

async def scrape_with_playwright(url: str) -> pd.DataFrame:
    """Render ``url`` in headless Chromium and extract members and friends.

    Three strategies are tried in order, each only when the previous one
    produced nothing: in-page JavaScript extraction, hover-based link
    harvesting, and click-through URL collection.

    Args:
        url: Page to render; also stored in the ``source`` column.

    Returns:
        A DataFrame with exactly the columns ``group``, ``name``, ``website``,
        ``domain`` and ``source`` (the same schema as the static scrape),
        sorted by group and name with one row per (group, domain). The frame
        is empty when Playwright is unavailable or nothing was found.
    """
    columns = ["group", "name", "website", "domain", "source"]
    if async_playwright is None:
        print("Playwright not available; install 'playwright' and run 'playwright install' to enable fallback.")
        return pd.DataFrame(columns=columns)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="networkidle")
            await _accept_cookies(page)
            # Best-effort settle time; a failure here should not abort the scrape.
            try:
                await page.wait_for_timeout(1500)
            except Exception:
                pass

            data_members = await _extract_with_js(page, "Our members", "member")
            data_friends = await _extract_with_js(page, "Friends", "friend")
            data = (data_members or []) + (data_friends or [])

            if not data:
                data = await _hover_extract_urls(page, "Our members", "member") + await _hover_extract_urls(page, "Friends", "friend")
            if not data:
                data = await _click_collect_urls(page, "Our members", "member") + await _click_collect_urls(page, "Friends", "friend")
        finally:
            # Close the browser even when navigation or extraction raised.
            await browser.close()

    if not data:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(data)
    # The JS extractor reports "url"/"dom"; the other two report "website"/"domain".
    if "website" not in df.columns and "url" in df.columns:
        df = df.rename(columns={"url": "website"})
    if "domain" not in df.columns:
        df["domain"] = df["website"].map(extract_domain)
    if "name" not in df.columns:
        df["name"] = df["domain"]
    # Missing or empty names fall back to the domain.
    df["name"] = df["name"].fillna(df["domain"]).replace("", pd.NA).fillna(df["domain"])
    df["source"] = url
    df = df.sort_values(["group", "name"]).drop_duplicates(subset=["group", "domain"]).reset_index(drop=True)
    # Drop extractor-specific extras (e.g. "dom") so the CSV schema matches the static scrape.
    return df[columns]

# Helper to run fallback when needed
async def maybe_run_playwright_if_needed(current_df: pd.DataFrame) -> pd.DataFrame:
    """Return ``current_df`` if it has rows, otherwise run the Playwright fallback.

    When the fallback yields rows they are written to
    ``digital_serbia_members_friends.csv``. The notebook's original directory
    is used when it exists; otherwise the file goes to the current working
    directory, so the save does not fail on other machines.

    Returns:
        ``current_df`` unchanged when it is non-empty, else the DataFrame
        produced by ``scrape_with_playwright`` (possibly empty).
    """
    if len(current_df) > 0:
        return current_df
    print("Attempting Playwright fallback…")
    df2 = await scrape_with_playwright(SOURCE_URL)
    if len(df2) > 0:
        import os
        notebook_path = r"/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/extract_digital_serbia.ipynb"
        output_dir = os.path.dirname(notebook_path)
        if not os.path.isdir(output_dir):
            output_dir = os.getcwd()
        output_path = os.path.join(output_dir, "digital_serbia_members_friends.csv")
        df2.to_csv(output_path, index=False)
        print(f"Saved {len(df2)} rows to {output_path} (Playwright fallback)")
    return df2


In [23]:
# If static parse returned 0 rows, you may run this to trigger the fallback automatically
# (This cell is safe to run regardless; it only uses Playwright when needed.)
try:
    loop = asyncio.get_event_loop()
except Exception:
    loop = None

if 'df' in globals():
    if async_playwright is None and len(df) == 0:
        print("Playwright not installed; static scrape returned 0 rows. Install fallback if needed:")
        print("  pip install playwright nest_asyncio")
        print("  python -m playwright install")
    elif len(df) == 0:
        if loop and loop.is_running():
            import nest_asyncio as _na
            _na.apply()
            df = await maybe_run_playwright_if_needed(df)
        else:
            df = loop.run_until_complete(maybe_run_playwright_if_needed(df)) if loop else asyncio.run(maybe_run_playwright_if_needed(df))
else:
    print("No 'df' found from the static scrape cell — please run the previous cell first.")

# Preview (again) if available
if 'df' in globals():
    print(f"Final row count: {len(df)}")
    try:
        display(df.head(20))
    except Exception:
        print(df.head(20))


Attempting Playwright fallback…
Final row count: 0


Unnamed: 0,group,name,website,domain,source


In [12]:
# Debug: Inspect HTML structure with Playwright (iframes too)
from bs4 import BeautifulSoup

async def debug_dump():
    """Render SOURCE_URL headlessly and print a summary of its structure.

    Prints up to ten heading texts that mention "member" or "friend", the
    number of ``<iframe>`` elements, and the ``src`` of the first five
    iframes. Prints a notice and returns when Playwright is unavailable.
    """
    if async_playwright is None:
        print("Playwright not available.")
        return
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(SOURCE_URL, wait_until="networkidle")
            await page.wait_for_timeout(1000)
            html = await page.content()
            soup = BeautifulSoup(html, "lxml")
            headings = [h.get_text(strip=True) for h in soup.select('h1,h2,h3,h4,h5,h6')]
            print("Headings found:", [h for h in headings if 'member' in h.lower() or 'friend' in h.lower()][:10])
            # Iframes
            iframes = soup.select('iframe')
            print("Iframes found:", len(iframes))
            for i, fr in enumerate(iframes[:5], 1):
                print(f"  iframe {i} src=", fr.get('src'))
        finally:
            # Close the browser even if navigation or parsing raised.
            await browser.close()

try:
    if loop and loop.is_running():
        await debug_dump()
    else:
        asyncio.run(debug_dump())
except Exception as e:
    print("Debug failed:", e)


Headings found: ['Our members', 'Friends']
Iframes found: 0


In [17]:
# Alternate approach: probe WordPress REST API for members data
import json

# Candidate WordPress REST endpoints, probed in order.
endpoints_to_try = [
    "https://www.dsi.rs/wp-json",
    "https://www.dsi.rs/wp-json/wp/v2/pages?search=members",
    "https://www.dsi.rs/en/wp-json/wp/v2/pages?search=members",
    "https://www.dsi.rs/wp-json/wp/v2/pages?per_page=5",
]
responses = {}  # endpoint URL -> truncated pretty-printed JSON sample
for url in endpoints_to_try:
    try:
        r = SESSION.get(url, timeout=20)
        content_type = r.headers.get('content-type')
        print(url, r.status_code, content_type)
        if not r.ok or 'application/json' not in (content_type or ''):
            continue
        data = r.json()
        # Keep it small for display: first list element only, capped at 1000 characters
        preview = data[:1] if isinstance(data, list) else data
        snippet = json.dumps(preview, indent=2)[:1000]
        responses[url] = snippet
        print("  sample:", snippet[:300], '...')
    except Exception as e:
        print(url, 'error', e)

responses


https://www.dsi.rs/wp-json 200 application/json; charset=UTF-8
  sample: {
  "name": "Inicijativa \u201eDigitalna Srbija\u201c",
  "description": "",
  "url": "https://www.dsi.rs",
  "home": "https://www.dsi.rs",
  "gmt_offset": 1,
  "timezone_string": "Europe/Belgrade",
  "namespaces": [
    "oembed/1.0",
    "akismet/v1",
    "contact-form-7/v1",
    "disqus/v1",
    " ...
https://www.dsi.rs/wp-json/wp/v2/pages?search=members 200 application/json; charset=UTF-8
  sample: [] ...
https://www.dsi.rs/en/wp-json/wp/v2/pages?search=members 200 application/json; charset=UTF-8
  sample: [
  {
    "id": 9044,
    "date": "2019-12-20T10:27:59",
    "date_gmt": "2019-12-20T09:27:59",
    "guid": {
      "rendered": "https://www.dsi.rs/politika-privatnosti-inicijative-digitalna-srbija/"
    },
    "modified": "2025-10-08T11:03:52",
    "modified_gmt": "2025-10-08T09:03:52",
    "slug": ...
https://www.dsi.rs/wp-json/wp/v2/pages?per_page=5 200 application/json; charset=UTF-8
  sample: [
  {
    "

{'https://www.dsi.rs/wp-json': '{\n  "name": "Inicijativa \\u201eDigitalna Srbija\\u201c",\n  "description": "",\n  "url": "https://www.dsi.rs",\n  "home": "https://www.dsi.rs",\n  "gmt_offset": 1,\n  "timezone_string": "Europe/Belgrade",\n  "namespaces": [\n    "oembed/1.0",\n    "akismet/v1",\n    "contact-form-7/v1",\n    "disqus/v1",\n    "mailin/v1",\n    "wordfence/v1",\n    "yoast/v1",\n    "wpml/tm/v1",\n    "dsi",\n    "regenerate-thumbnails/v1",\n    "wp/v2"\n  ],\n  "authentication": [],\n  "routes": {\n    "/": {\n      "namespace": "",\n      "methods": [\n        "GET"\n      ],\n      "endpoints": [\n        {\n          "methods": [\n            "GET"\n          ],\n          "args": {\n            "context": {\n              "required": false,\n              "default": "view"\n            }\n          }\n        }\n      ],\n      "_links": {\n        "self": "https://www.dsi.rs/wp-json/"\n      }\n    },\n    "/oembed/1.0": {\n      "namespace": "oembed/1.0",\n      "

In [18]:
# Explore custom REST namespace `dsi`
# Fetch the index of the custom `dsi` namespace and show the first 800 characters.
try:
    r = SESSION.get("https://www.dsi.rs/wp-json/dsi", timeout=20)
    print("/wp-json/dsi", r.status_code)
    body_preview = (r.text or "")[:800]
    print(body_preview)
except Exception as e:
    print("dsi namespace fetch error:", e)


/wp-json/dsi 200
{"namespace":"dsi","routes":{"\/dsi":{"namespace":"dsi","methods":["GET"],"endpoints":[{"methods":["GET"],"args":{"namespace":{"required":false,"default":"dsi"},"context":{"required":false,"default":"view"}}}],"_links":{"self":"https:\/\/www.dsi.rs\/wp-json\/dsi"}},"\/dsi\/random-posts":{"namespace":"dsi","methods":["GET"],"endpoints":[{"methods":["GET"],"args":[]}],"_links":{"self":"https:\/\/www.dsi.rs\/wp-json\/dsi\/random-posts"}}},"_links":{"up":[{"href":"https:\/\/www.dsi.rs\/wp-json\/"}]}}


In [19]:
# Debug: capture network requests on page load to discover data endpoints
async def debug_network():
    """Load SOURCE_URL headlessly and list responses that look like data calls.

    Every response's URL and resource type is recorded during page load. The
    first 15 whose URL contains one of the substrings ``admin-ajax.php``,
    ``wp-json``, ``api``, ``ajax`` or ``json`` are printed. Prints a notice
    and returns when Playwright is unavailable.
    """
    if async_playwright is None:
        print("Playwright not available.")
        return
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            events = []
            page.on("response", lambda resp: events.append((resp.url, resp.request.resource_type)))
            await page.goto(SOURCE_URL, wait_until="networkidle")
            await _accept_cookies(page)
            await page.wait_for_timeout(1200)
            # Plain substring filter, so unrelated URLs that merely contain
            # "api" or "ajax" also match.
            interesting = [
                (u, t) for (u, t) in events
                if any(k in u for k in ["admin-ajax.php", "wp-json", "api", "ajax", "json"])
            ]
            print("Interesting network calls (first 15):")
            for u, t in interesting[:15]:
                print("  ", t, u)
        finally:
            # Close the browser even if navigation raised.
            await browser.close()

try:
    if loop and loop.is_running():
        await debug_network()
    else:
        asyncio.run(debug_network())
except Exception as e:
    print("Network debug failed:", e)


Interesting network calls (first 15):
   stylesheet https://fonts.googleapis.com/css?family=Open+Sans%3A400%2C600%2C700%7CRoboto%3A400%2C500%2C700&ver=1.6
   script https://www.dsi.rs/wp-content/plugins/wdv-mailchimp-ajax//assets/js/wdv_mailchimp_ajaxcall.js?ver=1.0
   script https://www.dsi.rs/wp-content/plugins/mc4wp-premium/ajax-forms/assets/js/ajax-forms.min.js?ver=4.5.13
   script https://www.dsi.rs/wp-content/plugins/mailchimp-for-wp/assets/js/forms-api.min.js?ver=4.5.3
   script https://ajax.googleapis.com/ajax/libs/webfont/1.5.3/webfont.js
   stylesheet https://fonts.googleapis.com/css?family=Exo+2:500,400,700%7COpen+Sans:400%7CExo+2:500italic,600italic,700italic
   xhr https://www.dsi.rs/wp-admin/admin-ajax.php
   stylesheet https://fonts.googleapis.com/css?family=Open+Sans%3A400%2C600%2C700%7CRoboto%3A400%2C500%2C700&ver=1.6
   script https://www.dsi.rs/wp-content/plugins/wdv-mailchimp-ajax//assets/js/wdv_mailchimp_ajaxcall.js?ver=1.0
   script https://www.dsi.rs/wp-content/p

In [21]:
# Deep network inspect: capture admin-ajax payload and response
async def inspect_admin_ajax():
    """Capture admin-ajax.php traffic on page load and print request details.

    For up to three captured responses whose URL contains ``admin-ajax.php``,
    prints the URL, HTTP method and the first 500 characters of the POST
    body. POST requests with a body are then re-sent through ``SESSION`` and
    the status plus the first 500 characters of the reply are printed.

    NOTE: the re-send replays the captured form fields against the live site.
    Prints a notice and returns when Playwright is unavailable.
    """
    import urllib.parse as _up

    if async_playwright is None:
        print("Playwright not available.")
        return
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            captured = []
            page.on("response", lambda resp: captured.append(resp))
            await page.goto(SOURCE_URL, wait_until="networkidle")
            await _accept_cookies(page)
            await page.wait_for_timeout(1500)
            targets = [r for r in captured if 'admin-ajax.php' in r.url]
            print("admin-ajax responses:", len(targets))
            for resp in targets[:3]:
                try:
                    req = resp.request
                    body = req.post_data or ''
                    print("URL:", resp.url)
                    print("Method:", req.method)
                    print("POST data:", (body[:500] if body else '(none)'))
                    # Try to get text via fetch if method+params identifiable
                    if req.method == 'POST' and body:
                        # dict() keeps only the last value of any repeated form key.
                        parsed = dict(_up.parse_qsl(body))
                        r2 = SESSION.post(resp.url, data=parsed, timeout=20)
                        print("Re-fetched status:", r2.status_code)
                        print("Re-fetched snippet:", (r2.text[:500] if r2.text else ''))
                except Exception as e:
                    print("inspect error:", e)
        finally:
            # Close the browser even if navigation raised.
            await browser.close()

try:
    if loop and loop.is_running():
        await inspect_admin_ajax()
    else:
        asyncio.run(inspect_admin_ajax())
except Exception as e:
    print("Admin-ajax inspect failed:", e)


admin-ajax responses: 1
URL: https://www.dsi.rs/wp-admin/admin-ajax.php
Method: POST
POST data: action=save_consent_log&extras=%7B%22strict%22%3A%221%22%2C%22thirdparty%22%3A%221%22%2C%22advanced%22%3A%221%22%2C%22version%22%3A%221%22%7D
Re-fetched status: 200
Re-fetched snippet: 


In [24]:
# Debug: scan attributes for URL-like values within Members/Friends tiles
async def debug_scan_attributes():
    """Print URL-like attribute values found on image-bearing elements.

    For each of the "Our members" and "Friends" sections, in-page JavaScript
    inspects up to 200 elements that contain an ``<img>`` and reports every
    attribute whose value contains ``http://`` or ``https://``. The match
    count and the first five matches are printed per section. Prints a notice
    and returns when Playwright is unavailable.
    """
    if async_playwright is None:
        print("Playwright not available.")
        return
    async with async_playwright() as p:
        # Raw string so the regex backslashes reach the browser unchanged.
        js = r'''
        (title) => {
          function textClean(s){return (s||'').replace(/\s+/g,' ').trim()}
          function findHeading(title){
            const hs = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6'))
            return hs.find(h => textClean(h.textContent).toLowerCase() === title.toLowerCase()) ||
                   hs.find(h => textClean(h.textContent).toLowerCase().includes(title.toLowerCase()))
          }
          const h = findHeading(title)
          const container = h ? (h.closest('section,div') || h.parentElement) : document
          const tiles = Array.from(container.querySelectorAll('*')).filter(el => el.querySelector && el.querySelector('img'))
          const out = []
          const urlRe = /https?:\/\//i
          for(const el of tiles.slice(0,200)){
            const attrs = el.getAttributeNames ? el.getAttributeNames() : []
            for(const name of attrs){
              const val = el.getAttribute(name)
              if(typeof val === 'string' && urlRe.test(val)){
                out.push({tag: el.tagName.toLowerCase(), attr: name, val})
              }
            }
          }
          return out
        }
        '''
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(SOURCE_URL, wait_until="networkidle")
            await _accept_cookies(page)
            await page.wait_for_timeout(1200)
            for section in ["Our members", "Friends"]:
                data = await page.evaluate(js, section)
                print(section, "tiles with URL-like attributes:", len(data))
                for row in data[:5]:
                    print("  ", row)
        finally:
            # Close the browser even if navigation or evaluation raised.
            await browser.close()

try:
    if loop and loop.is_running():
        await debug_scan_attributes()
    else:
        asyncio.run(debug_scan_attributes())
except Exception as e:
    print("Attribute scan failed:", e)


Our members tiles with URL-like attributes: 0
Friends tiles with URL-like attributes: 0


In [25]:
# Discover WP post types to find a custom type for members/friends
# List the registered WordPress post types and flag any whose key hints at a
# members/partners directory.
try:
    r = SESSION.get("https://www.dsi.rs/wp-json/wp/v2/types", timeout=20)
    print("types status:", r.status_code)
    data = r.json()
    known_types = list(data.keys())
    print("Known types:", known_types[:20])
    # Look for anything with 'member' or similar
    keywords = ['member','partner','friend','company','logo']
    matches = {}
    for k, v in data.items():
        if any(x in k for x in keywords):
            matches[k] = v
    print("Candidate custom types:", list(matches.keys()))
except Exception as e:
    print("types fetch error:", e)


types status: 200
Known types: ['post', 'page', 'attachment']
Candidate custom types: []


In [26]:
# Robust spatial/hover-based extractor for DSI Members & Friends
# Strategy: filter anchors by vertical ranges between headings, infer names from image alts/labels,
# and save CSV. Falls back to providing a headful hint if nothing is found in headless mode.

import asyncio
import pandas as pd
from urllib.parse import urlparse

async def scrape_dsi_members_friends_spatial(headless: bool = True) -> dict:
    """Collect anchors within the vertical ranges of 'Our members' and 'Friends'.

    Loads SOURCE_URL in Chromium via Playwright, tries to dismiss a cookie
    banner, waits (best effort) for the network to go idle, then runs a
    JavaScript snippet in the page that:

    - finds the first h1/h2/h3 whose lower-cased text contains 'our members'
      and the first one containing 'friends';
    - treats the span from the members heading down to the friends heading as
      the members range, and from the friends heading down to the bottom of
      the document as the friends range;
    - collects every ``a[href]`` (skipping ``mailto:`` and ``tel:`` links)
      whose vertical midpoint falls inside the respective range.

    If no 'friends' heading is found, the friends list comes back empty and
    the members range extends to the bottom of the document; if no
    'our members' heading is found, the members range starts at the top.

    The name of each link is inferred in the page from, in order: an
    ``img[alt]`` inside the anchor, an ``img[alt]`` under the anchor's parent
    or grandparent, then the anchor's ``aria-label``, ``title`` or text
    content (``null``/None when all are empty).

    Args:
        headless: Passed to ``chromium.launch``; set to False to run with a
            visible browser window.

    Returns:
        The value produced by the in-page script: a dict with lists under
        keys 'mem' and 'fri'. Each list item carries the keys ``group``,
        ``name``, ``website``, ``y``, ``w`` and ``h``.

    Raises:
        RuntimeError: If Playwright cannot be imported in the current kernel.
    """
    # Imported lazily so a missing Playwright install surfaces as a clear,
    # catchable RuntimeError instead of breaking the whole cell.
    try:
        from playwright.async_api import async_playwright
    except Exception as e:
        raise RuntimeError("Playwright is required for this cell. Please install it in the current kernel.") from e

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, args=["--no-sandbox", "--disable-dev-shm-usage"])  # headful optional
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(SOURCE_URL, wait_until="domcontentloaded", timeout=60000)

        # Dismiss cookie banner if present: try each selector in turn and stop
        # at the first one that can be clicked; failures are ignored on purpose.
        for selector in [
            "button:has-text('Accept')",
            "#cn-accept-cookie",
            "button[aria-label='Accept']",
            "//button[contains(., 'Accept')]",
        ]:
            try:
                await page.locator(selector).first.click(timeout=2000)
                break
            except Exception:
                pass

        # Let lazy assets settle; a timeout here is not fatal, extraction
        # simply proceeds with whatever has loaded so far.
        try:
            await page.wait_for_load_state("networkidle", timeout=15000)
        except Exception:
            pass

        # Everything below runs inside the page. Coordinates are converted to
        # document space (viewport rect + window.scrollY) so anchors can be
        # compared against the heading positions.
        data = await page.evaluate(
            """
            () => {
                const y = (el) => el.getBoundingClientRect().top + window.scrollY;
                const headings = Array.from(document.querySelectorAll('h1, h2, h3'));
                const findH = (txt) => headings.find(h => (h.textContent || '').trim().toLowerCase().includes(txt));

                const hMembers = findH('our members');
                const hFriends = findH('friends');

                const topMembers = hMembers ? y(hMembers) : 0;
                const topFriends = hFriends ? y(hFriends) : Number.POSITIVE_INFINITY;
                const bottomMembers = hFriends ? y(hFriends) : document.body.scrollHeight;
                const bottomFriends = document.body.scrollHeight;

                const all = Array.from(document.querySelectorAll('a[href]'));

                const nameFor = (a) => {
                    const altIn = a.querySelector('img[alt]');
                    if (altIn && altIn.alt && altIn.alt.trim()) return altIn.alt.trim();
                    let node = a;
                    for (let i = 0; i < 3 && node; i++) {
                        const img = node.querySelector && node.querySelector('img[alt]');
                        if (img && img.alt && img.alt.trim()) return img.alt.trim();
                        node = node.parentElement;
                    }
                    const label = a.getAttribute('aria-label') || a.title || (a.textContent || '').trim();
                    return label || null;
                };

                const collect = (rangeTop, rangeBottom, label) => {
                    const rows = [];
                    for (const a of all) {
                        const href = a.href;
                        if (!href) continue;
                        if (href.startsWith('mailto:') || href.startsWith('tel:')) continue;
                        const rect = a.getBoundingClientRect();
                        const ymid = (rect.top + rect.bottom) / 2 + window.scrollY;
                        if (isNaN(ymid)) continue;
                        if (ymid < rangeTop || ymid > rangeBottom) continue;
                        rows.push({ group: label, name: nameFor(a), website: href, y: ymid, w: rect.width, h: rect.height });
                    }
                    return rows;
                };

                const mem = collect(topMembers, bottomMembers, 'members');
                const fri = collect(topFriends, bottomFriends, 'friends');
                return { mem, fri };
            }
            """
        )

        await browser.close()
        return data

# Kick off the spatial extraction headlessly. A RuntimeError (for example the
# "Playwright is required" error raised by the scraper) is printed and the
# cell continues with empty result lists.
try:
    res = asyncio.get_event_loop().run_until_complete(
        scrape_dsi_members_friends_spatial(headless=True)
    )
except RuntimeError as err:
    print(str(err))
    res = dict(mem=[], fri=[])

members = res.get("mem", [])
friends = res.get("fri", [])

# Utility helpers

def extract_domain(u: str) -> str:
    """Return the normalized host of URL *u*, or "" when it has none.

    The host is lower-cased and a single leading ``www.`` is removed. The
    parsed ``hostname`` is used rather than ``netloc`` so that a port
    (``example.com:8443``) or credentials (``user:pw@example.com``) never end
    up in the result, which is used as a dedupe key.

    Args:
        u: The URL to inspect.

    Returns:
        The bare host name, or an empty string if *u* is malformed, has no
        host (relative URL, ``mailto:`` link, empty string) or is not a str.
    """
    try:
        host = urlparse(u).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs (e.g. unbalanced IPv6 brackets) or non-string input.
        return ""
    if not isinstance(host, str) or not host:
        return ""
    host = host.lower()
    return host[4:] if host.startswith("www.") else host


def is_external(u: str, site_domain: str = "dsi.rs") -> bool:
    """Return True when *u* is an absolute URL hosted outside *site_domain*.

    A URL counts as internal when its host equals *site_domain* or is a
    subdomain of it (``www.dsi.rs``). The comparison is done on the host
    name itself rather than by substring, so unrelated hosts that merely
    contain the text (``notdsi.rs``, ``dsi.rs.example.com``) are treated as
    external.

    Args:
        u: The URL to classify.
        site_domain: The domain regarded as "this site"; defaults to
            ``"dsi.rs"``.

    Returns:
        True for a URL with a host outside *site_domain*; False for internal
        URLs, URLs without a host (relative links, empty strings) and
        anything that cannot be parsed. Always a real ``bool``.
    """
    try:
        host = urlparse(u).hostname
        if not host:
            return False
        # Ignore a trailing root dot and letter case when comparing hosts.
        host = host.rstrip(".").lower()
        site = site_domain.rstrip(".").lower()
        return not (host == site or host.endswith("." + site))
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs or non-string input are never considered external.
        return False

# Flatten both groups into uniform records, keeping only links that point
# away from the DSI site itself.
rows = []
for group_name, group_records in (("members", members), ("friends", friends)):
    for rec in (group_records or []):
        href = rec.get("website")
        if not href or not is_external(href):
            continue
        rows.append({
            "group": group_name,
            "name": (rec.get("name") or "").strip(),
            "website": href,
            "domain": extract_domain(href),
            "source": SOURCE_URL,
        })

# Dedupe by (group, domain): the first record seen for a pair wins, and
# records without a domain are dropped.
seen = set()
dedup = []
for r in rows:
    key = (r["group"], r["domain"])
    if r["domain"] and key not in seen:
        seen.add(key)
        dedup.append(r)

_n_members = sum(1 for r in dedup if r['group'] == 'members')
_n_friends = sum(1 for r in dedup if r['group'] == 'friends')
print(f"Extracted rows with spatial filtering: total={len(dedup)} | members={_n_members} | friends={_n_friends}")

# Only write the CSV when something was actually extracted.
if not dedup:
    print("No rows found yet. If your environment supports it, try running a headful attempt by modifying headless=False inside scrape_dsi_members_friends_spatial().")
else:
    pd.DataFrame(dedup).to_csv(output_path, index=False)
    print(f"Wrote {output_path}")


Extracted rows with spatial filtering: total=35 | members=22 | friends=13
Wrote /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/digital_serbia_members_friends.csv
